{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0008, "grad_norm": 4.92004919052124, "learning_rate": 1.6e-07, "loss": 7.328, "step": 2 },
    { "epoch": 0.0016, "grad_norm": 4.8694047927856445, "learning_rate": 3.2e-07, "loss": 7.5093, "step": 4 },
    { "epoch": 0.0024, "grad_norm": 5.063427448272705, "learning_rate": 4.800000000000001e-07, "loss": 7.2167, "step": 6 },
    { "epoch": 0.0032, "grad_norm": 5.227718830108643, "learning_rate": 6.4e-07, "loss": 7.6708, "step": 8 },
    { "epoch": 0.004, "grad_norm": 5.120015621185303, "learning_rate": 8.000000000000001e-07, "loss": 7.493, "step": 10 },
    { "epoch": 0.0048, "grad_norm": 4.556617736816406, "learning_rate": 9.600000000000001e-07, "loss": 7.114, "step": 12 },
    { "epoch": 0.0056, "grad_norm": 4.134003162384033, "learning_rate": 1.12e-06, "loss": 7.0582, "step": 14 },
    { "epoch": 0.0064, "grad_norm": 3.023643970489502, "learning_rate": 1.28e-06, "loss": 6.2808, "step": 16 },
    { "epoch": 0.0072, "grad_norm": 2.4297778606414795, "learning_rate": 1.44e-06, "loss": 6.353, "step": 18 },
    { "epoch": 0.008, "grad_norm": 2.867222785949707, "learning_rate": 1.6000000000000001e-06, "loss": 6.0667, "step": 20 },
    { "epoch": 0.0088, "grad_norm": 1.387591004371643, "learning_rate": 1.76e-06, "loss": 5.6459, "step": 22 },
    { "epoch": 0.0096, "grad_norm": 2.594311475753784, "learning_rate": 1.9200000000000003e-06, "loss": 5.8506, "step": 24 },
    { "epoch": 0.0104, "grad_norm": 1.8094066381454468, "learning_rate": 2.08e-06, "loss": 5.6573, "step": 26 },
    { "epoch": 0.0112, "grad_norm": 0.9034324288368225, "learning_rate": 2.24e-06, "loss": 5.231, "step": 28 },
    { "epoch": 0.012, "grad_norm": 1.4179000854492188, "learning_rate": 2.4000000000000003e-06, "loss": 5.3384, "step": 30 },
    { "epoch": 0.0128, "grad_norm": 1.3942221403121948, "learning_rate": 2.56e-06, "loss": 5.385, "step": 32 },
    { "epoch": 0.0136, "grad_norm": 1.5524358749389648, "learning_rate": 2.7200000000000002e-06, "loss": 4.8145, "step": 34 },
    { "epoch": 0.0144, "grad_norm": 1.9171744585037231, "learning_rate": 2.88e-06, "loss": 5.5795, "step": 36 },
    { "epoch": 0.0152, "grad_norm": 2.5012264251708984, "learning_rate": 3.04e-06, "loss": 5.1458, "step": 38 },
    { "epoch": 0.016, "grad_norm": 1.894866943359375, "learning_rate": 3.2000000000000003e-06, "loss": 4.7375, "step": 40 },
    { "epoch": 0.0168, "grad_norm": 1.083099365234375, "learning_rate": 3.3600000000000004e-06, "loss": 4.659, "step": 42 },
    { "epoch": 0.0176, "grad_norm": 1.7351473569869995, "learning_rate": 3.52e-06, "loss": 4.3309, "step": 44 },
    { "epoch": 0.0184, "grad_norm": 1.2516050338745117, "learning_rate": 3.6800000000000003e-06, "loss": 4.2946, "step": 46 },
    { "epoch": 0.0192, "grad_norm": 1.946075439453125, "learning_rate": 3.8400000000000005e-06, "loss": 4.2101, "step": 48 },
    { "epoch": 0.02, "grad_norm": 2.922369956970215, "learning_rate": 4.000000000000001e-06, "loss": 4.5236, "step": 50 },
    { "epoch": 0.0208, "grad_norm": 2.0061962604522705, "learning_rate": 4.16e-06, "loss": 3.6378, "step": 52 },
    { "epoch": 0.0216, "grad_norm": 2.2109129428863525, "learning_rate": 4.32e-06, "loss": 3.1168, "step": 54 },
    { "epoch": 0.0224, "grad_norm": 2.374847173690796, "learning_rate": 4.48e-06, "loss": 2.9689, "step": 56 },
    { "epoch": 0.0232, "grad_norm": 2.2352538108825684, "learning_rate": 4.6400000000000005e-06, "loss": 2.2496, "step": 58 },
    { "epoch": 0.024, "grad_norm": 1.657749891281128, "learning_rate": 4.800000000000001e-06, "loss": 2.9017, "step": 60 },
    { "epoch": 0.0248, "grad_norm": 2.9958770275115967, "learning_rate": 4.960000000000001e-06, "loss": 2.094, "step": 62 },
    { "epoch": 0.0256, "grad_norm": 1.8515348434448242, "learning_rate": 5.12e-06, "loss": 1.9278, "step": 64 },
    { "epoch": 0.0264, "grad_norm": 2.1560873985290527, "learning_rate": 5.28e-06, "loss": 1.7917, "step": 66 },
    { "epoch": 0.0272, "grad_norm": 3.1304805278778076, "learning_rate": 5.4400000000000004e-06, "loss": 1.9759, "step": 68 },
    { "epoch": 0.028, "grad_norm": 3.0489299297332764, "learning_rate": 5.600000000000001e-06, "loss": 1.8824, "step": 70 },
    { "epoch": 0.0288, "grad_norm": 3.545546054840088, "learning_rate": 5.76e-06, "loss": 2.0749, "step": 72 },
    { "epoch": 0.0296, "grad_norm": 2.089024305343628, "learning_rate": 5.92e-06, "loss": 1.0014, "step": 74 },
    { "epoch": 0.0304, "grad_norm": 1.475943922996521, "learning_rate": 6.08e-06, "loss": 1.6105, "step": 76 },
    { "epoch": 0.0312, "grad_norm": 1.4888969659805298, "learning_rate": 6.24e-06, "loss": 1.2915, "step": 78 },
    { "epoch": 0.032, "grad_norm": 1.1156069040298462, "learning_rate": 6.4000000000000006e-06, "loss": 0.6639, "step": 80 },
    { "epoch": 0.0328, "grad_norm": 0.5972667932510376, "learning_rate": 6.560000000000001e-06, "loss": 0.4398, "step": 82 },
    { "epoch": 0.0336, "grad_norm": 1.5191650390625, "learning_rate": 6.720000000000001e-06, "loss": 1.0727, "step": 84 },
    { "epoch": 0.0344, "grad_norm": 4.311778545379639, "learning_rate": 6.88e-06, "loss": 1.6288, "step": 86 },
    { "epoch": 0.0352, "grad_norm": 2.106018543243408, "learning_rate": 7.04e-06, "loss": 0.5208, "step": 88 },
    { "epoch": 0.036, "grad_norm": 2.2589375972747803, "learning_rate": 7.2000000000000005e-06, "loss": 0.4426, "step": 90 },
    { "epoch": 0.0368, "grad_norm": 8.470863342285156, "learning_rate": 7.360000000000001e-06, "loss": 2.5704, "step": 92 },
    { "epoch": 0.0376, "grad_norm": 3.564351797103882, "learning_rate": 7.520000000000001e-06, "loss": 0.9683, "step": 94 },
    { "epoch": 0.0384, "grad_norm": 3.827845573425293, "learning_rate": 7.680000000000001e-06, "loss": 0.5685, "step": 96 },
    { "epoch": 0.0392, "grad_norm": 3.611189842224121, "learning_rate": 7.840000000000001e-06, "loss": 1.1535, "step": 98 },
    { "epoch": 0.04, "grad_norm": 1.9307868480682373, "learning_rate": 8.000000000000001e-06, "loss": 0.6147, "step": 100 },
    { "epoch": 0.0408, "grad_norm": 3.3078556060791016, "learning_rate": 8.16e-06, "loss": 0.9248, "step": 102 },
    { "epoch": 0.0416, "grad_norm": 1.50742506980896, "learning_rate": 8.32e-06, "loss": 0.5147, "step": 104 },
    { "epoch": 0.0424, "grad_norm": 2.5260138511657715, "learning_rate": 8.48e-06, "loss": 0.5129, "step": 106 },
    { "epoch": 0.0432, "grad_norm": 0.7977801561355591, "learning_rate": 8.64e-06, "loss": 0.5833, "step": 108 },
    { "epoch": 0.044, "grad_norm": 3.389085054397583, "learning_rate": 8.8e-06, "loss": 0.4162, "step": 110 },
    { "epoch": 0.0448, "grad_norm": 3.584988832473755, "learning_rate": 8.96e-06, "loss": 1.2424, "step": 112 },
    { "epoch": 0.0456, "grad_norm": 0.4079441428184509, "learning_rate": 9.12e-06, "loss": 0.6532, "step": 114 },
    { "epoch": 0.0464, "grad_norm": 2.021636962890625, "learning_rate": 9.280000000000001e-06, "loss": 0.2637, "step": 116 },
    { "epoch": 0.0472, "grad_norm": 4.71144962310791, "learning_rate": 9.440000000000001e-06, "loss": 1.1383, "step": 118 },
    { "epoch": 0.048, "grad_norm": 1.831689715385437, "learning_rate": 9.600000000000001e-06, "loss": 0.7587, "step": 120 },
    { "epoch": 0.0488, "grad_norm": 2.6891868114471436, "learning_rate": 9.760000000000001e-06, "loss": 0.9849, "step": 122 },
    { "epoch": 0.0496, "grad_norm": 2.2791528701782227, "learning_rate": 9.920000000000002e-06, "loss": 0.74, "step": 124 },
    { "epoch": 0.0504, "grad_norm": 2.12807559967041, "learning_rate": 1.008e-05, "loss": 0.3742, "step": 126 },
    { "epoch": 0.0512, "grad_norm": 1.7115131616592407, "learning_rate": 1.024e-05, "loss": 0.8209, "step": 128 },
    { "epoch": 0.052, "grad_norm": 1.7355319261550903, "learning_rate": 1.04e-05, "loss": 1.1005, "step": 130 },
    { "epoch": 0.0528, "grad_norm": 1.7057139873504639, "learning_rate": 1.056e-05, "loss": 0.9056, "step": 132 },
    { "epoch": 0.0536, "grad_norm": 0.7381348609924316, "learning_rate": 1.072e-05, "loss": 0.4982, "step": 134 },
    { "epoch": 0.0544, "grad_norm": 3.6813762187957764, "learning_rate": 1.0880000000000001e-05, "loss": 0.8425, "step": 136 },
    { "epoch": 0.0552, "grad_norm": 0.6442511677742004, "learning_rate": 1.1040000000000001e-05, "loss": 0.4714, "step": 138 },
    { "epoch": 0.056, "grad_norm": 0.3334783613681793, "learning_rate": 1.1200000000000001e-05, "loss": 0.1425, "step": 140 },
    { "epoch": 0.0568, "grad_norm": 2.7860682010650635, "learning_rate": 1.136e-05, "loss": 1.0082, "step": 142 },
    { "epoch": 0.0576, "grad_norm": 1.0623384714126587, "learning_rate": 1.152e-05, "loss": 0.1404, "step": 144 },
    { "epoch": 0.0584, "grad_norm": 0.49509289860725403, "learning_rate": 1.168e-05, "loss": 0.3523, "step": 146 },
    { "epoch": 0.0592, "grad_norm": 0.9187251925468445, "learning_rate": 1.184e-05, "loss": 0.7012, "step": 148 },
    { "epoch": 0.06, "grad_norm": 2.251713514328003, "learning_rate": 1.2e-05, "loss": 0.3998, "step": 150 },
    { "epoch": 0.0608, "grad_norm": 3.6301510334014893, "learning_rate": 1.216e-05, "loss": 0.991, "step": 152 },
    { "epoch": 0.0616, "grad_norm": 2.2427806854248047, "learning_rate": 1.232e-05, "loss": 1.6949, "step": 154 },
    { "epoch": 0.0624, "grad_norm": 1.5521266460418701, "learning_rate": 1.248e-05, "loss": 0.9281, "step": 156 },
    { "epoch": 0.0632, "grad_norm": 0.9885201454162598, "learning_rate": 1.2640000000000001e-05, "loss": 0.3829, "step": 158 },
    { "epoch": 0.064, "grad_norm": 1.02154541015625, "learning_rate": 1.2800000000000001e-05, "loss": 0.1986, "step": 160 },
    { "epoch": 0.0648, "grad_norm": 2.9700698852539062, "learning_rate": 1.2960000000000001e-05, "loss": 0.5764, "step": 162 },
    { "epoch": 0.0656, "grad_norm": 1.9716620445251465, "learning_rate": 1.3120000000000001e-05, "loss": 0.4319, "step": 164 },
    { "epoch": 0.0664, "grad_norm": 0.23318687081336975, "learning_rate": 1.3280000000000002e-05, "loss": 0.7806, "step": 166 },
    { "epoch": 0.0672, "grad_norm": 1.2755334377288818, "learning_rate": 1.3440000000000002e-05, "loss": 0.2845, "step": 168 },
    { "epoch": 0.068, "grad_norm": 0.26743704080581665, "learning_rate": 1.3600000000000002e-05, "loss": 0.1663, "step": 170 },
    { "epoch": 0.0688, "grad_norm": 1.2172751426696777, "learning_rate": 1.376e-05, "loss": 0.3756, "step": 172 },
    { "epoch": 0.0696, "grad_norm": 3.7812864780426025, "learning_rate": 1.392e-05, "loss": 0.4978, "step": 174 },
    { "epoch": 0.0704, "grad_norm": 3.4768078327178955, "learning_rate": 1.408e-05, "loss": 0.9643, "step": 176 },
    { "epoch": 0.0712, "grad_norm": 3.9793436527252197, "learning_rate": 1.4240000000000001e-05, "loss": 0.158, "step": 178 },
    { "epoch": 0.072, "grad_norm": 2.2694666385650635, "learning_rate": 1.4400000000000001e-05, "loss": 0.881, "step": 180 },
    { "epoch": 0.0728, "grad_norm": 0.1139988824725151, "learning_rate": 1.4560000000000001e-05, "loss": 0.017, "step": 182 },
    { "epoch": 0.0736, "grad_norm": 2.4055392742156982, "learning_rate": 1.4720000000000001e-05, "loss": 0.1059, "step": 184 },
    { "epoch": 0.0744, "grad_norm": 1.4160810708999634, "learning_rate": 1.4880000000000002e-05, "loss": 0.1963, "step": 186 },
    { "epoch": 0.0752, "grad_norm": 1.9410396814346313, "learning_rate": 1.5040000000000002e-05, "loss": 0.1013, "step": 188 },
    { "epoch": 0.076, "grad_norm": 1.283490777015686, "learning_rate": 1.5200000000000002e-05, "loss": 0.2272, "step": 190 },
    { "epoch": 0.0768, "grad_norm": 1.230212688446045, "learning_rate": 1.5360000000000002e-05, "loss": 0.7391, "step": 192 },
    { "epoch": 0.0776, "grad_norm": 9.068771362304688, "learning_rate": 1.552e-05, "loss": 0.5673, "step": 194 },
    { "epoch": 0.0784, "grad_norm": 3.8310606479644775, "learning_rate": 1.5680000000000002e-05, "loss": 1.1112, "step": 196 },
    { "epoch": 0.0792, "grad_norm": 3.0830278396606445, "learning_rate": 1.584e-05, "loss": 0.711, "step": 198 },
    { "epoch": 0.08, "grad_norm": 8.513973236083984, "learning_rate": 1.6000000000000003e-05, "loss": 1.9858, "step": 200 },
    { "epoch": 0.0808, "grad_norm": 0.3588366210460663, "learning_rate": 1.616e-05, "loss": 0.149, "step": 202 },
    { "epoch": 0.0816, "grad_norm": 1.0215955972671509, "learning_rate": 1.632e-05, "loss": 0.7344, "step": 204 },
    { "epoch": 0.0824, "grad_norm": 1.8482775688171387, "learning_rate": 1.648e-05, "loss": 1.0866, "step": 206 },
    { "epoch": 0.0832, "grad_norm": 0.7888800501823425, "learning_rate": 1.664e-05, "loss": 0.5496, "step": 208 },
    { "epoch": 0.084, "grad_norm": 0.33580031991004944, "learning_rate": 1.6800000000000002e-05, "loss": 0.2747, "step": 210 },
    { "epoch": 0.0848, "grad_norm": 1.4474271535873413, "learning_rate": 1.696e-05, "loss": 0.3155, "step": 212 },
    { "epoch": 0.0856, "grad_norm": 2.6667094230651855, "learning_rate": 1.7120000000000002e-05, "loss": 1.1201, "step": 214 },
    { "epoch": 0.0864, "grad_norm": 1.0721205472946167, "learning_rate": 1.728e-05, "loss": 0.3277, "step": 216 },
    { "epoch": 0.0872, "grad_norm": 0.2696588635444641, "learning_rate": 1.7440000000000002e-05, "loss": 0.5612, "step": 218 },
    { "epoch": 0.088, "grad_norm": 2.505485773086548, "learning_rate": 1.76e-05, "loss": 0.2816, "step": 220 },
    { "epoch": 0.0888, "grad_norm": 0.6796432137489319, "learning_rate": 1.7760000000000003e-05, "loss": 0.091, "step": 222 },
    { "epoch": 0.0896, "grad_norm": 0.23082202672958374, "learning_rate": 1.792e-05, "loss": 0.7718, "step": 224 },
    { "epoch": 0.0904, "grad_norm": 1.6087664365768433, "learning_rate": 1.8080000000000003e-05, "loss": 0.8094, "step": 226 },
    { "epoch": 0.0912, "grad_norm": 1.131905198097229, "learning_rate": 1.824e-05, "loss": 0.1958, "step": 228 },
    { "epoch": 0.092, "grad_norm": 1.3417667150497437, "learning_rate": 1.8400000000000003e-05, "loss": 0.0899, "step": 230 },
    { "epoch": 0.0928, "grad_norm": 1.7206881046295166, "learning_rate": 1.8560000000000002e-05, "loss": 0.1773, "step": 232 },
    { "epoch": 0.0936, "grad_norm": 1.3308982849121094, "learning_rate": 1.8720000000000004e-05, "loss": 0.6628, "step": 234 },
    { "epoch": 0.0944, "grad_norm": 0.06212015450000763, "learning_rate": 1.8880000000000002e-05, "loss": 0.3871, "step": 236 },
    { "epoch": 0.0952, "grad_norm": 2.4711997509002686, "learning_rate": 1.904e-05, "loss": 0.2319, "step": 238 },
    { "epoch": 0.096, "grad_norm": 3.6728742122650146, "learning_rate": 1.9200000000000003e-05, "loss": 0.5643, "step": 240 },
    { "epoch": 0.0968, "grad_norm": 0.1921927034854889, "learning_rate": 1.936e-05, "loss": 0.0931, "step": 242 },
    { "epoch": 0.0976, "grad_norm": 1.9750112295150757, "learning_rate": 1.9520000000000003e-05, "loss": 0.1632, "step": 244 },
    { "epoch": 0.0984, "grad_norm": 2.1710243225097656, "learning_rate": 1.968e-05, "loss": 2.1496, "step": 246 },
    { "epoch": 0.0992, "grad_norm": 0.6705166697502136, "learning_rate": 1.9840000000000003e-05, "loss": 0.2102, "step": 248 },
    { "epoch": 0.1, "grad_norm": 0.9208804368972778, "learning_rate": 2e-05, "loss": 0.363, "step": 250 },
    { "epoch": 0.1008, "grad_norm": 0.25396543741226196, "learning_rate": 1.9999961008995607e-05, "loss": 0.9275, "step": 252 },
    { "epoch": 0.1016, "grad_norm": 3.0983469486236572, "learning_rate": 1.9999844036286483e-05, "loss": 1.6902, "step": 254 },
    { "epoch": 0.1024, "grad_norm": 0.8619891405105591, "learning_rate": 1.9999649082784807e-05, "loss": 0.9756, "step": 256 },
    { "epoch": 0.1032, "grad_norm": 1.3311007022857666, "learning_rate": 1.9999376150010868e-05, "loss": 1.0115, "step": 258 },
    { "epoch": 0.104, "grad_norm": 1.6311815977096558, "learning_rate": 1.9999025240093045e-05, "loss": 1.3444, "step": 260 },
    { "epoch": 0.1048, "grad_norm": 1.097953200340271, "learning_rate": 1.9998596355767805e-05, "loss": 0.5742, "step": 262 },
    { "epoch": 0.1056, "grad_norm": 0.12066880613565445, "learning_rate": 1.999808950037968e-05, "loss": 0.1853, "step": 264 },
    { "epoch": 0.1064, "grad_norm": 1.4088952541351318, "learning_rate": 1.9997504677881224e-05, "loss": 0.585, "step": 266 },
    { "epoch": 0.1072, "grad_norm": 0.200229674577713, "learning_rate": 1.9996841892833e-05, "loss": 0.0933, "step": 268 },
    { "epoch": 0.108, "grad_norm": 1.871922254562378, "learning_rate": 1.9996101150403543e-05, "loss": 1.4538, "step": 270 },
    { "epoch": 0.1088, "grad_norm": 0.5457477569580078, "learning_rate": 1.9995282456369313e-05, "loss": 0.3239, "step": 272 },
    { "epoch": 0.1096, "grad_norm": 0.6347384452819824, "learning_rate": 1.9994385817114644e-05, "loss": 0.8208, "step": 274 },
    { "epoch": 0.1104, "grad_norm": 0.8534179925918579, "learning_rate": 1.9993411239631713e-05, "loss": 1.9646, "step": 276 },
    { "epoch": 0.1112, "grad_norm": 0.3291590213775635, "learning_rate": 1.999235873152047e-05, "loss": 0.0857, "step": 278 },
    { "epoch": 0.112, "grad_norm": 0.5380199551582336, "learning_rate": 1.9991228300988586e-05, "loss": 0.1706, "step": 280 },
    { "epoch": 0.1128, "grad_norm": 1.0301982164382935, "learning_rate": 1.9990019956851384e-05, "loss": 0.7458, "step": 282 },
    { "epoch": 0.1136, "grad_norm": 0.8207939863204956, "learning_rate": 1.9988733708531772e-05, "loss": 0.8598, "step": 284 },
    { "epoch": 0.1144, "grad_norm": 0.11704489588737488, "learning_rate": 1.998736956606018e-05, "loss": 0.0593, "step": 286 },
    { "epoch": 0.1152, "grad_norm": 0.8635666370391846, "learning_rate": 1.9985927540074453e-05, "loss": 0.4055, "step": 288 },
    { "epoch": 0.116, "grad_norm": 0.9759752154350281, "learning_rate": 1.9984407641819812e-05, "loss": 0.4182, "step": 290 },
    { "epoch": 0.1168, "grad_norm": 1.0212275981903076, "learning_rate": 1.998280988314872e-05, "loss": 0.3533, "step": 292 },
    { "epoch": 0.1176, "grad_norm": 0.36377808451652527, "learning_rate": 1.9981134276520828e-05, "loss": 0.1175, "step": 294 },
    { "epoch": 0.1184, "grad_norm": 1.5665260553359985, "learning_rate": 1.9979380835002846e-05, "loss": 0.2428, "step": 296 },
    { "epoch": 0.1192, "grad_norm": 1.0009765625, "learning_rate": 1.997754957226847e-05, "loss": 0.3959, "step": 298 },
    { "epoch": 0.12, "grad_norm": 0.43519535660743713, "learning_rate": 1.9975640502598243e-05, "loss": 0.3812, "step": 300 },
    { "epoch": 0.1208, "grad_norm": 1.0461794137954712, "learning_rate": 1.9973653640879486e-05, "loss": 0.5793, "step": 302 },
    { "epoch": 0.1216, "grad_norm": 1.0484498739242554, "learning_rate": 1.997158900260614e-05, "loss": 0.3245, "step": 304 },
    { "epoch": 0.1224, "grad_norm": 0.18130800127983093, "learning_rate": 1.9969446603878673e-05, "loss": 0.0716, "step": 306 },
    { "epoch": 0.1232, "grad_norm": 0.14019177854061127, "learning_rate": 1.9967226461403934e-05, "loss": 0.0946, "step": 308 },
    { "epoch": 0.124, "grad_norm": 0.48715105652809143, "learning_rate": 1.9964928592495046e-05, "loss": 0.1536, "step": 310 },
    { "epoch": 0.1248, "grad_norm": 0.4148877263069153, "learning_rate": 1.996255301507125e-05, "loss": 0.1165, "step": 312 },
    { "epoch": 0.1256, "grad_norm": 0.0783982053399086, "learning_rate": 1.9960099747657774e-05, "loss": 0.0288, "step": 314 },
    { "epoch": 0.1264, "grad_norm": 4.088832378387451, "learning_rate": 1.9957568809385693e-05, "loss": 1.1324, "step": 316 },
    { "epoch": 0.1272, "grad_norm": 0.10389735549688339, "learning_rate": 1.995496021999177e-05, "loss": 0.0405, "step": 318 },
    { "epoch": 0.128, "grad_norm": 0.04948841407895088, "learning_rate": 1.9952273999818312e-05, "loss": 0.0372, "step": 320 },
    { "epoch": 0.1288, "grad_norm": 0.09340821206569672, "learning_rate": 1.9949510169813006e-05, "loss": 0.5878, "step": 322 },
    { "epoch": 0.1296, "grad_norm": 1.2787668704986572, "learning_rate": 1.9946668751528745e-05, "loss": 0.3489, "step": 324 },
    { "epoch": 0.1304, "grad_norm": 1.642128586769104, "learning_rate": 1.994374976712348e-05, "loss": 0.1129, "step": 326 },
    { "epoch": 0.1312, "grad_norm": 0.1789906769990921, "learning_rate": 1.9940753239360047e-05, "loss": 0.0982, "step": 328 },
    { "epoch": 0.132, "grad_norm": 1.3861182928085327, "learning_rate": 1.9937679191605964e-05, "loss": 0.2423, "step": 330 },
    { "epoch": 0.1328, "grad_norm": 0.9052829146385193, "learning_rate": 1.9934527647833276e-05, "loss": 0.1399, "step": 332 },
    { "epoch": 0.1336, "grad_norm": 1.6789326667785645, "learning_rate": 1.9931298632618355e-05, "loss": 0.2957, "step": 334 },
    { "epoch": 0.1344, "grad_norm": 1.9615066051483154, "learning_rate": 1.9927992171141707e-05, "loss": 1.2319, "step": 336 },
    { "epoch": 0.1352, "grad_norm": 0.2719954550266266, "learning_rate": 1.9924608289187786e-05, "loss": 0.3792, "step": 338 },
    { "epoch": 0.136, "grad_norm": 0.131216362118721, "learning_rate": 1.9921147013144782e-05, "loss": 0.0956, "step": 340 },
    { "epoch": 0.1368, "grad_norm": 0.2020615190267563, "learning_rate": 1.9917608370004417e-05, "loss": 0.0629, "step": 342 },
    { "epoch": 0.1376, "grad_norm": 0.49078741669654846, "learning_rate": 1.9913992387361747e-05, "loss": 0.096, "step": 344 },
    { "epoch": 0.1384, "grad_norm": 0.09738267213106155, "learning_rate": 1.991029909341493e-05, "loss": 0.3201, "step": 346 },
    { "epoch": 0.1392, "grad_norm": 3.8677468299865723, "learning_rate": 1.990652851696501e-05, "loss": 0.1698, "step": 348 },
    { "epoch": 0.14, "grad_norm": 0.10318366438150406, "learning_rate": 1.9902680687415704e-05, "loss": 0.2468, "step": 350 },
    { "epoch": 0.1408, "grad_norm": 1.7756882905960083, "learning_rate": 1.989875563477316e-05, "loss": 0.3846, "step": 352 },
    { "epoch": 0.1416, "grad_norm": 8.583524703979492, "learning_rate": 1.9894753389645723e-05, "loss": 0.565, "step": 354 },
    { "epoch": 0.1424, "grad_norm": 0.26608502864837646, "learning_rate": 1.9890673983243708e-05, "loss": 0.059, "step": 356 },
    { "epoch": 0.1432, "grad_norm": 1.6263763904571533, "learning_rate": 1.988651744737914e-05, "loss": 0.9423, "step": 358 },
    { "epoch": 0.144, "grad_norm": 1.3715531826019287, "learning_rate": 1.988228381446553e-05, "loss": 0.2875, "step": 360 },
    { "epoch": 0.1448, "grad_norm": 1.6036182641983032, "learning_rate": 1.987797311751759e-05, "loss": 0.6963, "step": 362 },
    { "epoch": 0.1456, "grad_norm": 0.44622594118118286, "learning_rate": 1.9873585390151003e-05, "loss": 0.1357, "step": 364 },
    { "epoch": 0.1464, "grad_norm": 0.3719693720340729, "learning_rate": 1.9869120666582153e-05, "loss": 0.0775, "step": 366 },
    { "epoch": 0.1472, "grad_norm": 1.655290961265564, "learning_rate": 1.9864578981627844e-05, "loss": 0.316, "step": 368 },
    { "epoch": 0.148, "grad_norm": 0.5241470336914062, "learning_rate": 1.985996037070505e-05, "loss": 0.1641, "step": 370 },
    { "epoch": 0.1488, "grad_norm": 0.0686238631606102, "learning_rate": 1.985526486983063e-05, "loss": 0.3246, "step": 372 },
    { "epoch": 0.1496, "grad_norm": 0.04877380654215813, "learning_rate": 1.9850492515621038e-05, "loss": 0.0454, "step": 374 },
    { "epoch": 0.1504, "grad_norm": 0.033756960183382034, "learning_rate": 1.9845643345292055e-05, "loss": 0.0306, "step": 376 },
    { "epoch": 0.1512, "grad_norm": 0.050034862011671066, "learning_rate": 1.9840717396658483e-05, "loss": 0.0198, "step": 378 },
    { "epoch": 0.152, "grad_norm": 1.1677488088607788, "learning_rate": 1.983571470813386e-05, "loss": 0.9779, "step": 380 },
    { "epoch": 0.1528, "grad_norm": 2.3200299739837646, "learning_rate": 1.9830635318730155e-05, "loss": 0.2013, "step": 382 },
    { "epoch": 0.1536, "grad_norm": 0.37079155445098877, "learning_rate": 1.982547926805747e-05, "loss": 0.0332, "step": 384 },
    { "epoch": 0.1544, "grad_norm": 0.03381960466504097, "learning_rate": 1.982024659632372e-05, "loss": 0.0156, "step": 386 },
    { "epoch": 0.1552, "grad_norm": 1.5234084129333496, "learning_rate": 1.981493734433433e-05, "loss": 0.0724, "step": 388 },
    { "epoch": 0.156, "grad_norm": 1.1328762769699097, "learning_rate": 1.9809551553491918e-05, "loss": 0.955, "step": 390 },
    { "epoch": 0.1568, "grad_norm": 4.6101975440979, "learning_rate": 1.980408926579596e-05, "loss": 1.726, "step": 392 },
    { "epoch": 0.1576, "grad_norm": 0.9627692103385925, "learning_rate": 1.979855052384247e-05, "loss": 0.4459, "step": 394 },
    { "epoch": 0.1584, "grad_norm": 0.19565686583518982, "learning_rate": 1.9792935370823676e-05, "loss": 0.1905, "step": 396 },
    { "epoch": 0.1592, "grad_norm": 1.6848326921463013, "learning_rate": 1.9787243850527663e-05, "loss": 0.7807, "step": 398 },
    { "epoch": 0.16, "grad_norm": 1.4652438163757324, "learning_rate": 1.9781476007338058e-05, "loss": 0.4939, "step": 400 },
    { "epoch": 0.1608, "grad_norm": 0.5617023706436157, "learning_rate": 1.9775631886233655e-05, "loss": 0.5734, "step": 402 },
    { "epoch": 0.1616, "grad_norm": 0.37801074981689453, "learning_rate": 1.9769711532788083e-05, "loss": 0.2663, "step": 404 },
    { "epoch": 0.1624, "grad_norm": 2.0784738063812256, "learning_rate": 1.976371499316945e-05, "loss": 0.7156, "step": 406 },
    { "epoch": 0.1632, "grad_norm": 0.24609781801700592, "learning_rate": 1.9757642314139977e-05, "loss": 0.1335, "step": 408 },
    { "epoch": 0.164, "grad_norm": 0.18215598165988922, "learning_rate": 1.9751493543055634e-05, "loss": 0.1607, "step": 410 },
    { "epoch": 0.1648, "grad_norm": 0.1584533452987671, "learning_rate": 1.9745268727865774e-05, "loss": 0.1947, "step": 412 },
    { "epoch": 0.1656, "grad_norm": 0.1543274223804474, "learning_rate": 1.9738967917112752e-05, "loss": 0.0954, "step": 414 },
    { "epoch": 0.1664, "grad_norm": 0.09421176463365555, "learning_rate": 1.9732591159931564e-05, "loss": 0.058, "step": 416 },
    { "epoch": 0.1672, "grad_norm": 0.037417419254779816, "learning_rate": 1.9726138506049438e-05, "loss": 0.0586, "step": 418 },
    { "epoch": 0.168, "grad_norm": 1.3091963529586792, "learning_rate": 1.9719610005785466e-05, "loss": 0.5727, "step": 420 },
    { "epoch": 0.1688, "grad_norm": 0.04180837422609329, "learning_rate": 1.9713005710050203e-05, "loss": 0.0303, "step": 422 },
    { "epoch": 0.1696, "grad_norm": 0.11628637462854385, "learning_rate": 1.9706325670345276e-05, "loss": 0.0236, "step": 424 },
    { "epoch": 0.1704, "grad_norm": 0.028453992679715157, "learning_rate": 1.9699569938762975e-05, "loss": 0.015, "step": 426 },
    { "epoch": 0.1712, "grad_norm": 1.2765767574310303, "learning_rate": 1.9692738567985853e-05, "loss": 0.6437, "step": 428 },
    { "epoch": 0.172, "grad_norm": 0.6149804592132568, "learning_rate": 1.9685831611286312e-05, "loss": 0.0557, "step": 430 },
    { "epoch": 0.1728, "grad_norm": 0.0872359424829483, "learning_rate": 1.967884912252619e-05, "loss": 0.1832, "step": 432 },
    { "epoch": 0.1736, "grad_norm": 0.031350962817668915, "learning_rate": 1.967179115615633e-05, "loss": 0.0422, "step": 434 },
    { "epoch": 0.1744, "grad_norm": 0.1513989269733429, "learning_rate": 1.9664657767216176e-05, "loss": 0.033, "step": 436 },
    { "epoch": 0.1752, "grad_norm": 0.05188186839222908, "learning_rate": 1.9657449011333328e-05, "loss": 0.0161, "step": 438 },
    { "epoch": 0.176, "grad_norm": 0.20884352922439575, "learning_rate": 1.9650164944723116e-05, "loss": 0.3403, "step": 440 },
    { "epoch": 0.1768, "grad_norm": 1.4495282173156738, "learning_rate": 1.964280562418815e-05, "loss": 0.1676, "step": 442 },
    { "epoch": 0.1776, "grad_norm": 0.01325430627912283, "learning_rate": 1.963537110711789e-05, "loss": 0.0142, "step": 444 },
    { "epoch": 0.1784, "grad_norm": 0.03731897845864296, "learning_rate": 1.962786145148819e-05, "loss": 0.013, "step": 446 },
    { "epoch": 0.1792, "grad_norm": 0.06609856337308884, "learning_rate": 1.962027671586086e-05, "loss": 0.0124, "step": 448 },
    { "epoch": 0.18, "grad_norm": 0.02560417540371418, "learning_rate": 1.961261695938319e-05, "loss": 0.0291, "step": 450 },
    { "epoch": 0.1808, "grad_norm": 0.016771750524640083, "learning_rate": 1.96048822417875e-05, "loss": 0.0089, "step": 452 },
    { "epoch": 0.1816, "grad_norm": 7.128602504730225, "learning_rate": 1.9597072623390668e-05, "loss": 1.0882, "step": 454 },
    { "epoch": 0.1824, "grad_norm": 0.03642188385128975, "learning_rate": 1.958918816509367e-05, "loss": 0.0113, "step": 456 },
    { "epoch": 0.1832, "grad_norm": 0.09811338037252426, "learning_rate": 1.95812289283811e-05, "loss": 0.016, "step": 458 },
    { "epoch": 0.184, "grad_norm": 0.04652848094701767, "learning_rate": 1.9573194975320672e-05, "loss": 0.1104, "step": 460 },
    { "epoch": 0.1848, "grad_norm": 1.6137455701828003, "learning_rate": 1.956508636856278e-05, "loss": 0.872, "step": 462 },
    { "epoch": 0.1856, "grad_norm": 1.2847460508346558, "learning_rate": 1.9556903171339963e-05, "loss": 0.9774, "step": 464 },
    { "epoch": 0.1864, "grad_norm": 0.3364383280277252, "learning_rate": 1.9548645447466433e-05, "loss": 0.0612, "step": 466 },
    { "epoch": 0.1872, "grad_norm": 1.6685212850570679, "learning_rate": 1.954031326133758e-05, "loss": 0.4306, "step": 468 },
    { "epoch": 0.188, "grad_norm": 0.12480071187019348, "learning_rate": 1.9531906677929472e-05, "loss": 0.1104, "step": 470 },
    { "epoch": 0.1888, "grad_norm": 0.5850446224212646, "learning_rate": 1.9523425762798328e-05, "loss": 0.1902, "step": 472 },
    { "epoch": 0.1896, "grad_norm": 0.14481854438781738, "learning_rate": 1.951487058208003e-05, "loss": 0.1135, "step": 474 },
    { "epoch": 0.1904, "grad_norm": 1.4755823612213135, "learning_rate": 1.95062412024896e-05, "loss": 0.9318, "step": 476 },
    { "epoch": 0.1912, "grad_norm": 0.31880441308021545, "learning_rate": 1.949753769132067e-05, "loss": 0.0736, "step": 478 },
    { "epoch": 0.192, "grad_norm": 0.3856407403945923, "learning_rate": 1.9488760116444966e-05, "loss": 0.4857, "step": 480 },
    { "epoch": 0.1928, "grad_norm": 1.6081260442733765, "learning_rate": 1.9479908546311783e-05, "loss": 0.2605, "step": 482 },
    { "epoch": 0.1936, "grad_norm": 1.051400065422058, "learning_rate": 1.9470983049947446e-05, "loss": 0.1468, "step": 484 },
    { "epoch": 0.1944, "grad_norm": 0.19811314344406128, "learning_rate": 1.946198369695476e-05, "loss": 0.5758, "step": 486 },
    { "epoch": 0.1952, "grad_norm": 0.7101706862449646, "learning_rate": 1.9452910557512497e-05, "loss": 0.765, "step": 488 },
    { "epoch": 0.196, "grad_norm": 1.4943866729736328, "learning_rate": 1.944376370237481e-05, "loss": 0.4381, "step": 490 },
    { "epoch": 0.1968, "grad_norm": 0.22989702224731445, "learning_rate": 1.9434543202870726e-05, "loss": 0.0615, "step": 492 },
    { "epoch": 0.1976, "grad_norm": 1.0281144380569458, "learning_rate": 1.9425249130903544e-05, "loss": 0.2893, "step": 494 },
    { "epoch": 0.1984, "grad_norm": 0.912899911403656, "learning_rate": 1.9415881558950302e-05, "loss": 0.1344, "step": 496 },
    { "epoch": 0.1992, "grad_norm": 0.4319363534450531, "learning_rate": 1.9406440560061214e-05, "loss": 0.162, "step": 498 },
    { "epoch": 0.2, "grad_norm": 1.5313297510147095, "learning_rate": 1.9396926207859085e-05, "loss": 0.9079, "step": 500 },
    { "epoch": 0.2008, "grad_norm": 0.1665397733449936, "learning_rate": 1.9387338576538743e-05, "loss": 0.0523, "step": 502 },
    { "epoch": 0.2016, "grad_norm": 0.703409731388092, "learning_rate": 1.937767774086646e-05, "loss": 0.4771, "step": 504 },
    { "epoch": 0.2024, "grad_norm": 0.13862833380699158, "learning_rate": 1.936794377617938e-05, "loss": 0.0484, "step": 506 },
    { "epoch": 0.2032, "grad_norm": 0.12500865757465363, "learning_rate": 1.935813675838491e-05, "loss": 0.033, "step": 508 },
    { "epoch": 0.204, "grad_norm": 0.12923550605773926, "learning_rate": 1.9348256763960146e-05, "loss": 0.0503, "step": 510 },
    { "epoch": 0.2048, "grad_norm": 0.12034843116998672, "learning_rate": 1.933830386995127e-05, "loss": 0.2762, "step": 512 },
    { "epoch": 0.2056, "grad_norm": 1.1866302490234375, "learning_rate": 1.9328278153972947e-05, "loss": 0.0998, "step": 514 },
    { "epoch": 0.2064, "grad_norm": 0.048314038664102554, "learning_rate": 1.9318179694207726e-05, "loss": 0.0296, "step": 516 },
    { "epoch": 0.2072, "grad_norm": 0.2854752242565155, "learning_rate": 1.9308008569405424e-05, "loss": 0.0852, "step": 518 },
    { "epoch": 0.208, "grad_norm": 0.16209827363491058, "learning_rate": 1.9297764858882516e-05, "loss": 0.8268, "step": 520 },
    { "epoch": 0.2088, "grad_norm": 0.01691954769194126, "learning_rate": 1.9287448642521513e-05, "loss": 0.0093, "step": 522 },
    { "epoch": 0.2096, "grad_norm": 0.09849057346582413, "learning_rate": 1.9277060000770342e-05, "loss": 0.0657, "step": 524 },
    { "epoch": 0.2104, "grad_norm": 0.05499891936779022, "learning_rate": 1.9266599014641724e-05, "loss": 0.0259, "step": 526 },
    { "epoch": 0.2112, "grad_norm": 0.5297953486442566, "learning_rate": 1.9256065765712524e-05, "loss": 0.0539, "step": 528 },
    { "epoch": 0.212, "grad_norm": 0.053190361708402634, "learning_rate": 1.9245460336123136e-05, "loss": 0.035, "step": 530 },
    { "epoch": 0.2128, "grad_norm": 0.7372400760650635, "learning_rate": 1.9234782808576823e-05, "loss": 0.186, "step": 532 },
    { "epoch": 0.2136, "grad_norm": 0.02464686520397663, "learning_rate": 1.9224033266339103e-05, "loss": 0.0097, "step": 534 },
    { "epoch": 0.2144, "grad_norm": 3.046816825866699, "learning_rate": 1.9213211793237056e-05, "loss": 0.1614, "step": 536 },
    { "epoch": 0.2152, "grad_norm": 2.2198116779327393, "learning_rate": 1.9202318473658707e-05, "loss": 0.6094, "step": 538 },
    { "epoch": 0.216, "grad_norm": 1.2823199033737183, "learning_rate": 1.9191353392552346e-05, "loss": 0.1261, "step": 540 },
    { "epoch": 0.2168, "grad_norm": 0.019597092643380165, "learning_rate": 1.9180316635425883e-05, "loss": 0.4845, "step": 542 },
    { "epoch": 0.2176, "grad_norm": 0.011679578572511673, "learning_rate": 1.9169208288346168e-05, "loss": 0.0162, "step": 544 },
    { "epoch": 0.2184, "grad_norm": 1.827360987663269, "learning_rate": 1.9158028437938316e-05, "loss": 0.8432, "step": 546 },
    { "epoch": 0.2192, "grad_norm": 1.7069369554519653, "learning_rate": 1.914677717138505e-05, "loss": 0.321, "step": 548 },
    { "epoch": 0.22, "grad_norm": 0.022664044052362442, "learning_rate": 1.913545457642601e-05, "loss": 0.2938, "step": 550 },
    { "epoch": 0.2208, "grad_norm": 0.5914615392684937, "learning_rate": 1.9124060741357065e-05, "loss": 0.0618, "step": 552 },
    { "epoch": 0.2216, "grad_norm": 0.0440104641020298, "learning_rate": 1.9112595755029625e-05, "loss": 0.0429, "step": 554 },
    { "epoch": 0.2224, "grad_norm": 0.263875812292099, "learning_rate": 1.9101059706849957e-05, "loss": 1.002, "step": 556 },
    { "epoch": 0.2232, "grad_norm": 1.789616584777832, "learning_rate": 1.908945268677849e-05, "loss": 0.3908, "step": 558 },
    { "epoch": 0.224, "grad_norm": 3.8418707847595215, "learning_rate": 1.907777478532909e-05, "loss": 0.4097, "step": 560 },
    { "epoch": 0.2248, "grad_norm": 0.5954731702804565, "learning_rate": 1.906602609356838e-05, "loss": 0.1258, "step": 562 },
    { "epoch": 0.2256, "grad_norm": 3.1581459045410156, "learning_rate": 1.905420670311502e-05, "loss": 0.3185, "step": 564 },
    { "epoch": 0.2264, "grad_norm": 0.37356865406036377, "learning_rate": 1.9042316706138987e-05, "loss": 0.0718, "step": 566 },
    { "epoch": 0.2272, "grad_norm": 0.6604268550872803, "learning_rate": 1.9030356195360875e-05, "loss": 0.1169, "step": 568 },
    { "epoch": 0.228, "grad_norm": 1.148862361907959, "learning_rate": 1.901832526405114e-05, "loss": 0.1075, "step": 570 },
    { "epoch": 0.2288, "grad_norm": 4.533411026000977, "learning_rate": 1.9006224006029404e-05, "loss": 0.8169, "step": 572 },
    { "epoch": 0.2296, "grad_norm": 0.1384696662425995, "learning_rate": 1.899405251566371e-05, "loss": 0.0574, "step": 574 },
    { "epoch": 0.2304, "grad_norm": 4.587916851043701, "learning_rate": 1.8981810887869784e-05, "loss": 0.3013, "step": 576 },
    { "epoch": 0.2312, "grad_norm": 0.07302756607532501, "learning_rate": 1.8969499218110302e-05, "loss": 0.0345, "step": 578 },
    { "epoch": 0.232, "grad_norm": 0.9006990194320679, "learning_rate": 1.895711760239413e-05, "loss": 0.4378, "step": 580 },
    { "epoch": 0.2328, "grad_norm": 0.07447630912065506, "learning_rate": 1.89446661372756e-05, "loss": 0.11, "step": 582 },
    { "epoch": 0.2336, "grad_norm": 0.10830947756767273, "learning_rate": 1.893214491985374e-05, "loss": 0.0505, "step": 584 },
    { "epoch": 0.2344, "grad_norm": 0.0925409272313118, "learning_rate": 1.8919554047771508e-05, "loss": 0.0449, "step": 586 },
    { "epoch": 0.2352, "grad_norm": 2.633633613586426, "learning_rate": 1.890689361921507e-05, "loss": 0.7599, "step": 588 },
    { "epoch": 0.236, "grad_norm": 0.013573708944022655, "learning_rate": 1.889416373291298e-05, "loss": 0.2034, "step": 590 },
    { "epoch": 0.2368, "grad_norm": 0.03778607025742531, "learning_rate": 1.8881364488135448e-05, "loss": 0.5998, "step": 592 },
    { "epoch": 0.2376, "grad_norm": 0.4637905955314636, "learning_rate": 1.886849598469356e-05, "loss": 0.0662, "step": 594 },
    { "epoch": 0.2384, "grad_norm": 0.058186739683151245, "learning_rate": 1.8855558322938492e-05, "loss": 0.2836, "step": 596 },
    { "epoch": 0.2392, "grad_norm": 0.09846732765436172, "learning_rate": 1.8842551603760725e-05, "loss": 0.1087, "step": 598 },
    { "epoch": 0.24, "grad_norm": 0.21428197622299194, "learning_rate": 1.8829475928589272e-05, "loss": 0.229, "step": 600 },
    { "epoch": 0.2408, "grad_norm": 0.3262503743171692, "learning_rate": 1.881633139939087e-05, "loss": 0.104, "step": 602 },
    { "epoch": 0.2416, "grad_norm": 0.9335662126541138, "learning_rate": 1.8803118118669203e-05, "loss": 0.5139, "step": 604 },
    { "epoch": 0.2424, "grad_norm": 0.027328329160809517, "learning_rate": 1.878983618946409e-05, "loss": 0.0222, "step": 606 },
    { "epoch": 0.2432, "grad_norm": 0.048500385135412216, "learning_rate": 1.8776485715350672e-05, "loss": 0.0422, "step": 608 },
    { "epoch": 0.244, "grad_norm": 0.3864876925945282, "learning_rate": 1.8763066800438638e-05, "loss": 0.1505, "step": 610 },
    { "epoch": 0.2448, "grad_norm": 1.1042886972427368, "learning_rate": 1.874957954937138e-05, "loss": 0.6224, "step": 612 },
    { "epoch": 0.2456, "grad_norm": 0.10346169769763947, "learning_rate": 1.8736024067325188e-05, "loss": 0.0375, "step": 614 },
    { "epoch": 0.2464, "grad_norm": 0.01699395291507244, "learning_rate": 1.8722400460008437e-05, "loss": 0.0352, "step": 616 },
    { "epoch": 0.2472, "grad_norm": 1.2863647937774658, "learning_rate": 1.8708708833660755e-05, "loss": 0.5494, "step": 618 },
    { "epoch": 0.248, "grad_norm": 2.1446785926818848, "learning_rate": 1.869494929505219e-05, "loss": 0.3612, "step": 620 },
    { "epoch": 0.2488, "grad_norm": 0.16956354677677155, "learning_rate": 1.8681121951482397e-05, "loss": 0.0355, "step": 622 },
    { "epoch": 0.2496, "grad_norm": 1.0704134702682495, "learning_rate": 1.8667226910779767e-05, "loss": 0.3591, "step": 624 },
    { "epoch": 0.2504, "grad_norm": 0.17657175660133362, "learning_rate": 1.8653264281300622e-05, "loss": 0.0436, "step": 626 },
    { "epoch": 0.2512, "grad_norm": 0.16666510701179504, "learning_rate": 1.8639234171928355e-05, "loss": 0.0334, "step": 628 },
    { "epoch": 0.252, "grad_norm": 0.14208954572677612, "learning_rate": 1.8625136692072577e-05, "loss": 0.2292, "step": 630 },
    { "epoch": 0.2528, "grad_norm": 1.2962373495101929, "learning_rate": 1.8610971951668265e-05, "loss": 0.2592, "step": 632 },
    { "epoch": 0.2536, "grad_norm": 0.09842484444379807, "learning_rate": 1.8596740061174912e-05, "loss": 0.0721, "step": 634 },
    { "epoch": 0.2544, "grad_norm": 0.07477198541164398, "learning_rate": 1.8582441131575658e-05, "loss": 0.0468, "step": 636 },
    { "epoch": 0.2552, "grad_norm": 1.2548961639404297, "learning_rate": 1.856807527437643e-05, "loss": 0.3335, "step": 638 },
    { "epoch": 0.256, "grad_norm": 0.20232835412025452, "learning_rate": 1.855364260160507e-05, "loss": 0.4149, "step": 640 },
    { "epoch": 0.2568, "grad_norm": 0.03189453110098839, "learning_rate": 1.8539143225810453e-05, "loss": 0.174, "step": 642 },
    { "epoch": 0.2576, "grad_norm": 0.026663975790143013, "learning_rate": 1.8524577260061628e-05, "loss": 0.0207, "step": 644 },
    { "epoch": 0.2584, "grad_norm": 0.03870954364538193, "learning_rate": 1.850994481794692e-05, "loss": 0.0197, "step": 646 },
    { "epoch": 0.2592, "grad_norm": 0.05131769925355911, "learning_rate": 1.8495246013573057e-05, "loss": 0.0187, "step": 648 },
    { "epoch": 0.26, "grad_norm": 0.278886079788208, "learning_rate": 1.848048096156426e-05, "loss": 0.1129, "step": 650 },
    { "epoch": 0.2608, "grad_norm": 0.030908726155757904, "learning_rate": 1.8465649777061377e-05, "loss": 0.0288, "step": 652 },
    { "epoch": 0.2616, "grad_norm": 1.7882804870605469, "learning_rate": 1.8450752575720967e-05, "loss": 0.3299, "step": 654 },
    { "epoch": 0.2624, "grad_norm": 0.9408004879951477, "learning_rate": 1.843578947371439e-05, "loss": 0.1842, "step": 656 },
    { "epoch": 0.2632, "grad_norm": 0.03317919746041298, "learning_rate": 1.8420760587726925e-05, "loss": 0.0249, "step": 658 },
    { "epoch": 0.264, "grad_norm": 0.0062853083945810795, "learning_rate": 1.8405666034956842e-05, "loss": 0.0765, "step": 660 },
    { "epoch": 0.2648, "grad_norm": 1.696451187133789, "learning_rate": 1.8390505933114503e-05, "loss": 0.4928, "step": 662 },
    { "epoch": 0.2656, "grad_norm": 0.11740172654390335, "learning_rate": 1.837528040042142e-05, "loss": 0.0175, "step": 664 },
    { "epoch": 0.2664, "grad_norm": 0.437165230512619, "learning_rate": 1.8359989555609355e-05, "loss": 0.0775, "step": 666 },
    { "epoch": 0.2672, "grad_norm": 0.01961176097393036, "learning_rate": 1.834463351791939e-05, "loss": 0.012, "step": 668 },
    { "epoch": 0.268, "grad_norm": 2.6758432388305664, "learning_rate": 1.8329212407100996e-05, "loss": 0.1287, "step": 670 },
    { "epoch": 0.2688, "grad_norm": 0.007790145929902792, "learning_rate": 1.8313726343411085e-05, "loss": 0.0071, "step": 672 },
    { "epoch": 0.2696, "grad_norm": 0.01610707677900791, "learning_rate": 1.82981754476131e-05, "loss": 0.004, "step": 674 },
    { "epoch": 0.2704, "grad_norm": 0.10419867932796478, "learning_rate": 1.8282559840976043e-05, "loss": 0.022, "step": 676 },
    { "epoch": 0.2712, "grad_norm": 0.020517872646450996, "learning_rate": 1.8266879645273557e-05, "loss": 0.0028, "step": 678 },
    { "epoch": 0.272, "grad_norm": 0.011518670246005058, "learning_rate": 1.8251134982782952e-05, "loss": 0.0121, "step": 680 },
    { "epoch": 0.2728, "grad_norm": 3.1481823921203613, "learning_rate": 1.8235325976284276e-05, "loss": 0.9131, "step": 682 },
    { "epoch": 0.2736, "grad_norm": 0.021789977326989174, "learning_rate": 1.8219452749059332e-05, "loss": 0.0169, "step": 684 },
    { "epoch": 0.2744, "grad_norm": 2.266319990158081, "learning_rate": 1.8203515424890738e-05, "loss": 0.4778, "step": 686 },
    { "epoch": 0.2752, "grad_norm": 2.55071759223938, "learning_rate": 1.8187514128060946e-05, "loss": 0.4595, "step": 688 },
    { "epoch": 0.276, "grad_norm": 1.0234826803207397, "learning_rate": 1.8171448983351284e-05, "loss": 0.6759, "step": 690 },
    { "epoch": 0.2768, "grad_norm": 0.5817314982414246, "learning_rate": 1.8155320116040983e-05, "loss": 0.2023, "step": 692 },
    { "epoch": 0.2776, "grad_norm": 0.07609284669160843, "learning_rate": 1.8139127651906183e-05, "loss": 0.2312, "step": 694 },
    { "epoch": 0.2784, "grad_norm": 0.1080310121178627, "learning_rate": 1.812287171721897e-05, "loss": 0.1537, "step": 696 },
    { "epoch": 0.2792, "grad_norm": 0.1361207813024521, "learning_rate": 1.81065524387464e-05, "loss": 0.0511, "step": 698 },
    { "epoch": 0.28, "grad_norm": 0.11058028042316437, "learning_rate": 1.8090169943749477e-05, "loss": 0.0885, "step": 700 },
    { "epoch": 0.2808, "grad_norm": 0.6568836569786072, "learning_rate": 1.8073724359982184e-05, "loss": 0.3678, "step": 702 },
    { "epoch": 0.2816, "grad_norm": 0.250731885433197, "learning_rate": 1.8057215815690494e-05, "loss": 0.5069, "step": 704 },
    { "epoch": 0.2824, "grad_norm": 0.0678286999464035, "learning_rate": 1.8040644439611348e-05, "loss": 0.0418, "step": 706 },
    { "epoch": 0.2832, "grad_norm": 1.9648799896240234, "learning_rate": 1.802401036097167e-05, "loss": 0.4493, "step": 708 },
    { "epoch": 0.284, "grad_norm": 0.21381056308746338, "learning_rate": 1.8007313709487334e-05, "loss": 0.1236, "step": 710 },
    { "epoch": 0.2848, "grad_norm": 0.5156528949737549, "learning_rate": 1.79905546153622e-05, "loss": 0.1819, "step": 712 },
    { "epoch": 0.2856, "grad_norm": 0.09592005610466003, "learning_rate": 1.7973733209287036e-05, "loss": 0.1251, "step": 714 },
    { "epoch": 0.2864, "grad_norm": 0.42804062366485596, "learning_rate": 1.7956849622438554e-05, "loss": 0.1349, "step": 716 },
    { "epoch": 0.2872, "grad_norm": 0.13698697090148926, "learning_rate": 1.7939903986478354e-05, "loss": 0.0416, "step": 718 },
    { "epoch": 0.288, "grad_norm": 0.01778518594801426, "learning_rate": 1.792289643355191e-05, "loss": 0.0129, "step": 720 },
    { "epoch": 0.2888, "grad_norm": 0.2408394068479538, "learning_rate": 1.7905827096287532e-05, "loss": 0.0963, "step": 722 },
    { "epoch": 0.2896, "grad_norm": 0.025463353842496872, "learning_rate": 1.7888696107795343e-05, "loss": 0.6419, "step": 724 },
    { "epoch": 0.2904, "grad_norm": 1.5628716945648193, "learning_rate": 1.7871503601666233e-05, "loss": 0.4731, "step": 726 },
    { "epoch": 0.2912, "grad_norm": 0.07346764206886292, "learning_rate": 1.785424971197082e-05, "loss": 0.0206, "step": 728 },
    { "epoch": 0.292, "grad_norm": 5.0355963706970215, "learning_rate": 1.78369345732584e-05, "loss": 0.2513, "step": 730 },
    { "epoch": 0.2928, "grad_norm": 1.0326207876205444, "learning_rate": 1.7819558320555902e-05, "loss": 1.2558, "step": 732 },
    { "epoch": 0.2936, "grad_norm": 0.24173852801322937, "learning_rate": 1.780212108936684e-05, "loss": 0.0767, "step": 734 },
    { "epoch": 0.2944, "grad_norm": 0.20275752246379852, "learning_rate": 1.7784623015670237e-05, "loss": 0.059, "step": 736 },
    { "epoch": 0.2952, "grad_norm": 0.1756519377231598, "learning_rate": 1.7767064235919594e-05, "loss": 0.1225, "step": 738 },
    { "epoch": 0.296, "grad_norm": 0.09066800028085709, "learning_rate": 1.7749444887041797e-05, "loss": 0.2309, "step": 740 },
    { "epoch": 0.2968, "grad_norm": 2.341961622238159, "learning_rate": 1.7731765106436073e-05, "loss": 0.5643, "step": 742 },
    { "epoch": 0.2976, "grad_norm": 0.09298836439847946, "learning_rate": 1.7714025031972904e-05, "loss": 0.272, "step": 744 },
    { "epoch": 0.2984, "grad_norm": 0.15143701434135437, "learning_rate": 1.7696224801992947e-05, "loss": 0.0548, "step": 746 },
    { "epoch": 0.2992, "grad_norm": 1.298582911491394, "learning_rate": 1.767836455530598e-05, "loss": 0.2634, "step": 748 },
    { "epoch": 0.3, "grad_norm": 0.12245524674654007, "learning_rate": 1.766044443118978e-05, "loss": 0.113, "step": 750 },
    { "epoch": 0.3008, "grad_norm": 0.08619975298643112, "learning_rate": 1.764246456938909e-05, "loss": 0.0373, "step": 752 },
    { "epoch": 0.3016, "grad_norm": 1.6767048835754395, "learning_rate": 1.762442511011448e-05, "loss": 0.3271, "step": 754 },
    { "epoch": 0.3024, "grad_norm": 0.0463847815990448, "learning_rate": 1.7606326194041274e-05, "loss": 0.3972, "step": 756 },
    { "epoch": 0.3032, "grad_norm": 0.018276596441864967, "learning_rate": 1.7588167962308458e-05, "loss": 0.0141, "step": 758 },
    { "epoch": 0.304, "grad_norm": 0.1315947324037552, "learning_rate": 1.7569950556517566e-05, "loss": 0.0556, "step": 760 },
    { "epoch": 0.3048, "grad_norm": 0.5924662947654724, "learning_rate": 1.7551674118731592e-05, "loss": 0.1275, "step": 762 },
    { "epoch": 0.3056, "grad_norm": 0.23044385015964508, "learning_rate": 1.7533338791473872e-05, "loss": 0.0416, "step": 764 },
    { "epoch": 0.3064, "grad_norm": 1.4832113981246948, "learning_rate": 1.7514944717726962e-05, "loss": 0.4879, "step": 766 },
    { "epoch": 0.3072, "grad_norm": 0.08857022970914841, "learning_rate": 1.749649204093155e-05, "loss": 0.042, "step": 768 },
    { "epoch": 0.308, "grad_norm": 0.24603064358234406, "learning_rate": 1.747798090498532e-05, "loss": 0.2623, "step": 770 },
    { "epoch": 0.3088, "grad_norm": 0.08749913424253464, "learning_rate": 1.7459411454241822e-05, "loss": 0.0188, "step": 772 },
    { "epoch": 0.3096, "grad_norm": 0.10140682011842728, "learning_rate": 1.7440783833509366e-05, "loss": 0.0401, "step": 774 },
    { "epoch": 0.3104, "grad_norm": 0.027609799057245255, "learning_rate": 1.7422098188049885e-05, "loss": 0.0173, "step": 776 },
    { "epoch": 0.3112, "grad_norm": 1.5163832902908325, "learning_rate": 1.7403354663577782e-05, "loss": 0.654, "step": 778 },
    { "epoch": 0.312, "grad_norm": 0.4432819187641144, "learning_rate": 1.7384553406258842e-05, "loss": 0.0778, "step": 780 },
    { "epoch": 0.3128, "grad_norm": 0.006748716812580824, "learning_rate": 1.7365694562709034e-05, "loss": 0.0119, "step": 782 },
    { "epoch": 0.3136, "grad_norm": 0.023698939010500908, "learning_rate": 1.7346778279993417e-05, "loss": 0.013, "step": 784 },
    { "epoch": 0.3144, "grad_norm": 0.01780467852950096, "learning_rate": 1.732780470562496e-05, "loss": 0.015, "step": 786 },
    { "epoch": 0.3152, "grad_norm": 1.4911342859268188, "learning_rate": 1.7308773987563406e-05, "loss": 0.1485, "step": 788 },
    { "epoch": 0.316, "grad_norm": 0.030542919412255287, "learning_rate": 1.7289686274214116e-05, "loss": 0.0136, "step": 790 },
    { "epoch": 0.3168, "grad_norm": 0.039021674543619156, "learning_rate": 1.727054171442692e-05, "loss": 0.0715, "step": 792 },
    { "epoch": 0.3176, "grad_norm": 0.24402689933776855, "learning_rate": 1.7251340457494934e-05, "loss": 0.5099, "step": 794 },
    { "epoch": 0.3184, "grad_norm": 2.137239456176758, "learning_rate": 1.7232082653153422e-05, "loss": 0.5133, "step": 796 },
    { "epoch": 0.3192, "grad_norm": 2.8100578784942627, "learning_rate": 1.721276845157861e-05, "loss": 0.428, "step": 798 },
    { "epoch": 0.32, "grad_norm": 0.1050364151597023, "learning_rate": 1.7193398003386514e-05, "loss": 0.0221, "step": 800 },
    { "epoch": 0.3208, "grad_norm": 0.02302808314561844, "learning_rate": 1.717397145963179e-05, "loss": 0.0524, "step": 802 },
    { "epoch": 0.3216, "grad_norm": 0.5185064077377319, "learning_rate": 1.715448897180652e-05, "loss": 0.0452, "step": 804 },
    { "epoch": 0.3224, "grad_norm": 0.49271076917648315, "learning_rate": 1.7134950691839063e-05, "loss": 0.1355, "step": 806 },
    { "epoch": 0.3232,
|
"grad_norm": 0.2875131666660309, |
|
"learning_rate": 1.7115356772092858e-05, |
|
"loss": 0.5409, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.324, |
|
"grad_norm": 2.0039379596710205, |
|
"learning_rate": 1.709570736536521e-05, |
|
"loss": 0.2663, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.3248, |
|
"grad_norm": 1.235532522201538, |
|
"learning_rate": 1.7076002624886156e-05, |
|
"loss": 0.391, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.3256, |
|
"grad_norm": 0.9183419942855835, |
|
"learning_rate": 1.705624270431721e-05, |
|
"loss": 0.1418, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.5461841821670532, |
|
"learning_rate": 1.7036427757750205e-05, |
|
"loss": 0.1603, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.3272, |
|
"grad_norm": 0.373760849237442, |
|
"learning_rate": 1.7016557939706075e-05, |
|
"loss": 0.0709, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 0.40094730257987976, |
|
"learning_rate": 1.6996633405133656e-05, |
|
"loss": 0.0659, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3288, |
|
"grad_norm": 0.05968625098466873, |
|
"learning_rate": 1.6976654309408464e-05, |
|
"loss": 0.0307, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 0.22513960301876068, |
|
"learning_rate": 1.695662080833151e-05, |
|
"loss": 0.5364, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.3304, |
|
"grad_norm": 0.2083749920129776, |
|
"learning_rate": 1.693653305812805e-05, |
|
"loss": 0.0446, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.3312, |
|
"grad_norm": 0.00946067925542593, |
|
"learning_rate": 1.6916391215446403e-05, |
|
"loss": 0.0212, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.332, |
|
"grad_norm": 0.08354925364255905, |
|
"learning_rate": 1.68961954373567e-05, |
|
"loss": 0.0427, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 0.04629332199692726, |
|
"learning_rate": 1.6875945881349676e-05, |
|
"loss": 0.0264, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.3336, |
|
"grad_norm": 0.05332889407873154, |
|
"learning_rate": 1.6855642705335438e-05, |
|
"loss": 0.0174, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.3344, |
|
"grad_norm": 0.04525836184620857, |
|
"learning_rate": 1.6835286067642228e-05, |
|
"loss": 0.0188, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.3352, |
|
"grad_norm": 1.4626930952072144, |
|
"learning_rate": 1.68148761270152e-05, |
|
"loss": 0.3748, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.023244062438607216, |
|
"learning_rate": 1.6794413042615168e-05, |
|
"loss": 0.1176, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.3368, |
|
"grad_norm": 0.06457129120826721, |
|
"learning_rate": 1.6773896974017373e-05, |
|
"loss": 0.5326, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.3376, |
|
"grad_norm": 0.0086558498442173, |
|
"learning_rate": 1.6753328081210244e-05, |
|
"loss": 0.1256, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.3384, |
|
"grad_norm": 0.005909595172852278, |
|
"learning_rate": 1.6732706524594138e-05, |
|
"loss": 0.0242, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.0692615732550621, |
|
"learning_rate": 1.6712032464980094e-05, |
|
"loss": 0.0237, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.266417980194092, |
|
"learning_rate": 1.6691306063588583e-05, |
|
"loss": 0.3703, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3408, |
|
"grad_norm": 2.5376994609832764, |
|
"learning_rate": 1.6670527482048246e-05, |
|
"loss": 0.3166, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.3416, |
|
"grad_norm": 0.02030259743332863, |
|
"learning_rate": 1.6649696882394635e-05, |
|
"loss": 0.0218, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 0.09955969452857971, |
|
"learning_rate": 1.6628814427068954e-05, |
|
"loss": 0.0158, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.3432, |
|
"grad_norm": 1.0117660760879517, |
|
"learning_rate": 1.6607880278916778e-05, |
|
"loss": 0.4415, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 0.7470207214355469, |
|
"learning_rate": 1.6586894601186804e-05, |
|
"loss": 0.0473, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3448, |
|
"grad_norm": 2.56488299369812, |
|
"learning_rate": 1.6565857557529567e-05, |
|
"loss": 0.5677, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.06736627966165543, |
|
"learning_rate": 1.654476931199615e-05, |
|
"loss": 0.0131, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.3464, |
|
"grad_norm": 0.2571139931678772, |
|
"learning_rate": 1.652363002903693e-05, |
|
"loss": 0.1469, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.3472, |
|
"grad_norm": 0.21657270193099976, |
|
"learning_rate": 1.650243987350029e-05, |
|
"loss": 0.0489, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.348, |
|
"grad_norm": 0.02142491564154625, |
|
"learning_rate": 1.6481199010631312e-05, |
|
"loss": 0.0332, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 0.1039031520485878, |
|
"learning_rate": 1.6459907606070513e-05, |
|
"loss": 0.2827, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.3496, |
|
"grad_norm": 0.07285499572753906, |
|
"learning_rate": 1.643856582585254e-05, |
|
"loss": 0.0334, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.3504, |
|
"grad_norm": 0.0627899169921875, |
|
"learning_rate": 1.6417173836404888e-05, |
|
"loss": 0.0265, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.3512, |
|
"grad_norm": 0.021432699635624886, |
|
"learning_rate": 1.6395731804546582e-05, |
|
"loss": 0.0292, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.07570278644561768, |
|
"learning_rate": 1.63742398974869e-05, |
|
"loss": 0.0153, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3528, |
|
"grad_norm": 0.0861375704407692, |
|
"learning_rate": 1.6352698282824045e-05, |
|
"loss": 0.0148, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.3536, |
|
"grad_norm": 0.12202827632427216, |
|
"learning_rate": 1.6331107128543856e-05, |
|
"loss": 0.0365, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.3544, |
|
"grad_norm": 0.049717921763658524, |
|
"learning_rate": 1.6309466603018497e-05, |
|
"loss": 0.0202, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.3552, |
|
"grad_norm": 2.4151523113250732, |
|
"learning_rate": 1.628777687500513e-05, |
|
"loss": 0.4901, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.356, |
|
"grad_norm": 0.03276515007019043, |
|
"learning_rate": 1.6266038113644605e-05, |
|
"loss": 0.0243, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.3568, |
|
"grad_norm": 0.09225738793611526, |
|
"learning_rate": 1.624425048846016e-05, |
|
"loss": 0.0916, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.3576, |
|
"grad_norm": 0.06391174346208572, |
|
"learning_rate": 1.6222414169356066e-05, |
|
"loss": 0.0105, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 0.03721031919121742, |
|
"learning_rate": 1.620052932661633e-05, |
|
"loss": 0.0123, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.3592, |
|
"grad_norm": 0.2955784201622009, |
|
"learning_rate": 1.6178596130903345e-05, |
|
"loss": 1.2386, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.05584976449608803, |
|
"learning_rate": 1.6156614753256583e-05, |
|
"loss": 0.0926, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3608, |
|
"grad_norm": 1.0190123319625854, |
|
"learning_rate": 1.6134585365091243e-05, |
|
"loss": 0.1162, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.3616, |
|
"grad_norm": 0.09299857914447784, |
|
"learning_rate": 1.611250813819692e-05, |
|
"loss": 0.0425, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.3624, |
|
"grad_norm": 0.057083725929260254, |
|
"learning_rate": 1.6090383244736256e-05, |
|
"loss": 0.0206, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.3632, |
|
"grad_norm": 0.006012015510350466, |
|
"learning_rate": 1.6068210857243625e-05, |
|
"loss": 0.0242, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.364, |
|
"grad_norm": 0.04129006341099739, |
|
"learning_rate": 1.6045991148623752e-05, |
|
"loss": 0.0422, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.049417100846767426, |
|
"learning_rate": 1.6023724292150387e-05, |
|
"loss": 0.0371, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.3656, |
|
"grad_norm": 0.13303126394748688, |
|
"learning_rate": 1.6001410461464955e-05, |
|
"loss": 0.2441, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.3664, |
|
"grad_norm": 0.1420883685350418, |
|
"learning_rate": 1.597904983057519e-05, |
|
"loss": 0.0301, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.3672, |
|
"grad_norm": 0.02255750633776188, |
|
"learning_rate": 1.5956642573853784e-05, |
|
"loss": 0.1678, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 2.711224317550659, |
|
"learning_rate": 1.5934188866037017e-05, |
|
"loss": 0.2587, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.3688, |
|
"grad_norm": 0.015457335859537125, |
|
"learning_rate": 1.591168888222342e-05, |
|
"loss": 0.0161, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.3696, |
|
"grad_norm": 0.021741507574915886, |
|
"learning_rate": 1.5889142797872387e-05, |
|
"loss": 0.0163, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.3704, |
|
"grad_norm": 2.3807716369628906, |
|
"learning_rate": 1.5866550788802815e-05, |
|
"loss": 0.2696, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 1.5936973094940186, |
|
"learning_rate": 1.5843913031191722e-05, |
|
"loss": 0.4635, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.372, |
|
"grad_norm": 0.18653613328933716, |
|
"learning_rate": 1.5821229701572897e-05, |
|
"loss": 0.0413, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.3728, |
|
"grad_norm": 0.048368774354457855, |
|
"learning_rate": 1.5798500976835493e-05, |
|
"loss": 0.0504, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.3736, |
|
"grad_norm": 1.1611881256103516, |
|
"learning_rate": 1.5775727034222675e-05, |
|
"loss": 0.1021, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.3744, |
|
"grad_norm": 0.03345046192407608, |
|
"learning_rate": 1.575290805133023e-05, |
|
"loss": 0.0152, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.3752, |
|
"grad_norm": 0.035641297698020935, |
|
"learning_rate": 1.5730044206105156e-05, |
|
"loss": 0.0111, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 0.1988631784915924, |
|
"learning_rate": 1.570713567684432e-05, |
|
"loss": 0.0267, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3768, |
|
"grad_norm": 0.10902003198862076, |
|
"learning_rate": 1.568418264219303e-05, |
|
"loss": 0.063, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 0.25664544105529785, |
|
"learning_rate": 1.5661185281143666e-05, |
|
"loss": 0.6095, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.3784, |
|
"grad_norm": 0.02122955210506916, |
|
"learning_rate": 1.5638143773034268e-05, |
|
"loss": 0.0116, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.3792, |
|
"grad_norm": 0.05375152826309204, |
|
"learning_rate": 1.5615058297547144e-05, |
|
"loss": 0.2848, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5861246585845947, |
|
"learning_rate": 1.5591929034707468e-05, |
|
"loss": 0.0492, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3808, |
|
"grad_norm": 0.2694084048271179, |
|
"learning_rate": 1.556875616488188e-05, |
|
"loss": 0.0587, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.3816, |
|
"grad_norm": 0.1376236230134964, |
|
"learning_rate": 1.5545539868777075e-05, |
|
"loss": 0.094, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.3824, |
|
"grad_norm": 0.03570554405450821, |
|
"learning_rate": 1.5522280327438388e-05, |
|
"loss": 0.0584, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.3832, |
|
"grad_norm": 0.01179492473602295, |
|
"learning_rate": 1.54989777222484e-05, |
|
"loss": 0.4026, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.4097414016723633, |
|
"learning_rate": 1.5475632234925505e-05, |
|
"loss": 0.0395, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3848, |
|
"grad_norm": 0.021319517865777016, |
|
"learning_rate": 1.5452244047522504e-05, |
|
"loss": 0.0063, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.3856, |
|
"grad_norm": 0.008077614940702915, |
|
"learning_rate": 1.5428813342425177e-05, |
|
"loss": 0.0087, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.3864, |
|
"grad_norm": 0.23517374694347382, |
|
"learning_rate": 1.540534030235087e-05, |
|
"loss": 0.0319, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.3872, |
|
"grad_norm": 0.5178306102752686, |
|
"learning_rate": 1.5381825110347072e-05, |
|
"loss": 0.077, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.388, |
|
"grad_norm": 0.15628471970558167, |
|
"learning_rate": 1.5358267949789968e-05, |
|
"loss": 0.026, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.3888, |
|
"grad_norm": 0.32474571466445923, |
|
"learning_rate": 1.533466900438303e-05, |
|
"loss": 0.049, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.3896, |
|
"grad_norm": 0.024446366354823112, |
|
"learning_rate": 1.5311028458155567e-05, |
|
"loss": 0.1731, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 0.1003560721874237, |
|
"learning_rate": 1.528734649546132e-05, |
|
"loss": 0.0142, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.3912, |
|
"grad_norm": 0.05430926755070686, |
|
"learning_rate": 1.526362330097698e-05, |
|
"loss": 0.1951, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 0.0365879088640213, |
|
"learning_rate": 1.5239859059700794e-05, |
|
"loss": 0.0093, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.3928, |
|
"grad_norm": 0.006135095842182636, |
|
"learning_rate": 1.5216053956951081e-05, |
|
"loss": 0.0098, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.3936, |
|
"grad_norm": 0.25733840465545654, |
|
"learning_rate": 1.5192208178364815e-05, |
|
"loss": 0.0337, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.3944, |
|
"grad_norm": 1.6840555667877197, |
|
"learning_rate": 1.5168321909896171e-05, |
|
"loss": 0.0434, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.3952, |
|
"grad_norm": 0.12561841309070587, |
|
"learning_rate": 1.5144395337815066e-05, |
|
"loss": 1.176, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.396, |
|
"grad_norm": 0.03084614872932434, |
|
"learning_rate": 1.5120428648705716e-05, |
|
"loss": 0.0051, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 0.08889269083738327, |
|
"learning_rate": 1.5096422029465178e-05, |
|
"loss": 0.0162, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.3976, |
|
"grad_norm": 0.012536651454865932, |
|
"learning_rate": 1.5072375667301893e-05, |
|
"loss": 0.0087, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.3984, |
|
"grad_norm": 0.08808624744415283, |
|
"learning_rate": 1.504828974973422e-05, |
|
"loss": 0.254, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.3992, |
|
"grad_norm": 0.00895402766764164, |
|
"learning_rate": 1.5024164464588982e-05, |
|
"loss": 0.0067, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.01615263894200325, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.0474, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4008, |
|
"grad_norm": 0.11077135056257248, |
|
"learning_rate": 1.4975796544406627e-05, |
|
"loss": 0.0484, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.4016, |
|
"grad_norm": 1.5227055549621582, |
|
"learning_rate": 1.4951554286552266e-05, |
|
"loss": 0.88, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.4024, |
|
"grad_norm": 0.11261726170778275, |
|
"learning_rate": 1.4927273415482916e-05, |
|
"loss": 0.0309, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 3.3870084285736084, |
|
"learning_rate": 1.4902954120545687e-05, |
|
"loss": 0.2266, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.404, |
|
"grad_norm": 0.6803337335586548, |
|
"learning_rate": 1.4878596591387329e-05, |
|
"loss": 0.0631, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.4048, |
|
"grad_norm": 0.2337927520275116, |
|
"learning_rate": 1.485420101795274e-05, |
|
"loss": 0.0601, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.4056, |
|
"grad_norm": 2.0759363174438477, |
|
"learning_rate": 1.4829767590483508e-05, |
|
"loss": 0.1077, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.4064, |
|
"grad_norm": 0.16511759161949158, |
|
"learning_rate": 1.4805296499516408e-05, |
|
"loss": 0.0283, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.4072, |
|
"grad_norm": 0.013055981136858463, |
|
"learning_rate": 1.4780787935881925e-05, |
|
"loss": 0.0067, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 1.3341008424758911, |
|
"learning_rate": 1.4756242090702756e-05, |
|
"loss": 0.1319, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.4088, |
|
"grad_norm": 0.21409232914447784, |
|
"learning_rate": 1.4731659155392332e-05, |
|
"loss": 0.0223, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 0.007527098525315523, |
|
"learning_rate": 1.470703932165333e-05, |
|
"loss": 0.008, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.4104, |
|
"grad_norm": 0.02161354571580887, |
|
"learning_rate": 1.4682382781476146e-05, |
|
"loss": 0.0093, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.4112, |
|
"grad_norm": 3.1243057250976562, |
|
"learning_rate": 1.4657689727137443e-05, |
|
"loss": 0.2774, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.412, |
|
"grad_norm": 0.7081921696662903, |
|
"learning_rate": 1.463296035119862e-05, |
|
"loss": 0.3953, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.4128, |
|
"grad_norm": 0.23352546989917755, |
|
"learning_rate": 1.4608194846504311e-05, |
|
"loss": 0.0156, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.4136, |
|
"grad_norm": 0.44725605845451355, |
|
"learning_rate": 1.4583393406180898e-05, |
|
"loss": 0.0288, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.4144, |
|
"grad_norm": 0.10920588672161102, |
|
"learning_rate": 1.4558556223635004e-05, |
|
"loss": 0.0283, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.4152, |
|
"grad_norm": 0.004958951845765114, |
|
"learning_rate": 1.4533683492551954e-05, |
|
"loss": 0.0138, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.009044544771313667, |
|
"learning_rate": 1.4508775406894308e-05, |
|
"loss": 0.0509, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.4168, |
|
"grad_norm": 4.271718502044678, |
|
"learning_rate": 1.4483832160900326e-05, |
|
"loss": 0.6421, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.4176, |
|
"grad_norm": 0.005885216407477856, |
|
"learning_rate": 1.4458853949082443e-05, |
|
"loss": 0.0097, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.4184, |
|
"grad_norm": 0.004338196478784084, |
|
"learning_rate": 1.4433840966225772e-05, |
|
"loss": 0.0071, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.4192, |
|
"grad_norm": 0.0037080624606460333, |
|
"learning_rate": 1.4408793407386587e-05, |
|
"loss": 0.012, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.032303668558597565, |
|
"learning_rate": 1.4383711467890776e-05, |
|
"loss": 0.0057, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4208, |
|
"grad_norm": 0.006096419878304005, |
|
"learning_rate": 1.4358595343332342e-05, |
|
"loss": 0.0036, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.4216, |
|
"grad_norm": 0.018376469612121582, |
|
"learning_rate": 1.4333445229571874e-05, |
|
"loss": 0.0039, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.05468170344829559, |
|
"learning_rate": 1.4308261322735006e-05, |
|
"loss": 0.0417, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.4232, |
|
"grad_norm": 0.033594775944948196, |
|
"learning_rate": 1.4283043819210905e-05, |
|
"loss": 0.0169, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 0.006937976460903883, |
|
"learning_rate": 1.4257792915650728e-05, |
|
"loss": 0.0705, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4248, |
|
"grad_norm": 0.01067473366856575, |
|
"learning_rate": 1.4232508808966097e-05, |
|
"loss": 0.0099, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.4256, |
|
"grad_norm": 0.032820601016283035, |
|
"learning_rate": 1.420719169632755e-05, |
|
"loss": 0.0063, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.4264, |
|
"grad_norm": 4.574073314666748, |
|
"learning_rate": 1.4181841775163014e-05, |
|
"loss": 1.646, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.4272, |
|
"grad_norm": 2.232372999191284, |
|
"learning_rate": 1.415645924315628e-05, |
|
"loss": 0.0774, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.428, |
|
"grad_norm": 0.007644816767424345, |
|
"learning_rate": 1.413104429824542e-05, |
|
"loss": 0.0042, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 0.041802894324064255, |
|
"learning_rate": 1.4105597138621281e-05, |
|
"loss": 0.0084, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.4296, |
|
"grad_norm": 1.1193236112594604, |
|
"learning_rate": 1.4080117962725929e-05, |
|
"loss": 0.5387, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.4304, |
|
"grad_norm": 0.04370833560824394, |
|
"learning_rate": 1.4054606969251095e-05, |
|
"loss": 0.0382, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.4312, |
|
"grad_norm": 0.9142640233039856, |
|
"learning_rate": 1.4029064357136628e-05, |
|
"loss": 0.8189, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.30186763405799866, |
|
"learning_rate": 1.4003490325568953e-05, |
|
"loss": 0.0461, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.4328, |
|
"grad_norm": 0.05774744972586632, |
|
"learning_rate": 1.39778850739795e-05, |
|
"loss": 0.6106, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.4336, |
|
"grad_norm": 0.3426492214202881, |
|
"learning_rate": 1.3952248802043166e-05, |
|
"loss": 0.0866, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.4344, |
|
"grad_norm": 0.5425324440002441, |
|
"learning_rate": 1.3926581709676752e-05, |
|
"loss": 0.1086, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.10301554948091507, |
|
"learning_rate": 1.3900883997037398e-05, |
|
"loss": 0.0468, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.436, |
|
"grad_norm": 1.1397227048873901, |
|
"learning_rate": 1.3875155864521031e-05, |
|
"loss": 0.2403, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.4368, |
|
"grad_norm": 0.02772480994462967, |
|
"learning_rate": 1.3849397512760797e-05, |
|
"loss": 0.1271, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.4376, |
|
"grad_norm": 0.10023550689220428, |
|
"learning_rate": 1.3823609142625492e-05, |
|
"loss": 0.0979, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.4384, |
|
"grad_norm": 1.5255858898162842, |
|
"learning_rate": 1.3797790955218014e-05, |
|
"loss": 0.7019, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.4392, |
|
"grad_norm": 0.0757753774523735, |
|
"learning_rate": 1.3771943151873768e-05, |
|
"loss": 0.0422, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.053413309156894684, |
|
"learning_rate": 1.3746065934159123e-05, |
|
"loss": 0.4052, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.4408, |
|
"grad_norm": 0.061455827206373215, |
|
"learning_rate": 1.3720159503869816e-05, |
|
"loss": 0.0402, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 0.3662703335285187, |
|
"learning_rate": 1.3694224063029396e-05, |
|
"loss": 0.0559, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.4424, |
|
"grad_norm": 0.09608737379312515, |
|
"learning_rate": 1.3668259813887644e-05, |
|
"loss": 0.0703, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.4432, |
|
"grad_norm": 0.09362676739692688, |
|
"learning_rate": 1.3642266958918985e-05, |
|
"loss": 0.0191, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.444, |
|
"grad_norm": 0.03751781955361366, |
|
"learning_rate": 1.3616245700820922e-05, |
|
"loss": 0.1236, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.4448, |
|
"grad_norm": 2.1613173484802246, |
|
"learning_rate": 1.3590196242512463e-05, |
|
"loss": 0.6019, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.4456, |
|
"grad_norm": 0.03422972559928894, |
|
"learning_rate": 1.3564118787132507e-05, |
|
"loss": 0.0138, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 0.4464, |
|
"grad_norm": 0.01046049501746893, |
|
"learning_rate": 1.3538013538038295e-05, |
|
"loss": 0.0101, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.4472, |
|
"grad_norm": 0.7763115167617798, |
|
"learning_rate": 1.3511880698803801e-05, |
|
"loss": 0.1013, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.2062385082244873, |
|
"learning_rate": 1.3485720473218153e-05, |
|
"loss": 0.0343, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.4488, |
|
"grad_norm": 0.3189225196838379, |
|
"learning_rate": 1.3459533065284049e-05, |
|
"loss": 0.0661, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.4496, |
|
"grad_norm": 0.9145419597625732, |
|
"learning_rate": 1.3433318679216154e-05, |
|
"loss": 0.555, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.4504, |
|
"grad_norm": 1.2541166543960571, |
|
"learning_rate": 1.340707751943952e-05, |
|
"loss": 0.5902, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 0.4512, |
|
"grad_norm": 0.02262170799076557, |
|
"learning_rate": 1.3380809790587975e-05, |
|
"loss": 0.0191, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.452, |
|
"grad_norm": 0.4805641174316406, |
|
"learning_rate": 1.3354515697502552e-05, |
|
"loss": 0.0719, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.4528, |
|
"grad_norm": 0.0759739875793457, |
|
"learning_rate": 1.3328195445229869e-05, |
|
"loss": 0.035, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 0.4536, |
|
"grad_norm": 0.0279012992978096, |
|
"learning_rate": 1.3301849239020537e-05, |
|
"loss": 0.0732, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 0.040366217494010925, |
|
"learning_rate": 1.327547728432757e-05, |
|
"loss": 0.036, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.4552, |
|
"grad_norm": 1.5210012197494507, |
|
"learning_rate": 1.3249079786804765e-05, |
|
"loss": 0.1844, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 0.26963767409324646, |
|
"learning_rate": 1.3222656952305113e-05, |
|
"loss": 0.0458, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.4568, |
|
"grad_norm": 2.5474393367767334, |
|
"learning_rate": 1.319620898687918e-05, |
|
"loss": 0.4129, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 0.4576, |
|
"grad_norm": 0.021995197981595993, |
|
"learning_rate": 1.316973609677352e-05, |
|
"loss": 0.0168, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.4584, |
|
"grad_norm": 0.0323915109038353, |
|
"learning_rate": 1.3143238488429042e-05, |
|
"loss": 0.109, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 0.4592, |
|
"grad_norm": 0.050105806440114975, |
|
"learning_rate": 1.3116716368479418e-05, |
|
"loss": 0.0305, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.13928213715553284, |
|
"learning_rate": 1.3090169943749475e-05, |
|
"loss": 0.0237, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.06462297588586807, |
|
"learning_rate": 1.306359942125356e-05, |
|
"loss": 0.0192, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.4616, |
|
"grad_norm": 0.022758983075618744, |
|
"learning_rate": 1.3037005008193944e-05, |
|
"loss": 0.7191, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 0.4624, |
|
"grad_norm": 0.03799004480242729, |
|
"learning_rate": 1.3010386911959207e-05, |
|
"loss": 0.0214, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 0.4632, |
|
"grad_norm": 0.2102702260017395, |
|
"learning_rate": 1.2983745340122604e-05, |
|
"loss": 0.1163, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.08294696360826492, |
|
"learning_rate": 1.2957080500440469e-05, |
|
"loss": 0.0268, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.4648, |
|
"grad_norm": 0.04941265285015106, |
|
"learning_rate": 1.2930392600850574e-05, |
|
"loss": 0.0154, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.4656, |
|
"grad_norm": 0.05030106008052826, |
|
"learning_rate": 1.2903681849470528e-05, |
|
"loss": 0.1079, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.4664, |
|
"grad_norm": 0.11340274661779404, |
|
"learning_rate": 1.287694845459613e-05, |
|
"loss": 0.018, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 0.06605678051710129, |
|
"learning_rate": 1.2850192624699762e-05, |
|
"loss": 0.0336, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 0.468, |
|
"grad_norm": 1.597208023071289, |
|
"learning_rate": 1.2823414568428767e-05, |
|
"loss": 0.7895, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.4688, |
|
"grad_norm": 0.12116717547178268, |
|
"learning_rate": 1.27966144946038e-05, |
|
"loss": 0.026, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 0.4696, |
|
"grad_norm": 0.04143265634775162, |
|
"learning_rate": 1.2769792612217224e-05, |
|
"loss": 0.1011, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 0.4704, |
|
"grad_norm": 0.09404078125953674, |
|
"learning_rate": 1.2742949130431468e-05, |
|
"loss": 0.0211, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.4712, |
|
"grad_norm": 0.024886123836040497, |
|
"learning_rate": 1.2716084258577388e-05, |
|
"loss": 0.012, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 0.05635695159435272, |
|
"learning_rate": 1.2689198206152657e-05, |
|
"loss": 0.0137, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.4728, |
|
"grad_norm": 0.09039817750453949, |
|
"learning_rate": 1.2662291182820115e-05, |
|
"loss": 0.0537, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 0.41814690828323364, |
|
"learning_rate": 1.263536339840613e-05, |
|
"loss": 0.0814, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 0.4744, |
|
"grad_norm": 0.23532457649707794, |
|
"learning_rate": 1.2608415062898971e-05, |
|
"loss": 0.0322, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 0.4752, |
|
"grad_norm": 0.009926537983119488, |
|
"learning_rate": 1.2581446386447178e-05, |
|
"loss": 0.1281, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 0.476, |
|
"grad_norm": 0.4470027983188629, |
|
"learning_rate": 1.2554457579357906e-05, |
|
"loss": 0.0478, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.4768, |
|
"grad_norm": 0.05581334978342056, |
|
"learning_rate": 1.2527448852095295e-05, |
|
"loss": 0.0226, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 0.4776, |
|
"grad_norm": 0.06944628804922104, |
|
"learning_rate": 1.2500420415278822e-05, |
|
"loss": 0.0229, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 0.4784, |
|
"grad_norm": 0.11777735501527786, |
|
"learning_rate": 1.2473372479681671e-05, |
|
"loss": 0.0189, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 0.4792, |
|
"grad_norm": 0.37742823362350464, |
|
"learning_rate": 1.2446305256229074e-05, |
|
"loss": 0.2176, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.1584476232528687, |
|
"learning_rate": 1.2419218955996677e-05, |
|
"loss": 0.5342, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4808, |
|
"grad_norm": 0.21147167682647705, |
|
"learning_rate": 1.2392113790208895e-05, |
|
"loss": 0.027, |
|
"step": 1202 |
|
}, |
|
{ |
|
"epoch": 0.4816, |
|
"grad_norm": 0.061478614807128906, |
|
"learning_rate": 1.236498997023725e-05, |
|
"loss": 0.027, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 0.4824, |
|
"grad_norm": 0.02769525721669197, |
|
"learning_rate": 1.2337847707598738e-05, |
|
"loss": 0.0325, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 0.4832, |
|
"grad_norm": 0.6708433628082275, |
|
"learning_rate": 1.2310687213954182e-05, |
|
"loss": 0.1057, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 0.484, |
|
"grad_norm": 0.08236505091190338, |
|
"learning_rate": 1.2283508701106559e-05, |
|
"loss": 0.0192, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.4848, |
|
"grad_norm": 0.01167634129524231, |
|
"learning_rate": 1.2256312380999376e-05, |
|
"loss": 0.3463, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 0.4856, |
|
"grad_norm": 0.03433239459991455, |
|
"learning_rate": 1.2229098465715005e-05, |
|
"loss": 0.3755, |
|
"step": 1214 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 1.2798956632614136, |
|
"learning_rate": 1.2201867167473015e-05, |
|
"loss": 0.3444, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 0.4872, |
|
"grad_norm": 0.003344225697219372, |
|
"learning_rate": 1.217461869862855e-05, |
|
"loss": 0.0047, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 0.006934499368071556, |
|
"learning_rate": 1.2147353271670634e-05, |
|
"loss": 0.0031, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.4888, |
|
"grad_norm": 0.04046230763196945, |
|
"learning_rate": 1.212007109922055e-05, |
|
"loss": 0.0566, |
|
"step": 1222 |
|
}, |
|
{ |
|
"epoch": 0.4896, |
|
"grad_norm": 1.6989425420761108, |
|
"learning_rate": 1.2092772394030153e-05, |
|
"loss": 0.3364, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 0.4904, |
|
"grad_norm": 0.6641682982444763, |
|
"learning_rate": 1.2065457368980236e-05, |
|
"loss": 0.0774, |
|
"step": 1226 |
|
}, |
|
{ |
|
"epoch": 0.4912, |
|
"grad_norm": 0.07043974846601486, |
|
"learning_rate": 1.203812623707885e-05, |
|
"loss": 0.0186, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 0.492, |
|
"grad_norm": 0.01750028319656849, |
|
"learning_rate": 1.2010779211459649e-05, |
|
"loss": 0.1607, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.4928, |
|
"grad_norm": 0.14301519095897675, |
|
"learning_rate": 1.1983416505380234e-05, |
|
"loss": 0.0485, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.4936, |
|
"grad_norm": 0.046129919588565826, |
|
"learning_rate": 1.1956038332220484e-05, |
|
"loss": 0.0111, |
|
"step": 1234 |
|
}, |
|
{ |
|
"epoch": 0.4944, |
|
"grad_norm": 1.4883394241333008, |
|
"learning_rate": 1.192864490548089e-05, |
|
"loss": 0.5768, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 0.4952, |
|
"grad_norm": 1.122119665145874, |
|
"learning_rate": 1.1901236438780902e-05, |
|
"loss": 0.1481, |
|
"step": 1238 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.19744625687599182, |
|
"learning_rate": 1.187381314585725e-05, |
|
"loss": 0.1095, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4968, |
|
"grad_norm": 0.06524749845266342, |
|
"learning_rate": 1.184637524056227e-05, |
|
"loss": 0.1115, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 0.4976, |
|
"grad_norm": 0.024867022410035133, |
|
"learning_rate": 1.181892293686227e-05, |
|
"loss": 0.0095, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 0.4984, |
|
"grad_norm": 0.20855647325515747, |
|
"learning_rate": 1.1791456448835825e-05, |
|
"loss": 0.0315, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 0.009501822292804718, |
|
"learning_rate": 1.1763975990672125e-05, |
|
"loss": 0.0212, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.030127333477139473, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.0104, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5008, |
|
"grad_norm": 0.793460488319397, |
|
"learning_rate": 1.1708974021232768e-05, |
|
"loss": 0.1835, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 0.5016, |
|
"grad_norm": 0.1162368431687355, |
|
"learning_rate": 1.1681452938873516e-05, |
|
"loss": 0.0165, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 0.5024, |
|
"grad_norm": 0.03160820156335831, |
|
"learning_rate": 1.1653918744206478e-05, |
|
"loss": 0.0659, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 0.5032, |
|
"grad_norm": 2.536365509033203, |
|
"learning_rate": 1.1626371651948839e-05, |
|
"loss": 0.1661, |
|
"step": 1258 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 0.06747017800807953, |
|
"learning_rate": 1.159881187691835e-05, |
|
"loss": 0.0591, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.5048, |
|
"grad_norm": 0.10115383565425873, |
|
"learning_rate": 1.157123963403168e-05, |
|
"loss": 0.0216, |
|
"step": 1262 |
|
}, |
|
{ |
|
"epoch": 0.5056, |
|
"grad_norm": 0.017214270308613777, |
|
"learning_rate": 1.1543655138302714e-05, |
|
"loss": 0.0077, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 0.5064, |
|
"grad_norm": 0.21995119750499725, |
|
"learning_rate": 1.1516058604840891e-05, |
|
"loss": 0.0389, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 0.5072, |
|
"grad_norm": 0.01604562997817993, |
|
"learning_rate": 1.1488450248849523e-05, |
|
"loss": 0.0223, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 0.508, |
|
"grad_norm": 0.031004084274172783, |
|
"learning_rate": 1.1460830285624119e-05, |
|
"loss": 0.0159, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.5088, |
|
"grad_norm": 2.251960039138794, |
|
"learning_rate": 1.1433198930550694e-05, |
|
"loss": 0.2915, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 0.5096, |
|
"grad_norm": 0.03171626105904579, |
|
"learning_rate": 1.140555639910411e-05, |
|
"loss": 0.0069, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 0.5104, |
|
"grad_norm": 0.791187584400177, |
|
"learning_rate": 1.137790290684638e-05, |
|
"loss": 0.21, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 0.5112, |
|
"grad_norm": 0.016608070582151413, |
|
"learning_rate": 1.1350238669424993e-05, |
|
"loss": 0.0064, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.0038915143813937902, |
|
"learning_rate": 1.1322563902571227e-05, |
|
"loss": 0.002, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.5128, |
|
"grad_norm": 0.2625635862350464, |
|
"learning_rate": 1.129487882209847e-05, |
|
"loss": 0.0156, |
|
"step": 1282 |
|
}, |
|
{ |
|
"epoch": 0.5136, |
|
"grad_norm": 0.037254203110933304, |
|
"learning_rate": 1.1267183643900548e-05, |
|
"loss": 0.0056, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 0.5144, |
|
"grad_norm": 0.08309903740882874, |
|
"learning_rate": 1.1239478583950019e-05, |
|
"loss": 0.0056, |
|
"step": 1286 |
|
}, |
|
{ |
|
"epoch": 0.5152, |
|
"grad_norm": 1.2419265508651733, |
|
"learning_rate": 1.1211763858296507e-05, |
|
"loss": 0.1099, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.516, |
|
"grad_norm": 0.008148876950144768, |
|
"learning_rate": 1.1184039683065014e-05, |
|
"loss": 0.0064, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.5168, |
|
"grad_norm": 0.04519444331526756, |
|
"learning_rate": 1.1156306274454218e-05, |
|
"loss": 0.0066, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 0.5176, |
|
"grad_norm": 0.0015114143025130033, |
|
"learning_rate": 1.1128563848734817e-05, |
|
"loss": 0.001, |
|
"step": 1294 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 0.006196278613060713, |
|
"learning_rate": 1.1100812622247823e-05, |
|
"loss": 0.0032, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 0.5192, |
|
"grad_norm": 0.0075135682709515095, |
|
"learning_rate": 1.1073052811402867e-05, |
|
"loss": 0.6506, |
|
"step": 1298 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.030847422778606415, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 0.016, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5208, |
|
"grad_norm": 0.07433111220598221, |
|
"learning_rate": 1.1017508302610665e-05, |
|
"loss": 0.0134, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 0.5216, |
|
"grad_norm": 0.20086438953876495, |
|
"learning_rate": 1.0989724037810651e-05, |
|
"loss": 0.0139, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 0.5224, |
|
"grad_norm": 0.015072687529027462, |
|
"learning_rate": 1.0961932054943778e-05, |
|
"loss": 0.0192, |
|
"step": 1306 |
|
}, |
|
{ |
|
"epoch": 0.5232, |
|
"grad_norm": 0.5555530190467834, |
|
"learning_rate": 1.0934132570737508e-05, |
|
"loss": 0.0359, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 0.524, |
|
"grad_norm": 0.0621090903878212, |
|
"learning_rate": 1.0906325801977804e-05, |
|
"loss": 0.0126, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.5248, |
|
"grad_norm": 0.013316777534782887, |
|
"learning_rate": 1.0878511965507435e-05, |
|
"loss": 0.0085, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 0.5256, |
|
"grad_norm": 0.037804149091243744, |
|
"learning_rate": 1.0850691278224282e-05, |
|
"loss": 1.6496, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 0.5264, |
|
"grad_norm": 0.04232160374522209, |
|
"learning_rate": 1.0822863957079657e-05, |
|
"loss": 0.0068, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 0.5272, |
|
"grad_norm": 0.020469985902309418, |
|
"learning_rate": 1.07950302190766e-05, |
|
"loss": 0.1708, |
|
"step": 1318 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.04445105418562889, |
|
"learning_rate": 1.0767190281268187e-05, |
|
"loss": 0.1707, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.5288, |
|
"grad_norm": 1.9354827404022217, |
|
"learning_rate": 1.0739344360755853e-05, |
|
"loss": 0.1453, |
|
"step": 1322 |
|
}, |
|
{ |
|
"epoch": 0.5296, |
|
"grad_norm": 0.0484926700592041, |
|
"learning_rate": 1.071149267468767e-05, |
|
"loss": 0.0299, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 0.5304, |
|
"grad_norm": 0.02645016647875309, |
|
"learning_rate": 1.0683635440256689e-05, |
|
"loss": 0.0162, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 0.5312, |
|
"grad_norm": 0.05882829427719116, |
|
"learning_rate": 1.0655772874699217e-05, |
|
"loss": 0.0311, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 0.532, |
|
"grad_norm": 0.03740588575601578, |
|
"learning_rate": 1.0627905195293135e-05, |
|
"loss": 0.0254, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.5328, |
|
"grad_norm": 0.21913115680217743, |
|
"learning_rate": 1.0600032619356208e-05, |
|
"loss": 0.0732, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 0.5336, |
|
"grad_norm": 0.02981780469417572, |
|
"learning_rate": 1.0572155364244383e-05, |
|
"loss": 0.0603, |
|
"step": 1334 |
|
}, |
|
{ |
|
"epoch": 0.5344, |
|
"grad_norm": 0.3409574627876282, |
|
"learning_rate": 1.0544273647350091e-05, |
|
"loss": 0.0482, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 0.5352, |
|
"grad_norm": 1.3873035907745361, |
|
"learning_rate": 1.0516387686100566e-05, |
|
"loss": 0.2645, |
|
"step": 1338 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 0.9133062958717346, |
|
"learning_rate": 1.0488497697956134e-05, |
|
"loss": 0.6023, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.5368, |
|
"grad_norm": 0.014215713366866112, |
|
"learning_rate": 1.0460603900408523e-05, |
|
"loss": 0.0249, |
|
"step": 1342 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.013020013459026814, |
|
"learning_rate": 1.0432706510979172e-05, |
|
"loss": 0.0572, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 0.5384, |
|
"grad_norm": 0.07060626894235611, |
|
"learning_rate": 1.0404805747217525e-05, |
|
"loss": 0.0259, |
|
"step": 1346 |
|
}, |
|
{ |
|
"epoch": 0.5392, |
|
"grad_norm": 0.16781005263328552, |
|
"learning_rate": 1.0376901826699349e-05, |
|
"loss": 0.0441, |
|
"step": 1348 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.0290207639336586, |
|
"learning_rate": 1.0348994967025012e-05, |
|
"loss": 0.9059, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.5408, |
|
"grad_norm": 0.03659132868051529, |
|
"learning_rate": 1.0321085385817818e-05, |
|
"loss": 0.6523, |
|
"step": 1352 |
|
}, |
|
{ |
|
"epoch": 0.5416, |
|
"grad_norm": 0.10811670869588852, |
|
"learning_rate": 1.0293173300722286e-05, |
|
"loss": 0.0472, |
|
"step": 1354 |
|
}, |
|
{ |
|
"epoch": 0.5424, |
|
"grad_norm": 0.04781627655029297, |
|
"learning_rate": 1.026525892940246e-05, |
|
"loss": 0.0203, |
|
"step": 1356 |
|
}, |
|
{ |
|
"epoch": 0.5432, |
|
"grad_norm": 0.0950162261724472, |
|
"learning_rate": 1.0237342489540221e-05, |
|
"loss": 0.5411, |
|
"step": 1358 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.059019722044467926, |
|
"learning_rate": 1.0209424198833571e-05, |
|
"loss": 0.049, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.5448, |
|
"grad_norm": 0.0552980937063694, |
|
"learning_rate": 1.0181504274994949e-05, |
|
"loss": 0.5069, |
|
"step": 1362 |
|
}, |
|
{ |
|
"epoch": 0.5456, |
|
"grad_norm": 0.037155695259571075, |
|
"learning_rate": 1.0153582935749531e-05, |
|
"loss": 0.0366, |
|
"step": 1364 |
|
}, |
|
{ |
|
"epoch": 0.5464, |
|
"grad_norm": 0.32015174627304077, |
|
"learning_rate": 1.0125660398833528e-05, |
|
"loss": 0.1187, |
|
"step": 1366 |
|
}, |
|
{ |
|
"epoch": 0.5472, |
|
"grad_norm": 0.0431252084672451, |
|
"learning_rate": 1.0097736881992492e-05, |
|
"loss": 0.0293, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 0.548, |
|
"grad_norm": 0.05704091861844063, |
|
"learning_rate": 1.0069812602979617e-05, |
|
"loss": 0.0485, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.5488, |
|
"grad_norm": 0.0650290697813034, |
|
"learning_rate": 1.0041887779554041e-05, |
|
"loss": 0.0455, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 0.5496, |
|
"grad_norm": 2.1595537662506104, |
|
"learning_rate": 1.0013962629479145e-05, |
|
"loss": 0.1577, |
|
"step": 1374 |
|
}, |
|
{ |
|
"epoch": 0.5504, |
|
"grad_norm": 0.05115671455860138, |
|
"learning_rate": 9.986037370520856e-06, |
|
"loss": 0.0318, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 0.5512, |
|
"grad_norm": 0.06018679961562157, |
|
"learning_rate": 9.958112220445964e-06, |
|
"loss": 0.0382, |
|
"step": 1378 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 0.0702604129910469, |
|
"learning_rate": 9.930187397020385e-06, |
|
"loss": 0.5824, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5528, |
|
"grad_norm": 0.038693517446517944, |
|
"learning_rate": 9.902263118007513e-06, |
|
"loss": 0.0264, |
|
"step": 1382 |
|
}, |
|
{ |
|
"epoch": 0.5536, |
|
"grad_norm": 0.018939625471830368, |
|
"learning_rate": 9.874339601166474e-06, |
|
"loss": 0.0496, |
|
"step": 1384 |
|
}, |
|
{ |
|
"epoch": 0.5544, |
|
"grad_norm": 0.043414074927568436, |
|
"learning_rate": 9.84641706425047e-06, |
|
"loss": 0.0181, |
|
"step": 1386 |
|
}, |
|
{ |
|
"epoch": 0.5552, |
|
"grad_norm": 0.5982060432434082, |
|
"learning_rate": 9.818495725005053e-06, |
|
"loss": 0.0623, |
|
"step": 1388 |
|
}, |
|
{ |
|
"epoch": 0.556, |
|
"grad_norm": 0.13789403438568115, |
|
"learning_rate": 9.790575801166432e-06, |
|
"loss": 0.3926, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 0.029673095792531967, |
|
"learning_rate": 9.762657510459784e-06, |
|
"loss": 0.3696, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 0.5576, |
|
"grad_norm": 1.172825813293457, |
|
"learning_rate": 9.73474107059754e-06, |
|
"loss": 0.8355, |
|
"step": 1394 |
|
}, |
|
{ |
|
"epoch": 0.5584, |
|
"grad_norm": 1.2738497257232666, |
|
"learning_rate": 9.706826699277719e-06, |
|
"loss": 0.2447, |
|
"step": 1396 |
|
}, |
|
{ |
|
"epoch": 0.5592, |
|
"grad_norm": 1.911895751953125, |
|
"learning_rate": 9.678914614182185e-06, |
|
"loss": 0.2138, |
|
"step": 1398 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.17634525895118713, |
|
"learning_rate": 9.651005032974994e-06, |
|
"loss": 0.0485, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5608, |
|
"grad_norm": 0.03239009529352188, |
|
"learning_rate": 9.623098173300655e-06, |
|
"loss": 0.0351, |
|
"step": 1402 |
|
}, |
|
{ |
|
"epoch": 0.5616, |
|
"grad_norm": 0.12052161246538162, |
|
"learning_rate": 9.595194252782476e-06, |
|
"loss": 0.0429, |
|
"step": 1404 |
|
}, |
|
{ |
|
"epoch": 0.5624, |
|
"grad_norm": 0.06105445697903633, |
|
"learning_rate": 9.567293489020831e-06, |
|
"loss": 0.0241, |
|
"step": 1406 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 0.10784406960010529, |
|
"learning_rate": 9.539396099591477e-06, |
|
"loss": 0.0291, |
|
"step": 1408 |
|
}, |
|
{ |
|
"epoch": 0.564, |
|
"grad_norm": 0.5862268209457397, |
|
"learning_rate": 9.511502302043867e-06, |
|
"loss": 0.0746, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5648, |
|
"grad_norm": 2.016383409500122, |
|
"learning_rate": 9.483612313899436e-06, |
|
"loss": 0.1858, |
|
"step": 1412 |
|
}, |
|
{ |
|
"epoch": 0.5656, |
|
"grad_norm": 0.05418021231889725, |
|
"learning_rate": 9.45572635264991e-06, |
|
"loss": 0.0362, |
|
"step": 1414 |
|
}, |
|
{ |
|
"epoch": 0.5664, |
|
"grad_norm": 1.7044521570205688, |
|
"learning_rate": 9.42784463575562e-06, |
|
"loss": 0.2979, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 0.5672, |
|
"grad_norm": 0.09087449312210083, |
|
"learning_rate": 9.399967380643795e-06, |
|
"loss": 0.0365, |
|
"step": 1418 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 0.16950923204421997, |
|
"learning_rate": 9.372094804706867e-06, |
|
"loss": 0.0382, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5688, |
|
"grad_norm": 0.1982814371585846, |
|
"learning_rate": 9.344227125300788e-06, |
|
"loss": 0.0395, |
|
"step": 1422 |
|
}, |
|
{ |
|
"epoch": 0.5696, |
|
"grad_norm": 1.310996413230896, |
|
"learning_rate": 9.316364559743315e-06, |
|
"loss": 0.5231, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 0.5704, |
|
"grad_norm": 0.03387328237295151, |
|
"learning_rate": 9.288507325312334e-06, |
|
"loss": 0.6156, |
|
"step": 1426 |
|
}, |
|
{ |
|
"epoch": 0.5712, |
|
"grad_norm": 0.009328456595540047, |
|
"learning_rate": 9.260655639244152e-06, |
|
"loss": 0.0184, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 0.572, |
|
"grad_norm": 0.07381106168031693, |
|
"learning_rate": 9.232809718731815e-06, |
|
"loss": 0.0261, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.5728, |
|
"grad_norm": 0.050106290727853775, |
|
"learning_rate": 9.204969780923404e-06, |
|
"loss": 0.0132, |
|
"step": 1432 |
|
}, |
|
{ |
|
"epoch": 0.5736, |
|
"grad_norm": 0.010202400386333466, |
|
"learning_rate": 9.177136042920344e-06, |
|
"loss": 0.0187, |
|
"step": 1434 |
|
}, |
|
{ |
|
"epoch": 0.5744, |
|
"grad_norm": 0.15420140326023102, |
|
"learning_rate": 9.14930872177572e-06, |
|
"loss": 0.062, |
|
"step": 1436 |
|
}, |
|
{ |
|
"epoch": 0.5752, |
|
"grad_norm": 0.028073610737919807, |
|
"learning_rate": 9.121488034492569e-06, |
|
"loss": 0.0171, |
|
"step": 1438 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.018873965367674828, |
|
"learning_rate": 9.093674198022201e-06, |
|
"loss": 0.019, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.5768, |
|
"grad_norm": 0.8036087155342102, |
|
"learning_rate": 9.065867429262497e-06, |
|
"loss": 1.608, |
|
"step": 1442 |
|
}, |
|
{ |
|
"epoch": 0.5776, |
|
"grad_norm": 0.19082395732402802, |
|
"learning_rate": 9.038067945056229e-06, |
|
"loss": 0.0394, |
|
"step": 1444 |
|
}, |
|
{ |
|
"epoch": 0.5784, |
|
"grad_norm": 0.07714349031448364, |
|
"learning_rate": 9.01027596218935e-06, |
|
"loss": 0.0772, |
|
"step": 1446 |
|
}, |
|
{ |
|
"epoch": 0.5792, |
|
"grad_norm": 0.06311635673046112, |
|
"learning_rate": 8.982491697389339e-06, |
|
"loss": 0.032, |
|
"step": 1448 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.13729214668273926, |
|
"learning_rate": 8.954715367323468e-06, |
|
"loss": 0.0404, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.5808, |
|
"grad_norm": 0.6665918827056885, |
|
"learning_rate": 8.926947188597133e-06, |
|
"loss": 0.2674, |
|
"step": 1452 |
|
}, |
|
{ |
|
"epoch": 0.5816, |
|
"grad_norm": 0.11501887440681458, |
|
"learning_rate": 8.89918737775218e-06, |
|
"loss": 0.066, |
|
"step": 1454 |
|
}, |
|
{ |
|
"epoch": 0.5824, |
|
"grad_norm": 0.04181879013776779, |
|
"learning_rate": 8.871436151265183e-06, |
|
"loss": 0.0176, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 0.5832, |
|
"grad_norm": 0.07718206197023392, |
|
"learning_rate": 8.843693725545787e-06, |
|
"loss": 0.3476, |
"step": 1458
},
{
"epoch": 0.584,
"grad_norm": 0.17924383282661438,
"learning_rate": 8.815960316934991e-06,
"loss": 0.0852,
"step": 1460
},
{
"epoch": 0.5848,
"grad_norm": 0.016910772770643234,
"learning_rate": 8.788236141703498e-06,
"loss": 0.0112,
"step": 1462
},
{
"epoch": 0.5856,
"grad_norm": 0.20013374090194702,
"learning_rate": 8.760521416049983e-06,
"loss": 0.0349,
"step": 1464
},
{
"epoch": 0.5864,
"grad_norm": 0.0546087808907032,
"learning_rate": 8.732816356099455e-06,
"loss": 0.0229,
"step": 1466
},
{
"epoch": 0.5872,
"grad_norm": 0.1050657406449318,
"learning_rate": 8.705121177901532e-06,
"loss": 0.0342,
"step": 1468
},
{
"epoch": 0.588,
"grad_norm": 0.00835899356752634,
"learning_rate": 8.677436097428775e-06,
"loss": 0.2512,
"step": 1470
},
{
"epoch": 0.5888,
"grad_norm": 0.12563996016979218,
"learning_rate": 8.649761330575009e-06,
"loss": 0.0289,
"step": 1472
},
{
"epoch": 0.5896,
"grad_norm": 2.070195436477661,
"learning_rate": 8.62209709315362e-06,
"loss": 0.5447,
"step": 1474
},
{
"epoch": 0.5904,
"grad_norm": 0.20926696062088013,
"learning_rate": 8.594443600895892e-06,
"loss": 0.0291,
"step": 1476
},
{
"epoch": 0.5912,
"grad_norm": 0.03596784546971321,
"learning_rate": 8.566801069449307e-06,
"loss": 0.0246,
"step": 1478
},
{
"epoch": 0.592,
"grad_norm": 0.03978172317147255,
"learning_rate": 8.539169714375885e-06,
"loss": 0.1662,
"step": 1480
},
{
"epoch": 0.5928,
"grad_norm": 0.030970891937613487,
"learning_rate": 8.511549751150478e-06,
"loss": 0.1755,
"step": 1482
},
{
"epoch": 0.5936,
"grad_norm": 0.029246820136904716,
"learning_rate": 8.483941395159114e-06,
"loss": 0.0162,
"step": 1484
},
{
"epoch": 0.5944,
"grad_norm": 1.2437587976455688,
"learning_rate": 8.45634486169729e-06,
"loss": 0.2451,
"step": 1486
},
{
"epoch": 0.5952,
"grad_norm": 0.01518918015062809,
"learning_rate": 8.428760365968327e-06,
"loss": 0.0139,
"step": 1488
},
{
"epoch": 0.596,
"grad_norm": 0.058590278029441833,
"learning_rate": 8.401188123081653e-06,
"loss": 0.1158,
"step": 1490
},
{
"epoch": 0.5968,
"grad_norm": 0.09938734769821167,
"learning_rate": 8.373628348051165e-06,
"loss": 0.0162,
"step": 1492
},
{
"epoch": 0.5976,
"grad_norm": 0.221556156873703,
"learning_rate": 8.346081255793524e-06,
"loss": 0.0708,
"step": 1494
},
{
"epoch": 0.5984,
"grad_norm": 0.9571221470832825,
"learning_rate": 8.318547061126485e-06,
"loss": 0.3115,
"step": 1496
},
{
"epoch": 0.5992,
"grad_norm": 0.38105764985084534,
"learning_rate": 8.291025978767236e-06,
"loss": 0.2224,
"step": 1498
},
{
"epoch": 0.6,
"grad_norm": 1.5868642330169678,
"learning_rate": 8.263518223330698e-06,
"loss": 0.2413,
"step": 1500
},
{
"epoch": 0.6008,
"grad_norm": 0.23962773382663727,
"learning_rate": 8.236024009327879e-06,
"loss": 0.0339,
"step": 1502
},
{
"epoch": 0.6016,
"grad_norm": 0.5392745137214661,
"learning_rate": 8.208543551164178e-06,
"loss": 0.0817,
"step": 1504
},
{
"epoch": 0.6024,
"grad_norm": 0.06242334842681885,
"learning_rate": 8.181077063137733e-06,
"loss": 0.5676,
"step": 1506
},
{
"epoch": 0.6032,
"grad_norm": 0.05859392136335373,
"learning_rate": 8.153624759437733e-06,
"loss": 0.0143,
"step": 1508
},
{
"epoch": 0.604,
"grad_norm": 0.20578180253505707,
"learning_rate": 8.126186854142752e-06,
"loss": 0.0336,
"step": 1510
},
{
"epoch": 0.6048,
"grad_norm": 2.3235504627227783,
"learning_rate": 8.098763561219101e-06,
"loss": 0.5843,
"step": 1512
},
{
"epoch": 0.6056,
"grad_norm": 0.014721088111400604,
"learning_rate": 8.07135509451911e-06,
"loss": 0.0786,
"step": 1514
},
{
"epoch": 0.6064,
"grad_norm": 0.029024440795183182,
"learning_rate": 8.04396166777952e-06,
"loss": 0.0238,
"step": 1516
},
{
"epoch": 0.6072,
"grad_norm": 0.007215852849185467,
"learning_rate": 8.016583494619769e-06,
"loss": 0.0082,
"step": 1518
},
{
"epoch": 0.608,
"grad_norm": 0.8240259885787964,
"learning_rate": 7.989220788540356e-06,
"loss": 0.1647,
"step": 1520
},
{
"epoch": 0.6088,
"grad_norm": 0.3796282708644867,
"learning_rate": 7.961873762921153e-06,
"loss": 0.0807,
"step": 1522
},
{
"epoch": 0.6096,
"grad_norm": 0.03532170131802559,
"learning_rate": 7.934542631019767e-06,
"loss": 0.5003,
"step": 1524
},
{
"epoch": 0.6104,
"grad_norm": 0.018144795671105385,
"learning_rate": 7.907227605969849e-06,
"loss": 0.0202,
"step": 1526
},
{
"epoch": 0.6112,
"grad_norm": 0.06521397829055786,
"learning_rate": 7.879928900779457e-06,
"loss": 0.0214,
"step": 1528
},
{
"epoch": 0.612,
"grad_norm": 0.010673885233700275,
"learning_rate": 7.852646728329368e-06,
"loss": 0.2308,
"step": 1530
},
{
"epoch": 0.6128,
"grad_norm": 0.07566576451063156,
"learning_rate": 7.825381301371452e-06,
"loss": 0.0437,
"step": 1532
},
{
"epoch": 0.6136,
"grad_norm": 0.01657886430621147,
"learning_rate": 7.798132832526986e-06,
"loss": 0.0092,
"step": 1534
},
{
"epoch": 0.6144,
"grad_norm": 0.04421677067875862,
"learning_rate": 7.770901534284996e-06,
"loss": 0.2318,
"step": 1536
},
{
"epoch": 0.6152,
"grad_norm": 0.3701765239238739,
"learning_rate": 7.743687619000625e-06,
"loss": 0.0437,
"step": 1538
},
{
"epoch": 0.616,
"grad_norm": 0.006094436626881361,
"learning_rate": 7.716491298893443e-06,
"loss": 0.0216,
"step": 1540
},
{
"epoch": 0.6168,
"grad_norm": 0.27293258905410767,
"learning_rate": 7.689312786045823e-06,
"loss": 0.0417,
"step": 1542
},
{
"epoch": 0.6176,
"grad_norm": 0.2565436363220215,
"learning_rate": 7.662152292401265e-06,
"loss": 0.0669,
"step": 1544
},
{
"epoch": 0.6184,
"grad_norm": 0.9066870212554932,
"learning_rate": 7.635010029762755e-06,
"loss": 0.1723,
"step": 1546
},
{
"epoch": 0.6192,
"grad_norm": 0.040010981261730194,
"learning_rate": 7.6078862097911075e-06,
"loss": 0.0133,
"step": 1548
},
{
"epoch": 0.62,
"grad_norm": 0.7351746559143066,
"learning_rate": 7.580781044003324e-06,
"loss": 0.0992,
"step": 1550
},
{
"epoch": 0.6208,
"grad_norm": 0.6215207576751709,
"learning_rate": 7.553694743770928e-06,
"loss": 0.6671,
"step": 1552
},
{
"epoch": 0.6216,
"grad_norm": 0.0056650955229997635,
"learning_rate": 7.526627520318329e-06,
"loss": 0.0339,
"step": 1554
},
{
"epoch": 0.6224,
"grad_norm": 0.008909706026315689,
"learning_rate": 7.49957958472118e-06,
"loss": 0.0059,
"step": 1556
},
{
"epoch": 0.6232,
"grad_norm": 1.9135671854019165,
"learning_rate": 7.472551147904708e-06,
"loss": 0.9156,
"step": 1558
},
{
"epoch": 0.624,
"grad_norm": 0.7585664391517639,
"learning_rate": 7.445542420642097e-06,
"loss": 0.103,
"step": 1560
},
{
"epoch": 0.6248,
"grad_norm": 0.07106231898069382,
"learning_rate": 7.418553613552824e-06,
"loss": 0.0217,
"step": 1562
},
{
"epoch": 0.6256,
"grad_norm": 1.5444700717926025,
"learning_rate": 7.391584937101034e-06,
"loss": 0.1937,
"step": 1564
},
{
"epoch": 0.6264,
"grad_norm": 0.006505718920379877,
"learning_rate": 7.364636601593875e-06,
"loss": 0.014,
"step": 1566
},
{
"epoch": 0.6272,
"grad_norm": 0.3871181309223175,
"learning_rate": 7.33770881717989e-06,
"loss": 0.048,
"step": 1568
},
{
"epoch": 0.628,
"grad_norm": 0.03062368370592594,
"learning_rate": 7.310801793847344e-06,
"loss": 0.0133,
"step": 1570
},
{
"epoch": 0.6288,
"grad_norm": 0.2106819897890091,
"learning_rate": 7.283915741422611e-06,
"loss": 0.2936,
"step": 1572
},
{
"epoch": 0.6296,
"grad_norm": 0.9387131929397583,
"learning_rate": 7.257050869568536e-06,
"loss": 0.3943,
"step": 1574
},
{
"epoch": 0.6304,
"grad_norm": 0.18858036398887634,
"learning_rate": 7.2302073877827775e-06,
"loss": 0.2703,
"step": 1576
},
{
"epoch": 0.6312,
"grad_norm": 0.34716084599494934,
"learning_rate": 7.203385505396203e-06,
"loss": 0.0523,
"step": 1578
},
{
"epoch": 0.632,
"grad_norm": 1.6694183349609375,
"learning_rate": 7.176585431571235e-06,
"loss": 0.4431,
"step": 1580
},
{
"epoch": 0.6328,
"grad_norm": 0.07224322855472565,
"learning_rate": 7.149807375300239e-06,
"loss": 0.1626,
"step": 1582
},
{
"epoch": 0.6336,
"grad_norm": 0.03537129983305931,
"learning_rate": 7.123051545403874e-06,
"loss": 0.0113,
"step": 1584
},
{
"epoch": 0.6344,
"grad_norm": 0.0862552747130394,
"learning_rate": 7.096318150529476e-06,
"loss": 0.1374,
"step": 1586
},
{
"epoch": 0.6352,
"grad_norm": 0.3428609073162079,
"learning_rate": 7.069607399149427e-06,
"loss": 0.0467,
"step": 1588
},
{
"epoch": 0.636,
"grad_norm": 0.8518670201301575,
"learning_rate": 7.042919499559538e-06,
"loss": 0.179,
"step": 1590
},
{
"epoch": 0.6368,
"grad_norm": 0.017964590340852737,
"learning_rate": 7.016254659877398e-06,
"loss": 0.0767,
"step": 1592
},
{
"epoch": 0.6376,
"grad_norm": 0.042712751775979996,
"learning_rate": 6.9896130880407965e-06,
"loss": 0.0153,
"step": 1594
},
{
"epoch": 0.6384,
"grad_norm": 0.3006531894207001,
"learning_rate": 6.962994991806059e-06,
"loss": 0.0977,
"step": 1596
},
{
"epoch": 0.6392,
"grad_norm": 0.010738243348896503,
"learning_rate": 6.9364005787464406e-06,
"loss": 0.1063,
"step": 1598
},
{
"epoch": 0.64,
"grad_norm": 0.017853038385510445,
"learning_rate": 6.909830056250527e-06,
"loss": 0.1487,
"step": 1600
},
|
{ |
|
"epoch": 0.6408, |
|
"grad_norm": 0.38184410333633423, |
|
"learning_rate": 6.883283631520582e-06, |
|
"loss": 0.0403, |
|
"step": 1602 |
|
}, |
|
{ |
|
"epoch": 0.6416, |
|
"grad_norm": 0.026657408103346825, |
|
"learning_rate": 6.856761511570963e-06, |
|
"loss": 0.0139, |
|
"step": 1604 |
|
}, |
|
{ |
|
"epoch": 0.6424, |
|
"grad_norm": 0.04037747532129288, |
|
"learning_rate": 6.830263903226483e-06, |
|
"loss": 0.0218, |
|
"step": 1606 |
|
}, |
|
{ |
|
"epoch": 0.6432, |
|
"grad_norm": 0.07742556184530258, |
|
"learning_rate": 6.803791013120822e-06, |
|
"loss": 0.0117, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 0.644, |
|
"grad_norm": 1.956659197807312, |
|
"learning_rate": 6.777343047694891e-06, |
|
"loss": 0.3377, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6448, |
|
"grad_norm": 0.04879617318511009, |
|
"learning_rate": 6.750920213195238e-06, |
|
"loss": 0.1139, |
|
"step": 1612 |
|
}, |
|
{ |
|
"epoch": 0.6456, |
|
"grad_norm": 3.4786341190338135, |
|
"learning_rate": 6.7245227156724324e-06, |
|
"loss": 0.0851, |
|
"step": 1614 |
|
}, |
|
{ |
|
"epoch": 0.6464, |
|
"grad_norm": 0.018991775810718536, |
|
"learning_rate": 6.698150760979463e-06, |
|
"loss": 0.0136, |
|
"step": 1616 |
|
}, |
|
{ |
|
"epoch": 0.6472, |
|
"grad_norm": 1.7258912324905396, |
|
"learning_rate": 6.671804554770135e-06, |
|
"loss": 0.3864, |
|
"step": 1618 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 0.27930495142936707, |
|
"learning_rate": 6.645484302497452e-06, |
|
"loss": 0.0221, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.6488, |
|
"grad_norm": 0.023989146575331688, |
|
"learning_rate": 6.6191902094120295e-06, |
|
"loss": 0.0198, |
|
"step": 1622 |
|
}, |
|
{ |
|
"epoch": 0.6496, |
|
"grad_norm": 0.026416273787617683, |
|
"learning_rate": 6.5929224805604845e-06, |
|
"loss": 0.0195, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 0.6504, |
|
"grad_norm": 0.021918591111898422, |
|
"learning_rate": 6.566681320783849e-06, |
|
"loss": 0.0088, |
|
"step": 1626 |
|
}, |
|
{ |
|
"epoch": 0.6512, |
|
"grad_norm": 0.005978676024824381, |
|
"learning_rate": 6.540466934715953e-06, |
|
"loss": 0.0158, |
|
"step": 1628 |
|
}, |
|
{ |
|
"epoch": 0.652, |
|
"grad_norm": 0.0690799206495285, |
|
"learning_rate": 6.5142795267818505e-06, |
|
"loss": 0.0426, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6528, |
|
"grad_norm": 0.06811723858118057, |
|
"learning_rate": 6.488119301196201e-06, |
|
"loss": 0.0474, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 0.6536, |
|
"grad_norm": 0.3594679534435272, |
|
"learning_rate": 6.461986461961706e-06, |
|
"loss": 0.0359, |
|
"step": 1634 |
|
}, |
|
{ |
|
"epoch": 0.6544, |
|
"grad_norm": 0.00533846952021122, |
|
"learning_rate": 6.435881212867494e-06, |
|
"loss": 0.0055, |
|
"step": 1636 |
|
}, |
|
{ |
|
"epoch": 0.6552, |
|
"grad_norm": 0.4549589455127716, |
|
"learning_rate": 6.409803757487539e-06, |
|
"loss": 0.0631, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.02013424225151539, |
|
"learning_rate": 6.383754299179079e-06, |
|
"loss": 0.0046, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6568, |
|
"grad_norm": 2.2123377323150635, |
|
"learning_rate": 6.357733041081018e-06, |
|
"loss": 0.482, |
|
"step": 1642 |
|
}, |
|
{ |
|
"epoch": 0.6576, |
|
"grad_norm": 0.0015303940745070577, |
|
"learning_rate": 6.33174018611236e-06, |
|
"loss": 0.0028, |
|
"step": 1644 |
|
}, |
|
{ |
|
"epoch": 0.6584, |
|
"grad_norm": 0.2125168740749359, |
|
"learning_rate": 6.305775936970606e-06, |
|
"loss": 0.019, |
|
"step": 1646 |
|
}, |
|
{ |
|
"epoch": 0.6592, |
|
"grad_norm": 1.2157135009765625, |
|
"learning_rate": 6.27984049613019e-06, |
|
"loss": 0.4542, |
|
"step": 1648 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.030942708253860474, |
|
"learning_rate": 6.25393406584088e-06, |
|
"loss": 0.0049, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.6608, |
|
"grad_norm": 0.0280720554292202, |
|
"learning_rate": 6.228056848126236e-06, |
|
"loss": 0.0107, |
|
"step": 1652 |
|
}, |
|
{ |
|
"epoch": 0.6616, |
|
"grad_norm": 0.6233726739883423, |
|
"learning_rate": 6.202209044781991e-06, |
|
"loss": 0.0566, |
|
"step": 1654 |
|
}, |
|
{ |
|
"epoch": 0.6624, |
|
"grad_norm": 0.014730525203049183, |
|
"learning_rate": 6.176390857374508e-06, |
|
"loss": 0.0088, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 0.6632, |
|
"grad_norm": 1.784373164176941, |
|
"learning_rate": 6.150602487239207e-06, |
|
"loss": 0.7035, |
|
"step": 1658 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 0.12886707484722137, |
|
"learning_rate": 6.124844135478971e-06, |
|
"loss": 0.0122, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6648, |
|
"grad_norm": 0.04210919514298439, |
|
"learning_rate": 6.099116002962604e-06, |
|
"loss": 0.1507, |
|
"step": 1662 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 0.05602734535932541, |
|
"learning_rate": 6.073418290323251e-06, |
|
"loss": 0.0145, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 0.6664, |
|
"grad_norm": 0.04478934034705162, |
|
"learning_rate": 6.047751197956838e-06, |
|
"loss": 0.0079, |
|
"step": 1666 |
|
}, |
|
{ |
|
"epoch": 0.6672, |
|
"grad_norm": 0.010290348902344704, |
|
"learning_rate": 6.022114926020504e-06, |
|
"loss": 0.0495, |
|
"step": 1668 |
|
}, |
|
{ |
|
"epoch": 0.668, |
|
"grad_norm": 0.01751306839287281, |
|
"learning_rate": 5.996509674431053e-06, |
|
"loss": 0.0056, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6688, |
|
"grad_norm": 0.1504068821668625, |
|
"learning_rate": 5.970935642863375e-06, |
|
"loss": 0.0234, |
|
"step": 1672 |
|
}, |
|
{ |
|
"epoch": 0.6696, |
|
"grad_norm": 0.0935196727514267, |
|
"learning_rate": 5.94539303074891e-06, |
|
"loss": 0.0214, |
|
"step": 1674 |
|
}, |
|
{ |
|
"epoch": 0.6704, |
|
"grad_norm": 1.2794703245162964, |
|
"learning_rate": 5.9198820372740726e-06, |
|
"loss": 0.8835, |
|
"step": 1676 |
|
}, |
|
{ |
|
"epoch": 0.6712, |
|
"grad_norm": 0.028086047619581223, |
|
"learning_rate": 5.894402861378721e-06, |
|
"loss": 0.0076, |
|
"step": 1678 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.2483760118484497, |
|
"learning_rate": 5.868955701754584e-06, |
|
"loss": 0.5054, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.6728, |
|
"grad_norm": 0.016343148425221443, |
|
"learning_rate": 5.843540756843722e-06, |
|
"loss": 0.1617, |
|
"step": 1682 |
|
}, |
|
{ |
|
"epoch": 0.6736, |
|
"grad_norm": 0.05262403190135956, |
|
"learning_rate": 5.818158224836987e-06, |
|
"loss": 0.0523, |
|
"step": 1684 |
|
}, |
|
{ |
|
"epoch": 0.6744, |
|
"grad_norm": 0.11089430004358292, |
|
"learning_rate": 5.792808303672454e-06, |
|
"loss": 0.4788, |
|
"step": 1686 |
|
}, |
|
{ |
|
"epoch": 0.6752, |
|
"grad_norm": 0.0768204778432846, |
|
"learning_rate": 5.7674911910339094e-06, |
|
"loss": 0.0263, |
|
"step": 1688 |
|
}, |
|
{ |
|
"epoch": 0.676, |
|
"grad_norm": 0.7316690683364868, |
|
"learning_rate": 5.742207084349274e-06, |
|
"loss": 0.064, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6768, |
|
"grad_norm": 0.12514914572238922, |
|
"learning_rate": 5.716956180789098e-06, |
|
"loss": 0.0287, |
|
"step": 1692 |
|
}, |
|
{ |
|
"epoch": 0.6776, |
|
"grad_norm": 0.12199829518795013, |
|
"learning_rate": 5.691738677265e-06, |
|
"loss": 0.1082, |
|
"step": 1694 |
|
}, |
|
{ |
|
"epoch": 0.6784, |
|
"grad_norm": 0.15735796093940735, |
|
"learning_rate": 5.666554770428129e-06, |
|
"loss": 0.0281, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 0.6792, |
|
"grad_norm": 2.5249431133270264, |
|
"learning_rate": 5.641404656667661e-06, |
|
"loss": 0.8569, |
|
"step": 1698 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.12266776710748672, |
|
"learning_rate": 5.616288532109225e-06, |
|
"loss": 0.0213, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6808, |
|
"grad_norm": 0.07756412774324417, |
|
"learning_rate": 5.591206592613416e-06, |
|
"loss": 0.02, |
|
"step": 1702 |
|
}, |
|
{ |
|
"epoch": 0.6816, |
|
"grad_norm": 0.025011537596583366, |
|
"learning_rate": 5.5661590337742255e-06, |
|
"loss": 0.0081, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 0.6824, |
|
"grad_norm": 0.24261169135570526, |
|
"learning_rate": 5.5411460509175605e-06, |
|
"loss": 0.0702, |
|
"step": 1706 |
|
}, |
|
{ |
|
"epoch": 0.6832, |
|
"grad_norm": 0.04961364343762398, |
|
"learning_rate": 5.516167839099679e-06, |
|
"loss": 0.0143, |
|
"step": 1708 |
|
}, |
|
{ |
|
"epoch": 0.684, |
|
"grad_norm": 0.0382879376411438, |
|
"learning_rate": 5.491224593105695e-06, |
|
"loss": 0.0174, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.6848, |
|
"grad_norm": 0.024480927735567093, |
|
"learning_rate": 5.466316507448049e-06, |
|
"loss": 0.021, |
|
"step": 1712 |
|
}, |
|
{ |
|
"epoch": 0.6856, |
|
"grad_norm": 0.355844646692276, |
|
"learning_rate": 5.441443776365003e-06, |
|
"loss": 0.0201, |
|
"step": 1714 |
|
}, |
|
{ |
|
"epoch": 0.6864, |
|
"grad_norm": 0.10836853832006454, |
|
"learning_rate": 5.416606593819102e-06, |
|
"loss": 0.018, |
|
"step": 1716 |
|
}, |
|
{ |
|
"epoch": 0.6872, |
|
"grad_norm": 0.1412193775177002, |
|
"learning_rate": 5.391805153495693e-06, |
|
"loss": 0.0284, |
|
"step": 1718 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.15320326387882233, |
|
"learning_rate": 5.367039648801386e-06, |
|
"loss": 0.0345, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.6888, |
|
"grad_norm": 0.07995325326919556, |
|
"learning_rate": 5.342310272862558e-06, |
|
"loss": 0.0102, |
|
"step": 1722 |
|
}, |
|
{ |
|
"epoch": 0.6896, |
|
"grad_norm": 0.04907960444688797, |
|
"learning_rate": 5.317617218523856e-06, |
|
"loss": 0.0391, |
|
"step": 1724 |
|
}, |
|
{ |
|
"epoch": 0.6904, |
|
"grad_norm": 1.2691978216171265, |
|
"learning_rate": 5.292960678346674e-06, |
|
"loss": 0.402, |
|
"step": 1726 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 0.008955995552241802, |
|
"learning_rate": 5.26834084460767e-06, |
|
"loss": 0.0053, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 0.692, |
|
"grad_norm": 0.25324296951293945, |
|
"learning_rate": 5.243757909297247e-06, |
|
"loss": 0.035, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.6928, |
|
"grad_norm": 0.026979120448231697, |
|
"learning_rate": 5.219212064118079e-06, |
|
"loss": 0.0089, |
|
"step": 1732 |
|
}, |
|
{ |
|
"epoch": 0.6936, |
|
"grad_norm": 0.015840429812669754, |
|
"learning_rate": 5.194703500483593e-06, |
|
"loss": 0.0049, |
|
"step": 1734 |
|
}, |
|
{ |
|
"epoch": 0.6944, |
|
"grad_norm": 1.4288071393966675, |
|
"learning_rate": 5.1702324095164955e-06, |
|
"loss": 0.899, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 0.6952, |
|
"grad_norm": 1.0387349128723145, |
|
"learning_rate": 5.145798982047261e-06, |
|
"loss": 0.7296, |
|
"step": 1738 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 0.4432249963283539, |
|
"learning_rate": 5.121403408612672e-06, |
|
"loss": 0.0525, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.6968, |
|
"grad_norm": 0.021967697888612747, |
|
"learning_rate": 5.0970458794543135e-06, |
|
"loss": 0.0092, |
|
"step": 1742 |
|
}, |
|
{ |
|
"epoch": 0.6976, |
|
"grad_norm": 1.2726808786392212, |
|
"learning_rate": 5.072726584517086e-06, |
|
"loss": 0.1722, |
|
"step": 1744 |
|
}, |
|
{ |
|
"epoch": 0.6984, |
|
"grad_norm": 0.01904178597033024, |
|
"learning_rate": 5.048445713447738e-06, |
|
"loss": 0.0209, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 0.6992, |
|
"grad_norm": 0.5619332194328308, |
|
"learning_rate": 5.024203455593375e-06, |
|
"loss": 0.769, |
|
"step": 1748 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.2965996265411377, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.1219, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7008, |
|
"grad_norm": 0.08266191929578781, |
|
"learning_rate": 4.97583553541102e-06, |
|
"loss": 0.0208, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 0.7016, |
|
"grad_norm": 0.016945960000157356, |
|
"learning_rate": 4.951710250265785e-06, |
|
"loss": 0.0105, |
|
"step": 1754 |
|
}, |
|
{ |
|
"epoch": 0.7024, |
|
"grad_norm": 0.06696458905935287, |
|
"learning_rate": 4.927624332698109e-06, |
|
"loss": 0.0239, |
|
"step": 1756 |
|
}, |
|
{ |
|
"epoch": 0.7032, |
|
"grad_norm": 0.024396728724241257, |
|
"learning_rate": 4.903577970534823e-06, |
|
"loss": 0.0168, |
|
"step": 1758 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.3054494857788086, |
|
"learning_rate": 4.879571351294287e-06, |
|
"loss": 0.0534, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.7048, |
|
"grad_norm": 0.038238685578107834, |
|
"learning_rate": 4.855604662184935e-06, |
|
"loss": 0.3647, |
|
"step": 1762 |
|
}, |
|
{ |
|
"epoch": 0.7056, |
|
"grad_norm": 0.23634418845176697, |
|
"learning_rate": 4.831678090103832e-06, |
|
"loss": 0.0938, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 0.7064, |
|
"grad_norm": 0.15138259530067444, |
|
"learning_rate": 4.807791821635186e-06, |
|
"loss": 0.0674, |
|
"step": 1766 |
|
}, |
|
{ |
|
"epoch": 0.7072, |
|
"grad_norm": 0.24614623188972473, |
|
"learning_rate": 4.783946043048922e-06, |
|
"loss": 0.0925, |
|
"step": 1768 |
|
}, |
|
{ |
|
"epoch": 0.708, |
|
"grad_norm": 2.123792886734009, |
|
"learning_rate": 4.76014094029921e-06, |
|
"loss": 0.9384, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.7088, |
|
"grad_norm": 0.10990063846111298, |
|
"learning_rate": 4.736376699023023e-06, |
|
"loss": 0.0253, |
|
"step": 1772 |
|
}, |
|
{ |
|
"epoch": 0.7096, |
|
"grad_norm": 0.3349180221557617, |
|
"learning_rate": 4.712653504538684e-06, |
|
"loss": 0.195, |
|
"step": 1774 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 0.20643967390060425, |
|
"learning_rate": 4.688971541844436e-06, |
|
"loss": 0.0572, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 0.7112, |
|
"grad_norm": 0.09784634411334991, |
|
"learning_rate": 4.6653309956169745e-06, |
|
"loss": 0.0524, |
|
"step": 1778 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 0.34553948044776917, |
|
"learning_rate": 4.641732050210032e-06, |
|
"loss": 0.0434, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.7128, |
|
"grad_norm": 1.177024483680725, |
|
"learning_rate": 4.618174889652928e-06, |
|
"loss": 0.2077, |
|
"step": 1782 |
|
}, |
|
{ |
|
"epoch": 0.7136, |
|
"grad_norm": 0.10425437986850739, |
|
"learning_rate": 4.59465969764913e-06, |
|
"loss": 0.0273, |
|
"step": 1784 |
|
}, |
|
{ |
|
"epoch": 0.7144, |
|
"grad_norm": 0.09616199135780334, |
|
"learning_rate": 4.571186657574828e-06, |
|
"loss": 0.0605, |
|
"step": 1786 |
|
}, |
|
{ |
|
"epoch": 0.7152, |
|
"grad_norm": 0.3832583427429199, |
|
"learning_rate": 4.5477559524775e-06, |
|
"loss": 0.0672, |
|
"step": 1788 |
|
}, |
|
{ |
|
"epoch": 0.716, |
|
"grad_norm": 0.02167525887489319, |
|
"learning_rate": 4.524367765074499e-06, |
|
"loss": 0.0105, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 0.04896343871951103, |
|
"learning_rate": 4.501022277751602e-06, |
|
"loss": 0.0176, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 0.7176, |
|
"grad_norm": 0.03762371465563774, |
|
"learning_rate": 4.477719672561615e-06, |
|
"loss": 0.0242, |
|
"step": 1794 |
|
}, |
|
{ |
|
"epoch": 0.7184, |
|
"grad_norm": 3.2954201698303223, |
|
"learning_rate": 4.4544601312229295e-06, |
|
"loss": 0.3976, |
|
"step": 1796 |
|
}, |
|
{ |
|
"epoch": 0.7192, |
|
"grad_norm": 2.2384305000305176, |
|
"learning_rate": 4.4312438351181246e-06, |
|
"loss": 0.2907, |
|
"step": 1798 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.017522338777780533, |
|
"learning_rate": 4.408070965292534e-06, |
|
"loss": 0.4971, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7208, |
|
"grad_norm": 0.04784021154046059, |
|
"learning_rate": 4.384941702452856e-06, |
|
"loss": 0.6106, |
|
"step": 1802 |
|
}, |
|
{ |
|
"epoch": 0.7216, |
|
"grad_norm": 1.3355712890625, |
|
"learning_rate": 4.361856226965733e-06, |
|
"loss": 0.1558, |
|
"step": 1804 |
|
}, |
|
{ |
|
"epoch": 0.7224, |
|
"grad_norm": 0.026937812566757202, |
|
"learning_rate": 4.338814718856333e-06, |
|
"loss": 0.0143, |
|
"step": 1806 |
|
}, |
|
{ |
|
"epoch": 0.7232, |
|
"grad_norm": 0.14190848171710968, |
|
"learning_rate": 4.315817357806974e-06, |
|
"loss": 0.4526, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 0.724, |
|
"grad_norm": 0.10695420950651169, |
|
"learning_rate": 4.292864323155684e-06, |
|
"loss": 0.0318, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.7248, |
|
"grad_norm": 0.988207221031189, |
|
"learning_rate": 4.26995579389485e-06, |
|
"loss": 0.1699, |
|
"step": 1812 |
|
}, |
|
{ |
|
"epoch": 0.7256, |
|
"grad_norm": 0.03641045466065407, |
|
"learning_rate": 4.247091948669775e-06, |
|
"loss": 0.0399, |
|
"step": 1814 |
|
}, |
|
{ |
|
"epoch": 0.7264, |
|
"grad_norm": 0.032800428569316864, |
|
"learning_rate": 4.224272965777326e-06, |
|
"loss": 0.015, |
|
"step": 1816 |
|
}, |
|
{ |
|
"epoch": 0.7272, |
|
"grad_norm": 0.0268462635576725, |
|
"learning_rate": 4.201499023164508e-06, |
|
"loss": 0.0272, |
|
"step": 1818 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 0.05160361900925636, |
|
"learning_rate": 4.178770298427107e-06, |
|
"loss": 0.068, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.7288, |
|
"grad_norm": 1.727393627166748, |
|
"learning_rate": 4.15608696880828e-06, |
|
"loss": 0.3799, |
|
"step": 1822 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 0.08573580533266068, |
|
"learning_rate": 4.133449211197188e-06, |
|
"loss": 0.0329, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 0.7304, |
|
"grad_norm": 0.025401653721928596, |
|
"learning_rate": 4.110857202127615e-06, |
|
"loss": 0.0221, |
|
"step": 1826 |
|
}, |
|
{ |
|
"epoch": 0.7312, |
|
"grad_norm": 0.37113156914711, |
|
"learning_rate": 4.08831111777658e-06, |
|
"loss": 0.0818, |
|
"step": 1828 |
|
}, |
|
{ |
|
"epoch": 0.732, |
|
"grad_norm": 0.06260918080806732, |
|
"learning_rate": 4.065811133962987e-06, |
|
"loss": 0.0284, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7328, |
|
"grad_norm": 0.09040958434343338, |
|
"learning_rate": 4.04335742614622e-06, |
|
"loss": 0.0748, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 0.7336, |
|
"grad_norm": 0.14594818651676178, |
|
"learning_rate": 4.020950169424815e-06, |
|
"loss": 0.0702, |
|
"step": 1834 |
|
}, |
|
{ |
|
"epoch": 0.7344, |
|
"grad_norm": 0.03950029984116554, |
|
"learning_rate": 3.998589538535046e-06, |
|
"loss": 0.0142, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 0.7352, |
|
"grad_norm": 0.07695923000574112, |
|
"learning_rate": 3.976275707849616e-06, |
|
"loss": 0.0313, |
|
"step": 1838 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.02458553947508335, |
|
"learning_rate": 3.954008851376252e-06, |
|
"loss": 0.0769, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7368, |
|
"grad_norm": 0.6668182611465454, |
|
"learning_rate": 3.931789142756377e-06, |
|
"loss": 0.1822, |
|
"step": 1842 |
|
}, |
|
{ |
|
"epoch": 0.7376, |
|
"grad_norm": 1.2924126386642456, |
|
"learning_rate": 3.9096167552637454e-06, |
|
"loss": 0.1353, |
|
"step": 1844 |
|
}, |
|
{ |
|
"epoch": 0.7384, |
|
"grad_norm": 0.021815890446305275, |
|
"learning_rate": 3.887491861803085e-06, |
|
"loss": 0.0674, |
|
"step": 1846 |
|
}, |
|
{ |
|
"epoch": 0.7392, |
|
"grad_norm": 0.043568458408117294, |
|
"learning_rate": 3.86541463490876e-06, |
|
"loss": 0.0284, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.013898522593080997, |
|
"learning_rate": 3.8433852467434175e-06, |
|
"loss": 0.0049, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7408, |
|
"grad_norm": 0.03106304258108139, |
|
"learning_rate": 3.821403869096658e-06, |
|
"loss": 0.2808, |
|
"step": 1852 |
|
}, |
|
{ |
|
"epoch": 0.7416, |
|
"grad_norm": 0.0395389050245285, |
|
"learning_rate": 3.7994706733836738e-06, |
|
"loss": 0.0115, |
|
"step": 1854 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 0.027892421931028366, |
|
"learning_rate": 3.7775858306439374e-06, |
|
"loss": 0.0126, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 0.7432, |
|
"grad_norm": 0.04609684646129608, |
|
"learning_rate": 3.7557495115398446e-06, |
|
"loss": 0.0117, |
|
"step": 1858 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 0.218703493475914, |
|
"learning_rate": 3.7339618863553983e-06, |
|
"loss": 0.042, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7448, |
|
"grad_norm": 0.03932720422744751, |
|
"learning_rate": 3.7122231249948747e-06, |
|
"loss": 0.0253, |
|
"step": 1862 |
|
}, |
|
{ |
|
"epoch": 0.7456, |
|
"grad_norm": 0.04857787489891052, |
|
"learning_rate": 3.6905333969815038e-06, |
|
"loss": 0.0178, |
|
"step": 1864 |
|
}, |
|
{ |
|
"epoch": 0.7464, |
|
"grad_norm": 0.12067477405071259, |
|
"learning_rate": 3.6688928714561444e-06, |
|
"loss": 0.0707, |
|
"step": 1866 |
|
}, |
|
{ |
|
"epoch": 0.7472, |
|
"grad_norm": 2.37345814704895, |
|
"learning_rate": 3.6473017171759563e-06, |
|
"loss": 0.4737, |
|
"step": 1868 |
|
}, |
|
{ |
|
"epoch": 0.748, |
|
"grad_norm": 0.13440009951591492, |
|
"learning_rate": 3.625760102513103e-06, |
|
"loss": 0.0347, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 3.3364574909210205, |
|
"learning_rate": 3.604268195453421e-06, |
|
"loss": 0.1831, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 0.7496, |
|
"grad_norm": 0.6941204071044922, |
|
"learning_rate": 3.582826163595119e-06, |
|
"loss": 0.0828, |
|
"step": 1874 |
|
}, |
|
{ |
|
"epoch": 0.7504, |
|
"grad_norm": 2.122856616973877, |
|
"learning_rate": 3.5614341741474633e-06, |
|
"loss": 0.6934, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 0.7512, |
|
"grad_norm": 1.272242546081543, |
|
"learning_rate": 3.540092393929494e-06, |
|
"loss": 0.1293, |
|
"step": 1878 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.18253956735134125, |
|
"learning_rate": 3.5188009893686916e-06, |
|
"loss": 0.3014, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7528, |
|
"grad_norm": 0.1802951991558075, |
|
"learning_rate": 3.4975601264997094e-06, |
|
"loss": 0.0245, |
|
"step": 1882 |
|
}, |
|
{ |
|
"epoch": 0.7536, |
|
"grad_norm": 0.06385163217782974, |
|
"learning_rate": 3.476369970963072e-06, |
|
"loss": 0.3051, |
|
"step": 1884 |
|
}, |
|
{ |
|
"epoch": 0.7544, |
|
"grad_norm": 0.04664032533764839, |
|
"learning_rate": 3.455230688003852e-06, |
|
"loss": 0.0243, |
|
"step": 1886 |
|
}, |
|
{ |
|
"epoch": 0.7552, |
|
"grad_norm": 0.09623868763446808, |
|
"learning_rate": 3.4341424424704373e-06, |
|
"loss": 0.0285, |
|
"step": 1888 |
|
}, |
|
{ |
|
"epoch": 0.756, |
|
"grad_norm": 0.12530668079853058, |
|
"learning_rate": 3.4131053988131947e-06, |
|
"loss": 0.1282, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7568, |
|
"grad_norm": 0.03179166465997696, |
|
"learning_rate": 3.3921197210832235e-06, |
|
"loss": 0.0231, |
|
"step": 1892 |
|
}, |
|
{ |
|
"epoch": 0.7576, |
|
"grad_norm": 0.3028177320957184, |
|
"learning_rate": 3.3711855729310482e-06, |
|
"loss": 0.0521, |
|
"step": 1894 |
|
}, |
|
{ |
|
"epoch": 0.7584, |
|
"grad_norm": 0.01785534806549549, |
|
"learning_rate": 3.3503031176053657e-06, |
|
"loss": 0.0962, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 0.7592, |
|
"grad_norm": 0.4004635214805603, |
|
"learning_rate": 3.3294725179517573e-06, |
|
"loss": 0.0461, |
|
"step": 1898 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.02001468650996685, |
|
"learning_rate": 3.308693936411421e-06, |
|
"loss": 0.0133, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7608, |
|
"grad_norm": 0.019235266372561455, |
|
"learning_rate": 3.287967535019908e-06, |
|
"loss": 0.006, |
|
"step": 1902 |
|
}, |
|
{ |
|
"epoch": 0.7616, |
|
"grad_norm": 0.0368565134704113, |
|
"learning_rate": 3.2672934754058615e-06, |
|
"loss": 0.0194, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 0.7624, |
|
"grad_norm": 0.045688826590776443, |
|
"learning_rate": 3.2466719187897555e-06, |
|
"loss": 0.0148, |
|
"step": 1906 |
|
}, |
|
{ |
|
"epoch": 0.7632, |
|
"grad_norm": 0.1634262502193451, |
|
"learning_rate": 3.2261030259826287e-06, |
|
"loss": 0.0357, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 0.764, |
|
"grad_norm": 0.03429366648197174, |
|
"learning_rate": 3.2055869573848374e-06, |
|
"loss": 0.1542, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.7648, |
|
"grad_norm": 0.04100421443581581, |
|
"learning_rate": 3.1851238729848033e-06, |
|
"loss": 0.0103, |
|
"step": 1912 |
|
}, |
|
{ |
|
"epoch": 0.7656, |
|
"grad_norm": 0.07986348867416382, |
|
"learning_rate": 3.164713932357776e-06, |
|
"loss": 0.0197, |
|
"step": 1914 |
|
}, |
|
{ |
|
"epoch": 0.7664, |
|
"grad_norm": 0.015995606780052185, |
|
"learning_rate": 3.144357294664565e-06, |
|
"loss": 0.0145, |
|
"step": 1916 |
|
}, |
|
{ |
|
"epoch": 0.7672, |
|
"grad_norm": 1.615556240081787, |
|
"learning_rate": 3.124054118650327e-06, |
|
"loss": 0.7627, |
|
"step": 1918 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.1365402489900589, |
|
"learning_rate": 3.103804562643302e-06, |
|
"loss": 0.04, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7688, |
|
"grad_norm": 0.2867192327976227, |
|
"learning_rate": 3.0836087845536e-06, |
|
"loss": 0.0462, |
|
"step": 1922 |
|
}, |
|
{ |
|
"epoch": 0.7696, |
|
"grad_norm": 1.3241535425186157, |
|
"learning_rate": 3.063466941871952e-06, |
|
"loss": 0.6603, |
|
"step": 1924 |
|
}, |
|
{ |
|
"epoch": 0.7704, |
|
"grad_norm": 0.1195574626326561, |
|
"learning_rate": 3.043379191668492e-06, |
|
"loss": 0.3676, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 0.7712, |
|
"grad_norm": 1.3540464639663696, |
|
"learning_rate": 3.023345690591537e-06, |
|
"loss": 0.0907, |
|
"step": 1928 |
|
}, |
|
{ |
|
"epoch": 0.772, |
|
"grad_norm": 0.03964836522936821, |
|
"learning_rate": 3.003366594866345e-06, |
|
"loss": 0.1992, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.7728, |
|
"grad_norm": 0.16762642562389374, |
|
"learning_rate": 2.983442060293926e-06, |
|
"loss": 0.182, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 0.7736, |
|
"grad_norm": 0.02284320630133152, |
|
"learning_rate": 2.963572242249799e-06, |
|
"loss": 0.0122, |
|
"step": 1934 |
|
}, |
|
{ |
|
"epoch": 0.7744, |
|
"grad_norm": 0.05577899515628815, |
|
"learning_rate": 2.9437572956827965e-06, |
|
"loss": 0.039, |
|
"step": 1936 |
|
}, |
|
{ |
|
"epoch": 0.7752, |
|
"grad_norm": 0.08313470333814621, |
|
"learning_rate": 2.9239973751138495e-06, |
|
"loss": 0.0276, |
|
"step": 1938 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 0.06292706727981567, |
|
"learning_rate": 2.9042926346347932e-06, |
|
"loss": 0.0117, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.7768, |
|
"grad_norm": 2.362842559814453, |
|
"learning_rate": 2.884643227907147e-06, |
|
"loss": 1.182, |
|
"step": 1942 |
|
}, |
|
{ |
|
"epoch": 0.7776, |
|
"grad_norm": 1.178203821182251, |
|
"learning_rate": 2.8650493081609344e-06, |
|
"loss": 0.312, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 0.7784, |
|
"grad_norm": 0.3593979775905609, |
|
"learning_rate": 2.8455110281934804e-06, |
|
"loss": 0.0304, |
|
"step": 1946 |
|
}, |
|
{ |
|
"epoch": 0.7792, |
|
"grad_norm": 0.2723308205604553, |
|
"learning_rate": 2.8260285403682153e-06, |
|
"loss": 0.0387, |
|
"step": 1948 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.010247028432786465, |
|
"learning_rate": 2.8066019966134907e-06, |
|
"loss": 0.0151, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7808, |
|
"grad_norm": 1.3679031133651733, |
|
"learning_rate": 2.7872315484213954e-06, |
|
"loss": 0.1826, |
|
"step": 1952 |
|
}, |
|
{ |
|
"epoch": 0.7816, |
|
"grad_norm": 0.024432742968201637, |
|
"learning_rate": 2.7679173468465813e-06, |
|
"loss": 0.0401, |
|
"step": 1954 |
|
}, |
|
{ |
|
"epoch": 0.7824, |
|
"grad_norm": 0.026111546903848648, |
|
"learning_rate": 2.7486595425050667e-06, |
|
"loss": 0.2, |
|
"step": 1956 |
|
}, |
|
{ |
|
"epoch": 0.7832, |
|
"grad_norm": 1.0953105688095093, |
|
"learning_rate": 2.7294582855730835e-06, |
|
"loss": 0.4462, |
|
"step": 1958 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 2.070159673690796, |
|
"learning_rate": 2.7103137257858867e-06, |
|
"loss": 0.5674, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.7848, |
|
"grad_norm": 0.26604995131492615, |
|
"learning_rate": 2.6912260124366007e-06, |
|
"loss": 0.0452, |
|
"step": 1962 |
|
}, |
|
{ |
|
"epoch": 0.7856, |
|
"grad_norm": 0.9278347492218018, |
|
"learning_rate": 2.672195294375045e-06, |
|
"loss": 0.4562, |
|
"step": 1964 |
|
}, |
|
{ |
|
"epoch": 0.7864, |
|
"grad_norm": 0.44804051518440247, |
|
"learning_rate": 2.6532217200065856e-06, |
|
"loss": 0.1851, |
|
"step": 1966 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 1.6803971529006958, |
|
"learning_rate": 2.634305437290968e-06, |
|
"loss": 0.4855, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 0.788, |
|
"grad_norm": 0.14506347477436066, |
|
"learning_rate": 2.615446593741161e-06, |
|
"loss": 0.0346, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.7888, |
|
"grad_norm": 0.01994413323700428, |
|
"learning_rate": 2.596645336422219e-06, |
|
"loss": 0.0191, |
|
"step": 1972 |
|
}, |
|
{ |
|
"epoch": 0.7896, |
|
"grad_norm": 1.5493515729904175, |
|
"learning_rate": 2.577901811950121e-06, |
|
"loss": 0.4661, |
|
"step": 1974 |
|
}, |
|
{ |
|
"epoch": 0.7904, |
|
"grad_norm": 0.20212967693805695, |
|
"learning_rate": 2.5592161664906366e-06, |
|
"loss": 0.1535, |
|
"step": 1976 |
|
}, |
|
{ |
|
"epoch": 0.7912, |
|
"grad_norm": 0.20568671822547913, |
|
"learning_rate": 2.5405885457581793e-06, |
|
"loss": 0.0685, |
|
"step": 1978 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 0.2657265365123749, |
|
"learning_rate": 2.522019095014683e-06, |
|
"loss": 0.5426, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.7928, |
|
"grad_norm": 0.39118292927742004, |
|
"learning_rate": 2.5035079590684496e-06, |
|
"loss": 0.0688, |
|
"step": 1982 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 0.2671131193637848, |
|
"learning_rate": 2.48505528227304e-06, |
|
"loss": 0.0788, |
|
"step": 1984 |
|
}, |
|
{ |
|
"epoch": 0.7944, |
|
"grad_norm": 0.055703867226839066, |
|
"learning_rate": 2.4666612085261344e-06, |
|
"loss": 0.0482, |
|
"step": 1986 |
|
}, |
|
{ |
|
"epoch": 0.7952, |
|
"grad_norm": 0.03526290878653526, |
|
"learning_rate": 2.4483258812684096e-06, |
|
"loss": 0.0153, |
|
"step": 1988 |
|
}, |
|
{ |
|
"epoch": 0.796, |
|
"grad_norm": 1.1341354846954346, |
|
"learning_rate": 2.4300494434824373e-06, |
|
"loss": 0.6923, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.7968, |
|
"grad_norm": 0.07913788408041, |
|
"learning_rate": 2.411832037691545e-06, |
|
"loss": 0.0221, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 0.7976, |
|
"grad_norm": 0.10583814233541489, |
|
"learning_rate": 2.3936738059587284e-06, |
|
"loss": 0.0335, |
|
"step": 1994 |
|
}, |
|
{ |
|
"epoch": 0.7984, |
|
"grad_norm": 0.019925588741898537, |
|
"learning_rate": 2.37557488988552e-06, |
|
"loss": 0.0187, |
|
"step": 1996 |
|
}, |
|
{ |
|
"epoch": 0.7992, |
|
"grad_norm": 0.020932232961058617, |
|
"learning_rate": 2.35753543061091e-06, |
|
"loss": 0.0373, |
|
"step": 1998 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.03722110390663147, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.0309, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8008, |
|
"grad_norm": 0.6024539470672607, |
|
"learning_rate": 2.321635444694028e-06, |
|
"loss": 0.1057, |
|
"step": 2002 |
|
}, |
|
{ |
|
"epoch": 0.8016, |
|
"grad_norm": 0.28481170535087585, |
|
"learning_rate": 2.3037751980070557e-06, |
|
"loss": 0.0379, |
|
"step": 2004 |
|
}, |
|
{ |
|
"epoch": 0.8024, |
|
"grad_norm": 0.13168196380138397, |
|
"learning_rate": 2.2859749680270983e-06, |
|
"loss": 0.0436, |
|
"step": 2006 |
|
}, |
|
{ |
|
"epoch": 0.8032, |
|
"grad_norm": 0.02613903023302555, |
|
"learning_rate": 2.2682348935639274e-06, |
|
"loss": 0.0622, |
|
"step": 2008 |
|
}, |
|
{ |
|
"epoch": 0.804, |
|
"grad_norm": 0.3884766399860382, |
|
"learning_rate": 2.2505551129582047e-06, |
|
"loss": 0.0609, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.8048, |
|
"grad_norm": 0.014485559426248074, |
|
"learning_rate": 2.2329357640804118e-06, |
|
"loss": 0.024, |
|
"step": 2012 |
|
}, |
|
{ |
|
"epoch": 0.8056, |
|
"grad_norm": 0.09305766969919205, |
|
"learning_rate": 2.215376984329767e-06, |
|
"loss": 0.0157, |
|
"step": 2014 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 0.23493258655071259, |
|
"learning_rate": 2.1978789106331666e-06, |
|
"loss": 0.1795, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 0.8072, |
|
"grad_norm": 0.2084125131368637, |
|
"learning_rate": 2.1804416794441e-06, |
|
"loss": 0.0342, |
|
"step": 2018 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 0.08282342553138733, |
|
"learning_rate": 2.163065426741603e-06, |
|
"loss": 0.0217, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.8088, |
|
"grad_norm": 0.10017091780900955, |
|
"learning_rate": 2.1457502880291815e-06, |
|
"loss": 0.3079, |
|
"step": 2022 |
|
}, |
|
{ |
|
"epoch": 0.8096, |
|
"grad_norm": 0.11336953938007355, |
|
"learning_rate": 2.128496398333768e-06, |
|
"loss": 0.0351, |
|
"step": 2024 |
|
}, |
|
{ |
|
"epoch": 0.8104, |
|
"grad_norm": 2.011279344558716, |
|
"learning_rate": 2.1113038922046603e-06, |
|
"loss": 0.6162, |
|
"step": 2026 |
|
}, |
|
{ |
|
"epoch": 0.8112, |
|
"grad_norm": 0.02046182006597519, |
|
"learning_rate": 2.09417290371247e-06, |
|
"loss": 0.0658, |
|
"step": 2028 |
|
}, |
|
{ |
|
"epoch": 0.812, |
|
"grad_norm": 0.0351959727704525, |
|
"learning_rate": 2.0771035664480944e-06, |
|
"loss": 0.7813, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.8128, |
|
"grad_norm": 0.031053269281983376, |
|
"learning_rate": 2.0600960135216463e-06, |
|
"loss": 0.0579, |
|
"step": 2032 |
|
}, |
|
{ |
|
"epoch": 0.8136, |
|
"grad_norm": 0.027455810457468033, |
|
"learning_rate": 2.0431503775614457e-06, |
|
"loss": 0.0735, |
|
"step": 2034 |
|
}, |
|
{ |
|
"epoch": 0.8144, |
|
"grad_norm": 0.030600672587752342, |
|
"learning_rate": 2.026266790712965e-06, |
|
"loss": 0.0096, |
|
"step": 2036 |
|
}, |
|
{ |
|
"epoch": 0.8152, |
|
"grad_norm": 0.11856792867183685, |
|
"learning_rate": 2.009445384637805e-06, |
|
"loss": 0.0198, |
|
"step": 2038 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.0909661203622818, |
|
"learning_rate": 1.9926862905126663e-06, |
|
"loss": 0.0384, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.8168, |
|
"grad_norm": 0.008265670388936996, |
|
"learning_rate": 1.9759896390283362e-06, |
|
"loss": 0.092, |
|
"step": 2042 |
|
}, |
|
{ |
|
"epoch": 0.8176, |
|
"grad_norm": 0.035349734127521515, |
|
"learning_rate": 1.959355560388654e-06, |
|
"loss": 0.0277, |
|
"step": 2044 |
|
}, |
|
{ |
|
"epoch": 0.8184, |
|
"grad_norm": 0.058402419090270996, |
|
"learning_rate": 1.9427841843095063e-06, |
|
"loss": 0.0175, |
|
"step": 2046 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 0.11960252374410629, |
|
"learning_rate": 1.9262756400178163e-06, |
|
"loss": 0.4295, |
|
"step": 2048 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.08432687819004059, |
|
"learning_rate": 1.9098300562505266e-06, |
|
"loss": 0.0149, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.8208, |
|
"grad_norm": 1.8159247636795044, |
|
"learning_rate": 1.8934475612536019e-06, |
|
"loss": 0.2313, |
|
"step": 2052 |
|
}, |
|
{ |
|
"epoch": 0.8216, |
|
"grad_norm": 0.7769438028335571, |
|
"learning_rate": 1.8771282827810278e-06, |
|
"loss": 0.066, |
|
"step": 2054 |
|
}, |
|
{ |
|
"epoch": 0.8224, |
|
"grad_norm": 0.3301757276058197, |
|
"learning_rate": 1.8608723480938207e-06, |
|
"loss": 0.0571, |
|
"step": 2056 |
|
}, |
|
{ |
|
"epoch": 0.8232, |
|
"grad_norm": 0.009649352170526981, |
|
"learning_rate": 1.8446798839590186e-06, |
|
"loss": 0.0781, |
|
"step": 2058 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 0.014373435638844967, |
|
"learning_rate": 1.8285510166487154e-06, |
|
"loss": 0.0129, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.8248, |
|
"grad_norm": 0.08050563931465149, |
|
"learning_rate": 1.812485871939056e-06, |
|
"loss": 0.3065, |
|
"step": 2062 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 0.0180222075432539, |
|
"learning_rate": 1.7964845751092663e-06, |
|
"loss": 0.0089, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 0.8264, |
|
"grad_norm": 0.15466023981571198, |
|
"learning_rate": 1.7805472509406695e-06, |
|
"loss": 0.0321, |
|
"step": 2066 |
|
}, |
|
{ |
|
"epoch": 0.8272, |
|
"grad_norm": 0.07331327348947525, |
|
"learning_rate": 1.7646740237157256e-06, |
|
"loss": 0.021, |
|
"step": 2068 |
|
}, |
|
{ |
|
"epoch": 0.828, |
|
"grad_norm": 0.1734917163848877, |
|
"learning_rate": 1.7488650172170496e-06, |
|
"loss": 0.0255, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.8288, |
|
"grad_norm": 0.023930072784423828, |
|
"learning_rate": 1.7331203547264452e-06, |
|
"loss": 0.0887, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 0.8296, |
|
"grad_norm": 0.027445461601018906, |
|
"learning_rate": 1.7174401590239587e-06, |
|
"loss": 0.1292, |
|
"step": 2074 |
|
}, |
|
{ |
|
"epoch": 0.8304, |
|
"grad_norm": 0.07482447475194931, |
|
"learning_rate": 1.7018245523869038e-06, |
|
"loss": 0.0384, |
|
"step": 2076 |
|
}, |
|
{ |
|
"epoch": 0.8312, |
|
"grad_norm": 0.03953075036406517, |
|
"learning_rate": 1.686273656588917e-06, |
|
"loss": 0.0082, |
|
"step": 2078 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.05215131863951683, |
|
"learning_rate": 1.6707875928990059e-06, |
|
"loss": 0.0412, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8328, |
|
"grad_norm": 0.061881113797426224, |
|
"learning_rate": 1.6553664820806102e-06, |
|
"loss": 0.04, |
|
"step": 2082 |
|
}, |
|
{ |
|
"epoch": 0.8336, |
|
"grad_norm": 0.1969269961118698, |
|
"learning_rate": 1.6400104443906463e-06, |
|
"loss": 0.0356, |
|
"step": 2084 |
|
}, |
|
{ |
|
"epoch": 0.8344, |
|
"grad_norm": 0.0532572865486145, |
|
"learning_rate": 1.6247195995785836e-06, |
|
"loss": 0.0115, |
|
"step": 2086 |
|
}, |
|
{ |
|
"epoch": 0.8352, |
|
"grad_norm": 0.07264053821563721, |
|
"learning_rate": 1.6094940668855008e-06, |
|
"loss": 0.0126, |
|
"step": 2088 |
|
}, |
|
{ |
|
"epoch": 0.836, |
|
"grad_norm": 1.00359308719635, |
|
"learning_rate": 1.5943339650431578e-06, |
|
"loss": 0.1388, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.8368, |
|
"grad_norm": 0.0406976044178009, |
|
"learning_rate": 1.579239412273078e-06, |
|
"loss": 0.0194, |
|
"step": 2092 |
|
}, |
|
{ |
|
"epoch": 0.8376, |
|
"grad_norm": 0.02532949112355709, |
|
"learning_rate": 1.5642105262856122e-06, |
|
"loss": 0.0081, |
|
"step": 2094 |
|
}, |
|
{ |
|
"epoch": 0.8384, |
|
"grad_norm": 0.10549493879079819, |
|
"learning_rate": 1.5492474242790368e-06, |
|
"loss": 0.075, |
|
"step": 2096 |
|
}, |
|
{ |
|
"epoch": 0.8392, |
|
"grad_norm": 0.029608091339468956, |
|
"learning_rate": 1.5343502229386209e-06, |
|
"loss": 0.022, |
|
"step": 2098 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.5157110691070557, |
|
"learning_rate": 1.5195190384357405e-06, |
|
"loss": 0.1403, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8408, |
|
"grad_norm": 0.13249818980693817, |
|
"learning_rate": 1.5047539864269477e-06, |
|
"loss": 0.1111, |
|
"step": 2102 |
|
}, |
|
{ |
|
"epoch": 0.8416, |
|
"grad_norm": 0.09822747856378555, |
|
"learning_rate": 1.490055182053083e-06, |
|
"loss": 0.0193, |
|
"step": 2104 |
|
}, |
|
{ |
|
"epoch": 0.8424, |
|
"grad_norm": 0.7339903712272644, |
|
"learning_rate": 1.4754227399383758e-06, |
|
"loss": 0.0544, |
|
"step": 2106 |
|
}, |
|
{ |
|
"epoch": 0.8432, |
|
"grad_norm": 0.03530171886086464, |
|
"learning_rate": 1.4608567741895496e-06, |
|
"loss": 0.0135, |
|
"step": 2108 |
|
}, |
|
{ |
|
"epoch": 0.844, |
|
"grad_norm": 0.5551167726516724, |
|
"learning_rate": 1.446357398394934e-06, |
|
"loss": 0.1178, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 0.15902993083000183, |
|
"learning_rate": 1.4319247256235713e-06, |
|
"loss": 0.1147, |
|
"step": 2112 |
|
}, |
|
{ |
|
"epoch": 0.8456, |
|
"grad_norm": 0.7204644083976746, |
|
"learning_rate": 1.4175588684243447e-06, |
|
"loss": 0.1095, |
|
"step": 2114 |
|
}, |
|
{ |
|
"epoch": 0.8464, |
|
"grad_norm": 0.05895598977804184, |
|
"learning_rate": 1.40325993882509e-06, |
|
"loss": 0.0297, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 0.8472, |
|
"grad_norm": 0.9869866371154785, |
|
"learning_rate": 1.3890280483317375e-06, |
|
"loss": 0.1489, |
|
"step": 2118 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.005029854364693165, |
|
"learning_rate": 1.3748633079274254e-06, |
|
"loss": 0.1327, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.8488, |
|
"grad_norm": 0.14541175961494446, |
|
"learning_rate": 1.3607658280716474e-06, |
|
"loss": 0.1304, |
|
"step": 2122 |
|
}, |
|
{ |
|
"epoch": 0.8496, |
|
"grad_norm": 0.051523029804229736, |
|
"learning_rate": 1.3467357186993802e-06, |
|
"loss": 0.0182, |
|
"step": 2124 |
|
}, |
|
{ |
|
"epoch": 0.8504, |
|
"grad_norm": 0.13095594942569733, |
|
"learning_rate": 1.3327730892202384e-06, |
|
"loss": 0.4018, |
|
"step": 2126 |
|
}, |
|
{ |
|
"epoch": 0.8512, |
|
"grad_norm": 0.07890293002128601, |
|
"learning_rate": 1.3188780485176089e-06, |
|
"loss": 0.0119, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 0.852, |
|
"grad_norm": 0.0385555699467659, |
|
"learning_rate": 1.30505070494781e-06, |
|
"loss": 0.0285, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.8528, |
|
"grad_norm": 0.022745564579963684, |
|
"learning_rate": 1.2912911663392468e-06, |
|
"loss": 0.0219, |
|
"step": 2132 |
|
}, |
|
{ |
|
"epoch": 0.8536, |
|
"grad_norm": 0.04321039468050003, |
|
"learning_rate": 1.277599539991563e-06, |
|
"loss": 0.061, |
|
"step": 2134 |
|
}, |
|
{ |
|
"epoch": 0.8544, |
|
"grad_norm": 0.014432685449719429, |
|
"learning_rate": 1.2639759326748136e-06, |
|
"loss": 0.0409, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 0.8552, |
|
"grad_norm": 0.019150329753756523, |
|
"learning_rate": 1.2504204506286244e-06, |
|
"loss": 0.0051, |
|
"step": 2138 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 1.3626798391342163, |
|
"learning_rate": 1.2369331995613664e-06, |
|
"loss": 0.1418, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.8568, |
|
"grad_norm": 0.02268528752028942, |
|
"learning_rate": 1.223514284649331e-06, |
|
"loss": 0.2405, |
|
"step": 2142 |
|
}, |
|
{ |
|
"epoch": 0.8576, |
|
"grad_norm": 0.5977320671081543, |
|
"learning_rate": 1.210163810535917e-06, |
|
"loss": 0.0792, |
|
"step": 2144 |
|
}, |
|
{ |
|
"epoch": 0.8584, |
|
"grad_norm": 0.01674368605017662, |
|
"learning_rate": 1.196881881330798e-06, |
|
"loss": 0.0047, |
|
"step": 2146 |
|
}, |
|
{ |
|
"epoch": 0.8592, |
|
"grad_norm": 0.22216765582561493, |
|
"learning_rate": 1.1836686006091313e-06, |
|
"loss": 0.4924, |
|
"step": 2148 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.04747781902551651, |
|
"learning_rate": 1.1705240714107301e-06, |
|
"loss": 0.0192, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.8608, |
|
"grad_norm": 0.12776315212249756, |
|
"learning_rate": 1.1574483962392768e-06, |
|
"loss": 0.031, |
|
"step": 2152 |
|
}, |
|
{ |
|
"epoch": 0.8616, |
|
"grad_norm": 0.022700203582644463, |
|
"learning_rate": 1.1444416770615118e-06, |
|
"loss": 0.0062, |
|
"step": 2154 |
|
}, |
|
{ |
|
"epoch": 0.8624, |
|
"grad_norm": 0.07675009965896606, |
|
"learning_rate": 1.1315040153064416e-06, |
|
"loss": 0.0334, |
|
"step": 2156 |
|
}, |
|
{ |
|
"epoch": 0.8632, |
|
"grad_norm": 0.0223082285374403, |
|
"learning_rate": 1.1186355118645552e-06, |
|
"loss": 0.0168, |
|
"step": 2158 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.034395378082990646, |
|
"learning_rate": 1.1058362670870248e-06, |
|
"loss": 0.0086, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.8648, |
|
"grad_norm": 0.015471206046640873, |
|
"learning_rate": 1.093106380784934e-06, |
|
"loss": 0.1392, |
|
"step": 2162 |
|
}, |
|
{ |
|
"epoch": 0.8656, |
|
"grad_norm": 0.013013385236263275, |
|
"learning_rate": 1.0804459522284927e-06, |
|
"loss": 0.0584, |
|
"step": 2164 |
|
}, |
|
{ |
|
"epoch": 0.8664, |
|
"grad_norm": 0.03496384248137474, |
|
"learning_rate": 1.0678550801462662e-06, |
|
"loss": 0.0052, |
|
"step": 2166 |
|
}, |
|
{ |
|
"epoch": 0.8672, |
|
"grad_norm": 0.25604620575904846, |
|
"learning_rate": 1.0553338627244026e-06, |
|
"loss": 0.0329, |
|
"step": 2168 |
|
}, |
|
{ |
|
"epoch": 0.868, |
|
"grad_norm": 1.7641575336456299, |
|
"learning_rate": 1.042882397605871e-06, |
|
"loss": 0.1352, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.8688, |
|
"grad_norm": 0.083366259932518, |
|
"learning_rate": 1.0305007818897006e-06, |
|
"loss": 0.0171, |
|
"step": 2172 |
|
}, |
|
{ |
|
"epoch": 0.8696, |
|
"grad_norm": 1.2676173448562622, |
|
"learning_rate": 1.0181891121302145e-06, |
|
"loss": 1.594, |
|
"step": 2174 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 0.604960560798645, |
|
"learning_rate": 1.0059474843362893e-06, |
|
"loss": 0.2283, |
|
"step": 2176 |
|
}, |
|
{ |
|
"epoch": 0.8712, |
|
"grad_norm": 0.16328264772891998, |
|
"learning_rate": 9.93775993970597e-07, |
|
"loss": 0.0237, |
|
"step": 2178 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 0.01555766724050045, |
|
"learning_rate": 9.816747359488632e-07, |
|
"loss": 0.1311, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.8728, |
|
"grad_norm": 0.6382170915603638, |
|
"learning_rate": 9.696438046391288e-07, |
|
"loss": 0.0781, |
|
"step": 2182 |
|
}, |
|
{ |
|
"epoch": 0.8736, |
|
"grad_norm": 0.1925554871559143, |
|
"learning_rate": 9.576832938610137e-07, |
|
"loss": 0.057, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 0.8744, |
|
"grad_norm": 0.08132878690958023, |
|
"learning_rate": 9.457932968849826e-07, |
|
"loss": 0.0189, |
|
"step": 2186 |
|
}, |
|
{ |
|
"epoch": 0.8752, |
|
"grad_norm": 0.036015480756759644, |
|
"learning_rate": 9.339739064316233e-07, |
|
"loss": 0.1586, |
|
"step": 2188 |
|
}, |
|
{ |
|
"epoch": 0.876, |
|
"grad_norm": 1.3472023010253906, |
|
"learning_rate": 9.222252146709143e-07, |
|
"loss": 0.4576, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.8768, |
|
"grad_norm": 2.6704301834106445, |
|
"learning_rate": 9.105473132215126e-07, |
|
"loss": 0.1486, |
|
"step": 2192 |
|
}, |
|
{ |
|
"epoch": 0.8776, |
|
"grad_norm": 0.029902072623372078, |
|
"learning_rate": 8.989402931500434e-07, |
|
"loss": 0.032, |
|
"step": 2194 |
|
}, |
|
{ |
|
"epoch": 0.8784, |
|
"grad_norm": 0.036093611270189285, |
|
"learning_rate": 8.874042449703779e-07, |
|
"loss": 0.0153, |
|
"step": 2196 |
|
}, |
|
{ |
|
"epoch": 0.8792, |
|
"grad_norm": 0.6965168118476868, |
|
"learning_rate": 8.759392586429394e-07, |
|
"loss": 0.0854, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.3597988188266754, |
|
"learning_rate": 8.645454235739903e-07, |
|
"loss": 0.0457, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8808, |
|
"grad_norm": 0.15428324043750763, |
|
"learning_rate": 8.532228286149502e-07, |
|
"loss": 0.0202, |
|
"step": 2202 |
|
}, |
|
{ |
|
"epoch": 0.8816, |
|
"grad_norm": 0.1308208853006363, |
|
"learning_rate": 8.419715620616875e-07, |
|
"loss": 0.0235, |
|
"step": 2204 |
|
}, |
|
{ |
|
"epoch": 0.8824, |
|
"grad_norm": 0.8089284300804138, |
|
"learning_rate": 8.307917116538378e-07, |
|
"loss": 0.0771, |
|
"step": 2206 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 1.2626261711120605, |
|
"learning_rate": 8.196833645741187e-07, |
|
"loss": 0.6288, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 0.884, |
|
"grad_norm": 0.0954088419675827, |
|
"learning_rate": 8.086466074476562e-07, |
|
"loss": 0.0133, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.8848, |
|
"grad_norm": 0.13527199625968933, |
|
"learning_rate": 7.976815263412963e-07, |
|
"loss": 0.0211, |
|
"step": 2212 |
|
}, |
|
{ |
|
"epoch": 0.8856, |
|
"grad_norm": 0.5545380115509033, |
|
"learning_rate": 7.867882067629473e-07, |
|
"loss": 0.0581, |
|
"step": 2214 |
|
}, |
|
{ |
|
"epoch": 0.8864, |
|
"grad_norm": 0.06293027848005295, |
|
"learning_rate": 7.759667336609011e-07, |
|
"loss": 0.0264, |
|
"step": 2216 |
|
}, |
|
{ |
|
"epoch": 0.8872, |
|
"grad_norm": 0.3431554138660431, |
|
"learning_rate": 7.652171914231777e-07, |
|
"loss": 0.0329, |
|
"step": 2218 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 0.00832755770534277, |
|
"learning_rate": 7.545396638768698e-07, |
|
"loss": 0.0099, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.8888, |
|
"grad_norm": 2.4031519889831543, |
|
"learning_rate": 7.439342342874789e-07, |
|
"loss": 0.1471, |
|
"step": 2222 |
|
}, |
|
{ |
|
"epoch": 0.8896, |
|
"grad_norm": 0.06904103606939316, |
|
"learning_rate": 7.334009853582791e-07, |
|
"loss": 0.0304, |
|
"step": 2224 |
|
}, |
|
{ |
|
"epoch": 0.8904, |
|
"grad_norm": 0.24774344265460968, |
|
"learning_rate": 7.22939999229657e-07, |
|
"loss": 0.0465, |
|
"step": 2226 |
|
}, |
|
{ |
|
"epoch": 0.8912, |
|
"grad_norm": 1.129273533821106, |
|
"learning_rate": 7.125513574784904e-07, |
|
"loss": 0.1176, |
|
"step": 2228 |
|
}, |
|
{ |
|
"epoch": 0.892, |
|
"grad_norm": 0.13521403074264526, |
|
"learning_rate": 7.022351411174866e-07, |
|
"loss": 0.0401, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.8928, |
|
"grad_norm": 0.025177139788866043, |
|
"learning_rate": 6.919914305945774e-07, |
|
"loss": 0.0051, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 0.8936, |
|
"grad_norm": 2.7186474800109863, |
|
"learning_rate": 6.818203057922756e-07, |
|
"loss": 0.6471, |
|
"step": 2234 |
|
}, |
|
{ |
|
"epoch": 0.8944, |
|
"grad_norm": 0.5871244072914124, |
|
"learning_rate": 6.717218460270536e-07, |
|
"loss": 0.0628, |
|
"step": 2236 |
|
}, |
|
{ |
|
"epoch": 0.8952, |
|
"grad_norm": 0.3472304344177246, |
|
"learning_rate": 6.616961300487323e-07, |
|
"loss": 0.0305, |
|
"step": 2238 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.04057146608829498, |
|
"learning_rate": 6.517432360398556e-07, |
|
"loss": 0.7828, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8968, |
|
"grad_norm": 0.03729462996125221, |
|
"learning_rate": 6.418632416150927e-07, |
|
"loss": 0.006, |
|
"step": 2242 |
|
}, |
|
{ |
|
"epoch": 0.8976, |
|
"grad_norm": 0.022776370868086815, |
|
"learning_rate": 6.320562238206218e-07, |
|
"loss": 0.0087, |
|
"step": 2244 |
|
}, |
|
{ |
|
"epoch": 0.8984, |
|
"grad_norm": 0.06814192980527878, |
|
"learning_rate": 6.223222591335409e-07, |
|
"loss": 0.0067, |
|
"step": 2246 |
|
}, |
|
{ |
|
"epoch": 0.8992, |
|
"grad_norm": 0.012981448322534561, |
|
"learning_rate": 6.126614234612593e-07, |
|
"loss": 0.0273, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7651333212852478, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.1298, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.9008, |
|
"grad_norm": 0.5903016924858093, |
|
"learning_rate": 5.935594399387856e-07, |
|
"loss": 0.0611, |
|
"step": 2252 |
|
}, |
|
{ |
|
"epoch": 0.9016, |
|
"grad_norm": 0.06699836999177933, |
|
"learning_rate": 5.841184410496992e-07, |
|
"loss": 0.0083, |
|
"step": 2254 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 0.124520443379879, |
|
"learning_rate": 5.747508690964599e-07, |
|
"loss": 0.3001, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 0.9032, |
|
"grad_norm": 0.16008125245571136, |
|
"learning_rate": 5.654567971292757e-07, |
|
"loss": 0.0201, |
|
"step": 2258 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 0.07481832057237625, |
|
"learning_rate": 5.562362976251901e-07, |
|
"loss": 0.0851, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.9048, |
|
"grad_norm": 0.6332103610038757, |
|
"learning_rate": 5.470894424875062e-07, |
|
"loss": 0.1097, |
|
"step": 2262 |
|
}, |
|
{ |
|
"epoch": 0.9056, |
|
"grad_norm": 0.014627913013100624, |
|
"learning_rate": 5.380163030452412e-07, |
|
"loss": 0.0236, |
|
"step": 2264 |
|
}, |
|
{ |
|
"epoch": 0.9064, |
|
"grad_norm": 0.6561143398284912, |
|
"learning_rate": 5.290169500525577e-07, |
|
"loss": 0.0778, |
|
"step": 2266 |
|
}, |
|
{ |
|
"epoch": 0.9072, |
|
"grad_norm": 0.017556993290781975, |
|
"learning_rate": 5.200914536882184e-07, |
|
"loss": 0.0088, |
|
"step": 2268 |
|
}, |
|
{ |
|
"epoch": 0.908, |
|
"grad_norm": 0.11898583173751831, |
|
"learning_rate": 5.112398835550348e-07, |
|
"loss": 0.0246, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.9088, |
|
"grad_norm": 1.1768893003463745, |
|
"learning_rate": 5.024623086793323e-07, |
|
"loss": 0.0801, |
|
"step": 2272 |
|
}, |
|
{ |
|
"epoch": 0.9096, |
|
"grad_norm": 0.29487040638923645, |
|
"learning_rate": 4.937587975103997e-07, |
|
"loss": 0.0533, |
|
"step": 2274 |
|
}, |
|
{ |
|
"epoch": 0.9104, |
|
"grad_norm": 1.571694016456604, |
|
"learning_rate": 4.851294179199673e-07, |
|
"loss": 0.65, |
|
"step": 2276 |
|
}, |
|
{ |
|
"epoch": 0.9112, |
|
"grad_norm": 0.01853368431329727, |
|
"learning_rate": 4.765742372016735e-07, |
|
"loss": 0.0912, |
|
"step": 2278 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 1.2929413318634033, |
|
"learning_rate": 4.6809332207053083e-07, |
|
"loss": 0.0748, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.9128, |
|
"grad_norm": 0.019635476171970367, |
|
"learning_rate": 4.596867386624215e-07, |
|
"loss": 0.0165, |
|
"step": 2282 |
|
}, |
|
{ |
|
"epoch": 0.9136, |
|
"grad_norm": 0.04413539171218872, |
|
"learning_rate": 4.5135455253357053e-07, |
|
"loss": 0.0407, |
|
"step": 2284 |
|
}, |
|
{ |
|
"epoch": 0.9144, |
|
"grad_norm": 0.022891348227858543, |
|
"learning_rate": 4.4309682866004124e-07, |
|
"loss": 0.0352, |
|
"step": 2286 |
|
}, |
|
{ |
|
"epoch": 0.9152, |
|
"grad_norm": 0.0767710953950882, |
|
"learning_rate": 4.349136314372204e-07, |
|
"loss": 0.0121, |
|
"step": 2288 |
|
}, |
|
{ |
|
"epoch": 0.916, |
|
"grad_norm": 0.13404303789138794, |
|
"learning_rate": 4.268050246793276e-07, |
|
"loss": 0.0556, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.9168, |
|
"grad_norm": 0.4251805543899536, |
|
"learning_rate": 4.1877107161890416e-07, |
|
"loss": 0.1859, |
|
"step": 2292 |
|
}, |
|
{ |
|
"epoch": 0.9176, |
|
"grad_norm": 1.6532477140426636, |
|
"learning_rate": 4.108118349063306e-07, |
|
"loss": 0.6478, |
|
"step": 2294 |
|
}, |
|
{ |
|
"epoch": 0.9184, |
|
"grad_norm": 0.1750348061323166, |
|
"learning_rate": 4.0292737660933335e-07, |
|
"loss": 0.0384, |
|
"step": 2296 |
|
}, |
|
{ |
|
"epoch": 0.9192, |
|
"grad_norm": 1.260365605354309, |
|
"learning_rate": 3.9511775821250206e-07, |
|
"loss": 0.8857, |
|
"step": 2298 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.05385835841298103, |
|
"learning_rate": 3.8738304061681107e-07, |
|
"loss": 0.1621, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.9208, |
|
"grad_norm": 0.4076821208000183, |
|
"learning_rate": 3.7972328413914074e-07, |
|
"loss": 0.1319, |
|
"step": 2302 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 0.42220211029052734, |
|
"learning_rate": 3.721385485118123e-07, |
|
"loss": 0.2582, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 0.9224, |
|
"grad_norm": 0.2544540762901306, |
|
"learning_rate": 3.646288928821151e-07, |
|
"loss": 0.0197, |
|
"step": 2306 |
|
}, |
|
{ |
|
"epoch": 0.9232, |
|
"grad_norm": 1.2406009435653687, |
|
"learning_rate": 3.571943758118546e-07, |
|
"loss": 0.8589, |
|
"step": 2308 |
|
}, |
|
{ |
|
"epoch": 0.924, |
|
"grad_norm": 0.11001749336719513, |
|
"learning_rate": 3.498350552768859e-07, |
|
"loss": 0.0317, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.9248, |
|
"grad_norm": 0.5791686177253723, |
|
"learning_rate": 3.4255098866667114e-07, |
|
"loss": 0.0695, |
|
"step": 2312 |
|
}, |
|
{ |
|
"epoch": 0.9256, |
|
"grad_norm": 0.005229198839515448, |
|
"learning_rate": 3.3534223278382405e-07, |
|
"loss": 0.0093, |
|
"step": 2314 |
|
}, |
|
{ |
|
"epoch": 0.9264, |
|
"grad_norm": 0.09191671758890152, |
|
"learning_rate": 3.282088438436715e-07, |
|
"loss": 0.0201, |
|
"step": 2316 |
|
}, |
|
{ |
|
"epoch": 0.9272, |
|
"grad_norm": 0.32510700821876526, |
|
"learning_rate": 3.211508774738137e-07, |
|
"loss": 0.039, |
|
"step": 2318 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.027174528688192368, |
|
"learning_rate": 3.1416838871368925e-07, |
|
"loss": 0.0191, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.9288, |
|
"grad_norm": 0.0871250182390213, |
|
"learning_rate": 3.072614320141487e-07, |
|
"loss": 0.0112, |
|
"step": 2322 |
|
}, |
|
{ |
|
"epoch": 0.9296, |
|
"grad_norm": 0.11660738289356232, |
|
"learning_rate": 3.00430061237027e-07, |
|
"loss": 0.0281, |
|
"step": 2324 |
|
}, |
|
{ |
|
"epoch": 0.9304, |
|
"grad_norm": 0.009191269055008888, |
|
"learning_rate": 2.936743296547273e-07, |
|
"loss": 0.0221, |
|
"step": 2326 |
|
}, |
|
{ |
|
"epoch": 0.9312, |
|
"grad_norm": 0.05838488042354584, |
|
"learning_rate": 2.8699428994980017e-07, |
|
"loss": 0.9408, |
|
"step": 2328 |
|
}, |
|
{ |
|
"epoch": 0.932, |
|
"grad_norm": 1.4220937490463257, |
|
"learning_rate": 2.8038999421453827e-07, |
|
"loss": 0.0656, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.9328, |
|
"grad_norm": 0.08184941858053207, |
|
"learning_rate": 2.7386149395056463e-07, |
|
"loss": 0.4858, |
|
"step": 2332 |
|
}, |
|
{ |
|
"epoch": 0.9336, |
|
"grad_norm": 0.7450655102729797, |
|
"learning_rate": 2.6740884006843826e-07, |
|
"loss": 0.0379, |
|
"step": 2334 |
|
}, |
|
{ |
|
"epoch": 0.9344, |
|
"grad_norm": 1.6832029819488525, |
|
"learning_rate": 2.6103208288724815e-07, |
|
"loss": 0.6098, |
|
"step": 2336 |
|
}, |
|
{ |
|
"epoch": 0.9352, |
|
"grad_norm": 0.09961254149675369, |
|
"learning_rate": 2.547312721342277e-07, |
|
"loss": 0.0168, |
|
"step": 2338 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 0.10063952207565308, |
|
"learning_rate": 2.4850645694436736e-07, |
|
"loss": 0.0119, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9368, |
|
"grad_norm": 0.06440629810094833, |
|
"learning_rate": 2.423576858600252e-07, |
|
"loss": 0.0119, |
|
"step": 2342 |
|
}, |
|
{ |
|
"epoch": 0.9376, |
|
"grad_norm": 0.01774456538259983, |
|
"learning_rate": 2.3628500683055222e-07, |
|
"loss": 0.0081, |
|
"step": 2344 |
|
}, |
|
{ |
|
"epoch": 0.9384, |
|
"grad_norm": 1.3639814853668213, |
|
"learning_rate": 2.3028846721191878e-07, |
|
"loss": 0.1012, |
|
"step": 2346 |
|
}, |
|
{ |
|
"epoch": 0.9392, |
|
"grad_norm": 0.010448544286191463, |
|
"learning_rate": 2.2436811376634893e-07, |
|
"loss": 0.0061, |
|
"step": 2348 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.05712028965353966, |
|
"learning_rate": 2.1852399266194312e-07, |
|
"loss": 0.0325, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 0.027819665148854256, |
|
"learning_rate": 2.1275614947233624e-07, |
|
"loss": 0.0963, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 0.9416, |
|
"grad_norm": 1.7772072553634644, |
|
"learning_rate": 2.0706462917632676e-07, |
|
"loss": 0.8716, |
|
"step": 2354 |
|
}, |
|
{ |
|
"epoch": 0.9424, |
|
"grad_norm": 0.47964930534362793, |
|
"learning_rate": 2.014494761575314e-07, |
|
"loss": 0.0422, |
|
"step": 2356 |
|
}, |
|
{ |
|
"epoch": 0.9432, |
|
"grad_norm": 0.0034853648394346237, |
|
"learning_rate": 1.9591073420404338e-07, |
|
"loss": 0.0195, |
|
"step": 2358 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.03955305367708206, |
|
"learning_rate": 1.9044844650808468e-07, |
|
"loss": 0.0275, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.9448, |
|
"grad_norm": 0.5900484919548035, |
|
"learning_rate": 1.8506265566567095e-07, |
|
"loss": 0.032, |
|
"step": 2362 |
|
}, |
|
{ |
|
"epoch": 0.9456, |
|
"grad_norm": 0.006537347100675106, |
|
"learning_rate": 1.7975340367628269e-07, |
|
"loss": 0.0082, |
|
"step": 2364 |
|
}, |
|
{ |
|
"epoch": 0.9464, |
|
"grad_norm": 0.03216685727238655, |
|
"learning_rate": 1.7452073194253237e-07, |
|
"loss": 0.0099, |
|
"step": 2366 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 0.06808136403560638, |
|
"learning_rate": 1.6936468126984573e-07, |
|
"loss": 0.0111, |
|
"step": 2368 |
|
}, |
|
{ |
|
"epoch": 0.948, |
|
"grad_norm": 0.19587433338165283, |
|
"learning_rate": 1.6428529186614195e-07, |
|
"loss": 0.0222, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.9488, |
|
"grad_norm": 0.02673921547830105, |
|
"learning_rate": 1.5928260334151847e-07, |
|
"loss": 0.0362, |
|
"step": 2372 |
|
}, |
|
{ |
|
"epoch": 0.9496, |
|
"grad_norm": 0.8786045908927917, |
|
"learning_rate": 1.543566547079467e-07, |
|
"loss": 0.0973, |
|
"step": 2374 |
|
}, |
|
{ |
|
"epoch": 0.9504, |
|
"grad_norm": 0.16141577064990997, |
|
"learning_rate": 1.4950748437896235e-07, |
|
"loss": 0.0884, |
|
"step": 2376 |
|
}, |
|
{ |
|
"epoch": 0.9512, |
|
"grad_norm": 0.010312036611139774, |
|
"learning_rate": 1.4473513016937223e-07, |
|
"loss": 0.0103, |
|
"step": 2378 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 3.670624017715454, |
|
"learning_rate": 1.400396292949513e-07, |
|
"loss": 0.6124, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.9528, |
|
"grad_norm": 3.3416731357574463, |
|
"learning_rate": 1.3542101837215826e-07, |
|
"loss": 1.0542, |
|
"step": 2382 |
|
}, |
|
{ |
|
"epoch": 0.9536, |
|
"grad_norm": 1.813645839691162, |
|
"learning_rate": 1.308793334178493e-07, |
|
"loss": 0.8749, |
|
"step": 2384 |
|
}, |
|
{ |
|
"epoch": 0.9544, |
|
"grad_norm": 0.02037736214697361, |
|
"learning_rate": 1.26414609848996e-07, |
|
"loss": 0.0147, |
|
"step": 2386 |
|
}, |
|
{ |
|
"epoch": 0.9552, |
|
"grad_norm": 0.29435980319976807, |
|
"learning_rate": 1.2202688248241113e-07, |
|
"loss": 0.0375, |
|
"step": 2388 |
|
}, |
|
{ |
|
"epoch": 0.956, |
|
"grad_norm": 0.02400418370962143, |
|
"learning_rate": 1.1771618553447217e-07, |
|
"loss": 0.0272, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.9568, |
|
"grad_norm": 0.32467302680015564, |
|
"learning_rate": 1.134825526208605e-07, |
|
"loss": 0.039, |
|
"step": 2392 |
|
}, |
|
{ |
|
"epoch": 0.9576, |
|
"grad_norm": 0.04526238515973091, |
|
"learning_rate": 1.0932601675629595e-07, |
|
"loss": 0.0099, |
|
"step": 2394 |
|
}, |
|
{ |
|
"epoch": 0.9584, |
|
"grad_norm": 0.017281251028180122, |
|
"learning_rate": 1.052466103542793e-07, |
|
"loss": 0.0201, |
|
"step": 2396 |
|
}, |
|
{ |
|
"epoch": 0.9592, |
|
"grad_norm": 0.2014533132314682, |
|
"learning_rate": 1.0124436522684244e-07, |
|
"loss": 0.232, |
|
"step": 2398 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.08100484311580658, |
|
"learning_rate": 9.731931258429638e-08, |
|
"loss": 0.1188, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9608, |
|
"grad_norm": 0.03484244644641876, |
|
"learning_rate": 9.347148303499143e-08, |
|
"loss": 0.0144, |
|
"step": 2402 |
|
}, |
|
{ |
|
"epoch": 0.9616, |
|
"grad_norm": 0.1309373676776886, |
|
"learning_rate": 8.970090658507291e-08, |
|
"loss": 0.015, |
|
"step": 2404 |
|
}, |
|
{ |
|
"epoch": 0.9624, |
|
"grad_norm": 1.4030771255493164, |
|
"learning_rate": 8.600761263825475e-08, |
|
"loss": 0.5354, |
|
"step": 2406 |
|
}, |
|
{ |
|
"epoch": 0.9632, |
|
"grad_norm": 0.009812920354306698, |
|
"learning_rate": 8.239162999558403e-08, |
|
"loss": 0.0114, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 0.964, |
|
"grad_norm": 1.2562764883041382, |
|
"learning_rate": 7.885298685522235e-08, |
|
"loss": 1.0972, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.9648, |
|
"grad_norm": 0.012443533167243004, |
|
"learning_rate": 7.539171081221597e-08, |
|
"loss": 0.0077, |
|
"step": 2412 |
|
}, |
|
{ |
|
"epoch": 0.9656, |
|
"grad_norm": 0.011568997986614704, |
|
"learning_rate": 7.200782885829482e-08, |
|
"loss": 0.6151, |
|
"step": 2414 |
|
}, |
|
{ |
|
"epoch": 0.9664, |
|
"grad_norm": 0.09277435392141342, |
|
"learning_rate": 6.870136738164612e-08, |
|
"loss": 0.0186, |
|
"step": 2416 |
|
}, |
|
{ |
|
"epoch": 0.9672, |
|
"grad_norm": 0.1386393904685974, |
|
"learning_rate": 6.547235216672443e-08, |
|
"loss": 0.2091, |
|
"step": 2418 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 1.1379826068878174, |
|
"learning_rate": 6.232080839403631e-08, |
|
"loss": 0.2519, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.9688, |
|
"grad_norm": 0.1122945249080658, |
|
"learning_rate": 5.9246760639953824e-08, |
|
"loss": 0.0173, |
|
"step": 2422 |
|
}, |
|
{ |
|
"epoch": 0.9696, |
|
"grad_norm": 0.12855499982833862, |
|
"learning_rate": 5.625023287652021e-08, |
|
"loss": 0.2261, |
|
"step": 2424 |
|
}, |
|
{ |
|
"epoch": 0.9704, |
|
"grad_norm": 0.007712547667324543, |
|
"learning_rate": 5.3331248471258926e-08, |
|
"loss": 0.0049, |
|
"step": 2426 |
|
}, |
|
{ |
|
"epoch": 0.9712, |
|
"grad_norm": 0.033568158745765686, |
|
"learning_rate": 5.048983018699827e-08, |
|
"loss": 0.054, |
|
"step": 2428 |
|
}, |
|
{ |
|
"epoch": 0.972, |
|
"grad_norm": 0.03975361958146095, |
|
"learning_rate": 4.772600018168816e-08, |
|
"loss": 0.0088, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.9728, |
|
"grad_norm": 0.12293253093957901, |
|
"learning_rate": 4.503978000823028e-08, |
|
"loss": 0.0239, |
|
"step": 2432 |
|
}, |
|
{ |
|
"epoch": 0.9736, |
|
"grad_norm": 1.9904245138168335, |
|
"learning_rate": 4.2431190614309334e-08, |
|
"loss": 0.7805, |
|
"step": 2434 |
|
}, |
|
{ |
|
"epoch": 0.9744, |
|
"grad_norm": 0.08196962624788284, |
|
"learning_rate": 3.990025234222872e-08, |
|
"loss": 0.0215, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 0.9752, |
|
"grad_norm": 0.015654845163226128, |
|
"learning_rate": 3.7446984928753984e-08, |
|
"loss": 0.0054, |
|
"step": 2438 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.4907855987548828, |
|
"learning_rate": 3.50714075049563e-08, |
|
"loss": 0.0643, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.9768, |
|
"grad_norm": 0.11044461280107498, |
|
"learning_rate": 3.2773538596068134e-08, |
|
"loss": 0.017, |
|
"step": 2442 |
|
}, |
|
{ |
|
"epoch": 0.9776, |
|
"grad_norm": 0.12533003091812134, |
|
"learning_rate": 3.0553396121330015e-08, |
|
"loss": 0.8734, |
|
"step": 2444 |
|
}, |
|
{ |
|
"epoch": 0.9784, |
|
"grad_norm": 0.010446742177009583, |
|
"learning_rate": 2.8410997393860663e-08, |
|
"loss": 0.0066, |
|
"step": 2446 |
|
}, |
|
{ |
|
"epoch": 0.9792, |
|
"grad_norm": 2.9740476608276367, |
|
"learning_rate": 2.6346359120514863e-08, |
|
"loss": 0.56, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.15533113479614258, |
|
"learning_rate": 2.4359497401758026e-08, |
|
"loss": 0.0152, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.9808, |
|
"grad_norm": 0.42947232723236084, |
|
"learning_rate": 2.2450427731534052e-08, |
|
"loss": 0.1126, |
|
"step": 2452 |
|
}, |
|
{ |
|
"epoch": 0.9816, |
|
"grad_norm": 0.31456366181373596, |
|
"learning_rate": 2.061916499715544e-08, |
|
"loss": 0.0455, |
|
"step": 2454 |
|
}, |
|
{ |
|
"epoch": 0.9824, |
|
"grad_norm": 1.7186102867126465, |
|
"learning_rate": 1.886572347917337e-08, |
|
"loss": 0.1411, |
|
"step": 2456 |
|
}, |
|
{ |
|
"epoch": 0.9832, |
|
"grad_norm": 0.3335157632827759, |
|
"learning_rate": 1.7190116851280024e-08, |
|
"loss": 0.0393, |
|
"step": 2458 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 0.01097691897302866, |
|
"learning_rate": 1.5592358180189782e-08, |
|
"loss": 0.0299, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.9848, |
|
"grad_norm": 0.1632501780986786, |
|
"learning_rate": 1.4072459925548176e-08, |
|
"loss": 0.0287, |
|
"step": 2462 |
|
}, |
|
{ |
|
"epoch": 0.9856, |
|
"grad_norm": 0.02019256353378296, |
|
"learning_rate": 1.2630433939825326e-08, |
|
"loss": 0.0658, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 0.9864, |
|
"grad_norm": 2.6901440620422363, |
|
"learning_rate": 1.126629146822933e-08, |
|
"loss": 0.2091, |
|
"step": 2466 |
|
}, |
|
{ |
|
"epoch": 0.9872, |
|
"grad_norm": 0.05359053239226341, |
|
"learning_rate": 9.980043148619668e-09, |
|
"loss": 0.0161, |
|
"step": 2468 |
|
}, |
|
{ |
|
"epoch": 0.988, |
|
"grad_norm": 1.5825371742248535, |
|
"learning_rate": 8.771699011416169e-09, |
|
"loss": 1.7133, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.9888, |
|
"grad_norm": 0.02139631099998951, |
|
"learning_rate": 7.641268479531283e-09, |
|
"loss": 0.0111, |
|
"step": 2472 |
|
}, |
|
{ |
|
"epoch": 0.9896, |
|
"grad_norm": 0.06577183306217194, |
|
"learning_rate": 6.588760368287928e-09, |
|
"loss": 0.0126, |
|
"step": 2474 |
|
}, |
|
{ |
|
"epoch": 0.9904, |
|
"grad_norm": 0.04376514256000519, |
|
"learning_rate": 5.614182885357311e-09, |
|
"loss": 0.0159, |
|
"step": 2476 |
|
}, |
|
{ |
|
"epoch": 0.9912, |
|
"grad_norm": 0.02068435214459896, |
|
"learning_rate": 4.717543630688992e-09, |
|
"loss": 0.0087, |
|
"step": 2478 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 2.170161724090576, |
|
"learning_rate": 3.898849596456477e-09, |
|
"loss": 0.2086, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.9928, |
|
"grad_norm": 1.0947741270065308, |
|
"learning_rate": 3.1581071670006013e-09, |
|
"loss": 0.0886, |
|
"step": 2482 |
|
}, |
|
{ |
|
"epoch": 0.9936, |
|
"grad_norm": 0.08858716487884521, |
|
"learning_rate": 2.495322118778454e-09, |
|
"loss": 0.02, |
|
"step": 2484 |
|
}, |
|
{ |
|
"epoch": 0.9944, |
|
"grad_norm": 0.011349241249263287, |
|
"learning_rate": 1.910499620322304e-09, |
|
"loss": 0.0094, |
|
"step": 2486 |
|
}, |
|
{ |
|
"epoch": 0.9952, |
|
"grad_norm": 1.9726616144180298, |
|
"learning_rate": 1.4036442321962995e-09, |
|
"loss": 0.7641, |
|
"step": 2488 |
|
}, |
|
{ |
|
"epoch": 0.996, |
|
"grad_norm": 0.020813768729567528, |
|
"learning_rate": 9.74759906957612e-10, |
|
"loss": 0.0056, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.9968, |
|
"grad_norm": 0.07518152892589569, |
|
"learning_rate": 6.238499891353389e-10, |
|
"loss": 0.0572, |
|
"step": 2492 |
|
}, |
|
{ |
|
"epoch": 0.9976, |
|
"grad_norm": 0.12918703258037567, |
|
"learning_rate": 3.509172151938689e-10, |
|
"loss": 0.0209, |
|
"step": 2494 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 2.08217453956604, |
|
"learning_rate": 1.559637135173375e-10, |
|
"loss": 1.673, |
|
"step": 2496 |
|
}, |
|
{ |
|
"epoch": 0.9992, |
|
"grad_norm": 0.01897619105875492, |
|
"learning_rate": 3.899100439408443e-11, |
|
"loss": 0.0482, |
|
"step": 2498 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.07024825364351273, |
|
"learning_rate": 0.0, |
|
"loss": 0.1147, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2500, |
|
"total_flos": 5.2791220503996006e+17, |
|
"train_loss": 0.34746392381768676, |
|
"train_runtime": 41909.1346, |
|
"train_samples_per_second": 1.909, |
|
"train_steps_per_second": 0.06 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 2500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": {}, |
|
"total_flos": 5.2791220503996006e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|