{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.992,
  "eval_steps": 500,
  "global_step": 390,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0128, "grad_norm": 6.512522716818102, "learning_rate": 1.0256410256410257e-06, "loss": 1.0696, "step": 1},
    {"epoch": 0.0256, "grad_norm": 6.564160181602456, "learning_rate": 2.0512820512820513e-06, "loss": 1.0564, "step": 2},
    {"epoch": 0.0384, "grad_norm": 6.593596024999264, "learning_rate": 3.0769230769230774e-06, "loss": 1.0596, "step": 3},
    {"epoch": 0.0512, "grad_norm": 5.985081448984162, "learning_rate": 4.102564102564103e-06, "loss": 1.0466, "step": 4},
    {"epoch": 0.064, "grad_norm": 4.610526434764138, "learning_rate": 5.128205128205128e-06, "loss": 1.0029, "step": 5},
    {"epoch": 0.0768, "grad_norm": 2.9674251867637347, "learning_rate": 6.153846153846155e-06, "loss": 0.9675, "step": 6},
    {"epoch": 0.0896, "grad_norm": 2.6183167426455105, "learning_rate": 7.17948717948718e-06, "loss": 0.9643, "step": 7},
    {"epoch": 0.1024, "grad_norm": 4.139986932762751, "learning_rate": 8.205128205128205e-06, "loss": 0.954, "step": 8},
    {"epoch": 0.1152, "grad_norm": 4.138386824423963, "learning_rate": 9.230769230769232e-06, "loss": 0.9534, "step": 9},
    {"epoch": 0.128, "grad_norm": 3.998663659019224, "learning_rate": 1.0256410256410256e-05, "loss": 0.8995, "step": 10},
    {"epoch": 0.1408, "grad_norm": 3.601600820871555, "learning_rate": 1.1282051282051283e-05, "loss": 0.8841, "step": 11},
    {"epoch": 0.1536, "grad_norm": 2.4878243820396895, "learning_rate": 1.230769230769231e-05, "loss": 0.8629, "step": 12},
    {"epoch": 0.1664, "grad_norm": 1.6961514903991608, "learning_rate": 1.3333333333333333e-05, "loss": 0.8292, "step": 13},
    {"epoch": 0.1792, "grad_norm": 1.7117706666141828, "learning_rate": 1.435897435897436e-05, "loss": 0.8133, "step": 14},
    {"epoch": 0.192, "grad_norm": 1.4244399364484353, "learning_rate": 1.5384615384615387e-05, "loss": 0.8074, "step": 15},
    {"epoch": 0.2048, "grad_norm": 1.1830469833483066, "learning_rate": 1.641025641025641e-05, "loss": 0.7743, "step": 16},
    {"epoch": 0.2176, "grad_norm": 1.0960043478039359, "learning_rate": 1.7435897435897438e-05, "loss": 0.7738, "step": 17},
    {"epoch": 0.2304, "grad_norm": 0.925926809490321, "learning_rate": 1.8461538461538465e-05, "loss": 0.758, "step": 18},
    {"epoch": 0.2432, "grad_norm": 0.9063053282979348, "learning_rate": 1.9487179487179488e-05, "loss": 0.7584, "step": 19},
    {"epoch": 0.256, "grad_norm": 0.9596389662207955, "learning_rate": 2.0512820512820512e-05, "loss": 0.7314, "step": 20},
    {"epoch": 0.2688, "grad_norm": 0.9399749494125175, "learning_rate": 2.153846153846154e-05, "loss": 0.7422, "step": 21},
    {"epoch": 0.2816, "grad_norm": 0.814900129926128, "learning_rate": 2.2564102564102566e-05, "loss": 0.7038, "step": 22},
    {"epoch": 0.2944, "grad_norm": 1.006404352335733, "learning_rate": 2.3589743589743593e-05, "loss": 0.7135, "step": 23},
    {"epoch": 0.3072, "grad_norm": 0.8831928954701753, "learning_rate": 2.461538461538462e-05, "loss": 0.7071, "step": 24},
    {"epoch": 0.32, "grad_norm": 0.8702140393430762, "learning_rate": 2.5641025641025646e-05, "loss": 0.6963, "step": 25},
    {"epoch": 0.3328, "grad_norm": 0.7905059479264891, "learning_rate": 2.6666666666666667e-05, "loss": 0.7009, "step": 26},
    {"epoch": 0.3456, "grad_norm": 0.8222320920404764, "learning_rate": 2.7692307692307694e-05, "loss": 0.7051, "step": 27},
    {"epoch": 0.3584, "grad_norm": 0.881523147841335, "learning_rate": 2.871794871794872e-05, "loss": 0.702, "step": 28},
    {"epoch": 0.3712, "grad_norm": 0.683235818479339, "learning_rate": 2.9743589743589747e-05, "loss": 0.6728, "step": 29},
    {"epoch": 0.384, "grad_norm": 1.0916899983703134, "learning_rate": 3.0769230769230774e-05, "loss": 0.6754, "step": 30},
    {"epoch": 0.3968, "grad_norm": 0.9465110514408511, "learning_rate": 3.1794871794871795e-05, "loss": 0.6673, "step": 31},
    {"epoch": 0.4096, "grad_norm": 0.9155354190132595, "learning_rate": 3.282051282051282e-05, "loss": 0.7061, "step": 32},
    {"epoch": 0.4224, "grad_norm": 0.7509262620244194, "learning_rate": 3.384615384615385e-05, "loss": 0.6809, "step": 33},
    {"epoch": 0.4352, "grad_norm": 1.0495172593977244, "learning_rate": 3.4871794871794875e-05, "loss": 0.6794, "step": 34},
    {"epoch": 0.448, "grad_norm": 1.1286511421752445, "learning_rate": 3.58974358974359e-05, "loss": 0.681, "step": 35},
    {"epoch": 0.4608, "grad_norm": 1.2511380187362742, "learning_rate": 3.692307692307693e-05, "loss": 0.6802, "step": 36},
    {"epoch": 0.4736, "grad_norm": 0.9748855385002707, "learning_rate": 3.794871794871795e-05, "loss": 0.6828, "step": 37},
    {"epoch": 0.4864, "grad_norm": 0.8156621108069366, "learning_rate": 3.8974358974358976e-05, "loss": 0.6633, "step": 38},
    {"epoch": 0.4992, "grad_norm": 0.9991235641637006, "learning_rate": 4e-05, "loss": 0.6751, "step": 39},
    {"epoch": 0.512, "grad_norm": 1.2749953667768767, "learning_rate": 3.9999198907597046e-05, "loss": 0.6686, "step": 40},
    {"epoch": 0.5248, "grad_norm": 0.9412778755551462, "learning_rate": 3.9996795694563096e-05, "loss": 0.6687, "step": 41},
    {"epoch": 0.5376, "grad_norm": 1.0327337726864538, "learning_rate": 3.999279055341771e-05, "loss": 0.671, "step": 42},
    {"epoch": 0.5504, "grad_norm": 1.3555098275902229, "learning_rate": 3.998718380500971e-05, "loss": 0.677, "step": 43},
    {"epoch": 0.5632, "grad_norm": 1.0142701185284615, "learning_rate": 3.997997589849145e-05, "loss": 0.6767, "step": 44},
    {"epoch": 0.576, "grad_norm": 1.3579183101745855, "learning_rate": 3.9971167411282835e-05, "loss": 0.6708, "step": 45},
    {"epoch": 0.5888, "grad_norm": 0.973784803214183, "learning_rate": 3.99607590490251e-05, "loss": 0.6578, "step": 46},
    {"epoch": 0.6016, "grad_norm": 1.3153252930699182, "learning_rate": 3.9948751645524235e-05, "loss": 0.6557, "step": 47},
    {"epoch": 0.6144, "grad_norm": 0.8522883488029417, "learning_rate": 3.9935146162684206e-05, "loss": 0.6478, "step": 48},
    {"epoch": 0.6272, "grad_norm": 1.4392745059653276, "learning_rate": 3.9919943690429906e-05, "loss": 0.6475, "step": 49},
    {"epoch": 0.64, "grad_norm": 0.947603378372481, "learning_rate": 3.9903145446619837e-05, "loss": 0.6675, "step": 50},
    {"epoch": 0.6528, "grad_norm": 1.104626066310503, "learning_rate": 3.9884752776948564e-05, "loss": 0.6453, "step": 51},
    {"epoch": 0.6656, "grad_norm": 0.8481175442606228, "learning_rate": 3.9864767154838864e-05, "loss": 0.6509, "step": 52},
    {"epoch": 0.6784, "grad_norm": 1.0218965112916476, "learning_rate": 3.9843190181323744e-05, "loss": 0.6526, "step": 53},
    {"epoch": 0.6912, "grad_norm": 0.9434170277879759, "learning_rate": 3.982002358491817e-05, "loss": 0.6421, "step": 54},
    {"epoch": 0.704, "grad_norm": 0.8406699901917485, "learning_rate": 3.979526922148058e-05, "loss": 0.6424, "step": 55},
    {"epoch": 0.7168, "grad_norm": 0.9026206554782883, "learning_rate": 3.9768929074064206e-05, "loss": 0.6316, "step": 56},
    {"epoch": 0.7296, "grad_norm": 1.033689588644658, "learning_rate": 3.9741005252758255e-05, "loss": 0.6603, "step": 57},
    {"epoch": 0.7424, "grad_norm": 0.791725840028453, "learning_rate": 3.971149999451886e-05, "loss": 0.629, "step": 58},
    {"epoch": 0.7552, "grad_norm": 0.9294092788143803, "learning_rate": 3.9680415662989806e-05, "loss": 0.6519, "step": 59},
    {"epoch": 0.768, "grad_norm": 1389.7577349506907, "learning_rate": 3.9647754748313294e-05, "loss": 0.8286, "step": 60},
    {"epoch": 0.7808, "grad_norm": 287.84776514709336, "learning_rate": 3.96135198669304e-05, "loss": 0.7328, "step": 61},
    {"epoch": 0.7936, "grad_norm": 2.60146678427347, "learning_rate": 3.957771376137144e-05, "loss": 0.6583, "step": 62},
    {"epoch": 0.8064, "grad_norm": 1.8859944095922938, "learning_rate": 3.954033930003634e-05, "loss": 0.6417, "step": 63},
    {"epoch": 0.8192, "grad_norm": 1.5315662460399988, "learning_rate": 3.9501399476964806e-05, "loss": 0.6525, "step": 64},
    {"epoch": 0.832, "grad_norm": 1.184843434264107, "learning_rate": 3.946089741159648e-05, "loss": 0.6433, "step": 65},
    {"epoch": 0.8448, "grad_norm": 1.4893895377369322, "learning_rate": 3.9418836348521045e-05, "loss": 0.6654, "step": 66},
    {"epoch": 0.8576, "grad_norm": 0.8748298486341183, "learning_rate": 3.937521965721831e-05, "loss": 0.6284, "step": 67},
    {"epoch": 0.8704, "grad_norm": 1.3699663923970367, "learning_rate": 3.933005083178828e-05, "loss": 0.6406, "step": 68},
    {"epoch": 0.8832, "grad_norm": 0.9289499371200274, "learning_rate": 3.928333349067125e-05, "loss": 0.6276, "step": 69},
    {"epoch": 0.896, "grad_norm": 1.3844222693101031, "learning_rate": 3.923507137635792e-05, "loss": 0.6503, "step": 70},
    {"epoch": 0.9088, "grad_norm": 1.0330742305379728, "learning_rate": 3.9185268355089606e-05, "loss": 0.6254, "step": 71},
    {"epoch": 0.9216, "grad_norm": 1.3050581038734472, "learning_rate": 3.913392841654851e-05, "loss": 0.6415, "step": 72},
    {"epoch": 0.9344, "grad_norm": 0.9757426728868531, "learning_rate": 3.9081055673538093e-05, "loss": 0.6362, "step": 73},
    {"epoch": 0.9472, "grad_norm": 1.3538131680621317, "learning_rate": 3.902665436165364e-05, "loss": 0.6166, "step": 74},
    {"epoch": 0.96, "grad_norm": 1.1431225945777896, "learning_rate": 3.897072883894291e-05, "loss": 0.6133, "step": 75},
    {"epoch": 0.9728, "grad_norm": 1.2079492751237144, "learning_rate": 3.8913283585557054e-05, "loss": 0.631, "step": 76},
    {"epoch": 0.9856, "grad_norm": 1.1441761825751906, "learning_rate": 3.885432320339167e-05, "loss": 0.6193, "step": 77},
    {"epoch": 0.9984, "grad_norm": 0.9263058024622803, "learning_rate": 3.879385241571817e-05, "loss": 0.6161, "step": 78},
    {"epoch": 1.0112, "grad_norm": 1.1770323739591833, "learning_rate": 3.873187606680543e-05, "loss": 0.5719, "step": 79},
    {"epoch": 1.024, "grad_norm": 0.9821440975441464, "learning_rate": 3.866839912153168e-05, "loss": 0.569, "step": 80},
    {"epoch": 1.0368, "grad_norm": 0.8823439553609027, "learning_rate": 3.860342666498677e-05, "loss": 0.5697, "step": 81},
    {"epoch": 1.0496, "grad_norm": 0.9112762210935866, "learning_rate": 3.853696390206484e-05, "loss": 0.5594, "step": 82},
    {"epoch": 1.0624, "grad_norm": 0.8518848011570124, "learning_rate": 3.846901615704734e-05, "loss": 0.5574, "step": 83},
    {"epoch": 1.0752, "grad_norm": 0.8512911490008898, "learning_rate": 3.839958887317649e-05, "loss": 0.556, "step": 84},
    {"epoch": 1.088, "grad_norm": 0.6560260951817384, "learning_rate": 3.832868761221926e-05, "loss": 0.5708, "step": 85},
    {"epoch": 1.1008, "grad_norm": 0.8464326366429381, "learning_rate": 3.825631805402182e-05, "loss": 0.5669, "step": 86},
    {"epoch": 1.1136, "grad_norm": 0.5781958051581789, "learning_rate": 3.818248599605448e-05, "loss": 0.548, "step": 87},
    {"epoch": 1.1264, "grad_norm": 0.7538860608432648, "learning_rate": 3.810719735294731e-05, "loss": 0.567, "step": 88},
    {"epoch": 1.1392, "grad_norm": 0.7236013322656013, "learning_rate": 3.8030458156016326e-05, "loss": 0.56, "step": 89},
    {"epoch": 1.152, "grad_norm": 0.6456344742128692, "learning_rate": 3.795227455278029e-05, "loss": 0.5782, "step": 90},
    {"epoch": 1.1648, "grad_norm": 0.5810578790280398, "learning_rate": 3.787265280646825e-05, "loss": 0.5545, "step": 91},
    {"epoch": 1.1776, "grad_norm": 0.6284996703688887, "learning_rate": 3.7791599295517825e-05, "loss": 0.5553, "step": 92},
    {"epoch": 1.1904, "grad_norm": 0.539844217783587, "learning_rate": 3.7709120513064196e-05, "loss": 0.5714, "step": 93},
    {"epoch": 1.2032, "grad_norm": 0.5806615507593749, "learning_rate": 3.762522306641998e-05, "loss": 0.5488, "step": 94},
    {"epoch": 1.216, "grad_norm": 0.4660908011302106, "learning_rate": 3.7539913676545874e-05, "loss": 0.573, "step": 95},
    {"epoch": 1.2288000000000001, "grad_norm": 0.5423768176914702, "learning_rate": 3.745319917751229e-05, "loss": 0.5652, "step": 96},
    {"epoch": 1.2416, "grad_norm": 0.5298302045696609, "learning_rate": 3.736508651595188e-05, "loss": 0.5682, "step": 97},
    {"epoch": 1.2544, "grad_norm": 0.4864740532557761, "learning_rate": 3.727558275050301e-05, "loss": 0.554, "step": 98},
    {"epoch": 1.2671999999999999, "grad_norm": 0.4938140719050684, "learning_rate": 3.718469505124434e-05, "loss": 0.5696, "step": 99},
    {"epoch": 1.28, "grad_norm": 0.566671536862956, "learning_rate": 3.709243069912041e-05, "loss": 0.5732, "step": 100},
    {"epoch": 1.2928, "grad_norm": 0.6612312551578431, "learning_rate": 3.699879708535838e-05, "loss": 0.5531, "step": 101},
    {"epoch": 1.3056, "grad_norm": 0.5778627367111475, "learning_rate": 3.69038017108759e-05, "loss": 0.5408, "step": 102},
    {"epoch": 1.3184, "grad_norm": 0.5757871986128803, "learning_rate": 3.680745218568026e-05, "loss": 0.5665, "step": 103},
    {"epoch": 1.3312, "grad_norm": 0.5585849057932734, "learning_rate": 3.6709756228258735e-05, "loss": 0.5567, "step": 104},
    {"epoch": 1.3439999999999999, "grad_norm": 0.6542173505624117, "learning_rate": 3.6610721664960236e-05, "loss": 0.5556, "step": 105},
    {"epoch": 1.3568, "grad_norm": 0.4569668136989876, "learning_rate": 3.65103564293684e-05, "loss": 0.5421, "step": 106},
    {"epoch": 1.3696, "grad_norm": 0.6529450991504214, "learning_rate": 3.640866856166601e-05, "loss": 0.55, "step": 107},
    {"epoch": 1.3824, "grad_norm": 0.6678394214970926, "learning_rate": 3.6305666207990886e-05, "loss": 0.5573, "step": 108},
    {"epoch": 1.3952, "grad_norm": 0.6734626909404589, "learning_rate": 3.6201357619783336e-05, "loss": 0.5655, "step": 109},
    {"epoch": 1.408, "grad_norm": 0.8400225821815201, "learning_rate": 3.609575115312511e-05, "loss": 0.5633, "step": 110},
    {"epoch": 1.4208, "grad_norm": 0.5303419096191705, "learning_rate": 3.598885526807003e-05, "loss": 0.5478, "step": 111},
    {"epoch": 1.4336, "grad_norm": 0.6614344684252336, "learning_rate": 3.5880678527966224e-05, "loss": 0.5455, "step": 112},
    {"epoch": 1.4464000000000001, "grad_norm": 0.5472102418789047, "learning_rate": 3.577122959877017e-05, "loss": 0.5568, "step": 113},
    {"epoch": 1.4592, "grad_norm": 0.5180083498261117, "learning_rate": 3.566051724835245e-05, "loss": 0.5422, "step": 114},
    {"epoch": 1.472, "grad_norm": 0.5697847255506858, "learning_rate": 3.554855034579532e-05, "loss": 0.5712, "step": 115},
    {"epoch": 1.4848, "grad_norm": 0.6040964974688277, "learning_rate": 3.5435337860682304e-05, "loss": 0.5487, "step": 116},
    {"epoch": 1.4976, "grad_norm": 0.4742521971792901, "learning_rate": 3.532088886237956e-05, "loss": 0.566, "step": 117},
    {"epoch": 1.5104, "grad_norm": 0.6819923342225018, "learning_rate": 3.520521251930941e-05, "loss": 0.5404, "step": 118},
    {"epoch": 1.5232, "grad_norm": 0.4722306571762546, "learning_rate": 3.5088318098215805e-05, "loss": 0.5568, "step": 119},
    {"epoch": 1.536, "grad_norm": 0.45741854977275803, "learning_rate": 3.497021496342203e-05, "loss": 0.5596, "step": 120},
    {"epoch": 1.5488, "grad_norm": 0.5192906664033172, "learning_rate": 3.485091257608047e-05, "loss": 0.5526, "step": 121},
    {"epoch": 1.5615999999999999, "grad_norm": 0.4467233339545157, "learning_rate": 3.473042049341474e-05, "loss": 0.5627, "step": 122},
    {"epoch": 1.5744, "grad_norm": 0.5277534067816066, "learning_rate": 3.4608748367954064e-05, "loss": 0.5657, "step": 123},
    {"epoch": 1.5872000000000002, "grad_norm": 0.4448021374192618, "learning_rate": 3.4485905946759965e-05, "loss": 0.5679, "step": 124},
    {"epoch": 1.6, "grad_norm": 0.48955231560697154, "learning_rate": 3.4361903070645484e-05, "loss": 0.5461, "step": 125},
    {"epoch": 1.6128, "grad_norm": 0.5491127185002994, "learning_rate": 3.423674967338681e-05, "loss": 0.5252, "step": 126},
    {"epoch": 1.6256, "grad_norm": 0.539004560507589, "learning_rate": 3.411045578092754e-05, "loss": 0.559, "step": 127},
    {"epoch": 1.6383999999999999, "grad_norm": 0.4789128117584534, "learning_rate": 3.398303151057543e-05, "loss": 0.5297, "step": 128},
    {"epoch": 1.6512, "grad_norm": 0.4551171420622061, "learning_rate": 3.385448707019199e-05, "loss": 0.5452, "step": 129},
    {"epoch": 1.6640000000000001, "grad_norm": 0.4062320230041345, "learning_rate": 3.372483275737468e-05, "loss": 0.5443, "step": 130},
    {"epoch": 1.6768, "grad_norm": 0.5011568209714911, "learning_rate": 3.359407895863199e-05, "loss": 0.5596, "step": 131},
    {"epoch": 1.6896, "grad_norm": 0.6149446146414052, "learning_rate": 3.34622361485514e-05, "loss": 0.5551, "step": 132},
    {"epoch": 1.7024, "grad_norm": 0.6047083533465212, "learning_rate": 3.332931488896029e-05, "loss": 0.547, "step": 133},
    {"epoch": 1.7151999999999998, "grad_norm": 0.46632700703043184, "learning_rate": 3.319532582807977e-05, "loss": 0.5464, "step": 134},
    {"epoch": 1.728, "grad_norm": 0.5601214392507189, "learning_rate": 3.30602796996717e-05, "loss": 0.5508, "step": 135},
    {"epoch": 1.7408000000000001, "grad_norm": 0.6059686022001739, "learning_rate": 3.2924187322178865e-05, "loss": 0.5472, "step": 136},
    {"epoch": 1.7536, "grad_norm": 0.4227869398475831, "learning_rate": 3.278705959785821e-05, "loss": 0.562, "step": 137},
    {"epoch": 1.7664, "grad_norm": 0.4665187980858091, "learning_rate": 3.2648907511907544e-05, "loss": 0.5407, "step": 138},
    {"epoch": 1.7792, "grad_norm": 0.6066621927137116, "learning_rate": 3.250974213158555e-05, "loss": 0.5419, "step": 139},
    {"epoch": 1.792, "grad_norm": 0.45539013593733635, "learning_rate": 3.23695746053251e-05, "loss": 0.5594, "step": 140},
    {"epoch": 1.8048, "grad_norm": 0.4598890146802132, "learning_rate": 3.222841616184025e-05, "loss": 0.5456, "step": 141},
    {"epoch": 1.8176, "grad_norm": 0.46840313280462353, "learning_rate": 3.208627810922665e-05, "loss": 0.5423, "step": 142},
    {"epoch": 1.8304, "grad_norm": 0.47793633660719126, "learning_rate": 3.194317183405573e-05, "loss": 0.5611, "step": 143},
    {"epoch": 1.8432, "grad_norm": 0.4109635129590553, "learning_rate": 3.1799108800462466e-05, "loss": 0.5596, "step": 144},
    {"epoch": 1.8559999999999999, "grad_norm": 0.4757780793309859, "learning_rate": 3.1654100549227024e-05, "loss": 0.548, "step": 145},
    {"epoch": 1.8688, "grad_norm": 0.4667850045505799, "learning_rate": 3.1508158696850275e-05, "loss": 0.5599, "step": 146},
    {"epoch": 1.8816000000000002, "grad_norm": 0.40800983146319364, "learning_rate": 3.136129493462312e-05, "loss": 0.5455, "step": 147},
    {"epoch": 1.8944, "grad_norm": 0.495724755830386, "learning_rate": 3.121352102768998e-05, "loss": 0.5506, "step": 148},
    {"epoch": 1.9072, "grad_norm": 0.4387238934256952, "learning_rate": 3.106484881410628e-05, "loss": 0.5658, "step": 149},
    {"epoch": 1.92, "grad_norm": 0.4777624357651288, "learning_rate": 3.091529020389009e-05, "loss": 0.5597, "step": 150},
    {"epoch": 1.9327999999999999, "grad_norm": 0.4795073239219758, "learning_rate": 3.076485717806808e-05, "loss": 0.5462, "step": 151},
    {"epoch": 1.9456, "grad_norm": 0.425612185278673, "learning_rate": 3.061356178771564e-05, "loss": 0.5501, "step": 152},
    {"epoch": 1.9584000000000001, "grad_norm": 0.41777498513296457, "learning_rate": 3.0461416152991555e-05, "loss": 0.5544, "step": 153},
    {"epoch": 1.9712, "grad_norm": 0.3978988350859362, "learning_rate": 3.0308432462167045e-05, "loss": 0.5522, "step": 154},
    {"epoch": 1.984, "grad_norm": 0.49169460352223887, "learning_rate": 3.015462297064936e-05, "loss": 0.5368, "step": 155},
    {"epoch": 1.9968, "grad_norm": 0.5201109396428173, "learning_rate": 3.0000000000000004e-05, "loss": 0.5557, "step": 156},
    {"epoch": 2.0096, "grad_norm": 0.5223620357395014, "learning_rate": 2.98445759369477e-05, "loss": 0.4905, "step": 157},
    {"epoch": 2.0224, "grad_norm": 0.42881429340953764, "learning_rate": 2.9688363232396056e-05, "loss": 0.4701, "step": 158},
    {"epoch": 2.0352, "grad_norm": 0.6464022269396524, "learning_rate": 2.9531374400426158e-05, "loss": 0.4682, "step": 159},
    {"epoch": 2.048, "grad_norm": 0.6475226688167106, "learning_rate": 2.9373622017294075e-05, "loss": 0.4674, "step": 160},
    {"epoch": 2.0608, "grad_norm": 0.612117266008549, "learning_rate": 2.9215118720423375e-05, "loss": 0.4709, "step": 161},
    {"epoch": 2.0736, "grad_norm": 0.5011223881087815, "learning_rate": 2.9055877207392752e-05, "loss": 0.4664, "step": 162},
    {"epoch": 2.0864, "grad_norm": 0.6467725169112344, "learning_rate": 2.8895910234918828e-05, "loss": 0.492, "step": 163},
    {"epoch": 2.0992, "grad_norm": 0.5939346659836927, "learning_rate": 2.873523061783426e-05, "loss": 0.472, "step": 164},
    {"epoch": 2.112, "grad_norm": 0.5027071095650045, "learning_rate": 2.8573851228061084e-05, "loss": 0.4959, "step": 165},
    {"epoch": 2.1248, "grad_norm": 0.6048110893909238, "learning_rate": 2.8411784993579633e-05, "loss": 0.4611, "step": 166},
    {"epoch": 2.1376, "grad_norm": 0.48691381474432466, "learning_rate": 2.8249044897392814e-05, "loss": 0.4677, "step": 167},
    {"epoch": 2.1504, "grad_norm": 0.5630771489986927, "learning_rate": 2.80856439764861e-05, "loss": 0.448, "step": 168},
    {"epoch": 2.1632, "grad_norm": 0.5150493831886828, "learning_rate": 2.792159532078314e-05, "loss": 0.4596, "step": 169},
    {"epoch": 2.176, "grad_norm": 0.4765000326678474, "learning_rate": 2.77569120720971e-05, "loss": 0.4661, "step": 170},
    {"epoch": 2.1888, "grad_norm": 0.4905566091477221, "learning_rate": 2.7591607423077932e-05, "loss": 0.4663, "step": 171},
    {"epoch": 2.2016, "grad_norm": 0.42542651510717766, "learning_rate": 2.7425694616155474e-05, "loss": 0.4563, "step": 172},
    {"epoch": 2.2144, "grad_norm": 0.4012248742478175, "learning_rate": 2.7259186942478656e-05, "loss": 0.4439, "step": 173},
    {"epoch": 2.2272, "grad_norm": 0.5409433225896136, "learning_rate": 2.7092097740850712e-05, "loss": 0.4557, "step": 174},
    {"epoch": 2.24, "grad_norm": 0.4015271073114425, "learning_rate": 2.692444039666066e-05, "loss": 0.4518, "step": 175},
    {"epoch": 2.2528, "grad_norm": 0.4721981299304766, "learning_rate": 2.6756228340810946e-05, "loss": 0.455, "step": 176},
    {"epoch": 2.2656, "grad_norm": 0.43733176558443665, "learning_rate": 2.6587475048641596e-05, "loss": 0.4504, "step": 177},
    {"epoch": 2.2784, "grad_norm": 0.4130308223543829, "learning_rate": 2.6418194038850634e-05, "loss": 0.4716, "step": 178},
    {"epoch": 2.2912, "grad_norm": 0.4444052788560941, "learning_rate": 2.624839887241115e-05, "loss": 0.4798, "step": 179},
    {"epoch": 2.304, "grad_norm": 0.4122329176970373, "learning_rate": 2.607810315148494e-05, "loss": 0.4577, "step": 180},
    {"epoch": 2.3168, "grad_norm": 0.38139886405102097, "learning_rate": 2.5907320518332827e-05, "loss": 0.4691, "step": 181},
    {"epoch": 2.3296, "grad_norm": 0.42493686571960576, "learning_rate": 2.5736064654221808e-05, "loss": 0.4716, "step": 182},
    {"epoch": 2.3424, "grad_norm": 0.3950827300126743, "learning_rate": 2.5564349278329056e-05, "loss": 0.4672, "step": 183},
    {"epoch": 2.3552, "grad_norm": 0.44805829499136113, "learning_rate": 2.539218814664288e-05, "loss": 0.474, "step": 184},
    {"epoch": 2.368, "grad_norm": 0.4369143120370755, "learning_rate": 2.521959505086075e-05, "loss": 0.4934, "step": 185},
    {"epoch": 2.3808, "grad_norm": 0.44502476357356874, "learning_rate": 2.5046583817284437e-05, "loss": 0.4488, "step": 186},
    {"epoch": 2.3936, "grad_norm": 0.4914622861856519, "learning_rate": 2.487316830571244e-05, "loss": 0.4719, "step": 187},
    {"epoch": 2.4064, "grad_norm": 0.49385858341638383, "learning_rate": 2.4699362408329646e-05, "loss": 0.4701, "step": 188},
    {"epoch": 2.4192, "grad_norm": 0.559542952455097, "learning_rate": 2.4525180048594452e-05, "loss": 0.4688, "step": 189},
    {"epoch": 2.432, "grad_norm": 0.4974545523229099, "learning_rate": 2.435063518012335e-05, "loss": 0.4664, "step": 190},
    {"epoch": 2.4448, "grad_norm": 0.5021566761179318, "learning_rate": 2.4175741785573177e-05, "loss": 0.4636, "step": 191},
    {"epoch": 2.4576000000000002, "grad_norm": 0.43199282737226535, "learning_rate": 2.4000513875520892e-05, "loss": 0.4485, "step": 192},
    {"epoch": 2.4704, "grad_norm": 0.4100491369475057, "learning_rate": 2.3824965487341247e-05, "loss": 0.4579, "step": 193},
    {"epoch": 2.4832, "grad_norm": 0.4555048740503307, "learning_rate": 2.3649110684082258e-05, "loss": 0.4641, "step": 194},
    {"epoch": 2.496, "grad_norm": 0.4295774465611716, "learning_rate": 2.3472963553338614e-05, "loss": 0.4454, "step": 195},
    {"epoch": 2.5088, "grad_norm": 0.4785651412027942, "learning_rate": 2.3296538206123134e-05, "loss": 0.4388, "step": 196},
    {"epoch": 2.5216, "grad_norm": 0.42227580702493306, "learning_rate": 2.311984877573636e-05, "loss": 0.4739, "step": 197},
    {"epoch": 2.5343999999999998, "grad_norm": 0.44233593861762177, "learning_rate": 2.2942909416634326e-05, "loss": 0.4664, "step": 198},
    {"epoch": 2.5472, "grad_norm": 0.4150214176585268, "learning_rate": 2.2765734303294666e-05, "loss": 0.4892, "step": 199},
    {"epoch": 2.56, "grad_norm": 0.4988147678655741, "learning_rate": 2.2588337629081107e-05, "loss": 0.4642, "step": 200},
    {"epoch": 2.5728, "grad_norm": 0.4674183051033628, "learning_rate": 2.2410733605106462e-05, "loss": 0.4787, "step": 201},
    {"epoch": 2.5856, "grad_norm": 0.4529420246577216, "learning_rate": 2.2232936459094158e-05, "loss": 0.4903, "step": 202},
    {"epoch": 2.5984, "grad_norm": 0.36156313514672506, "learning_rate": 2.205496043423849e-05, "loss": 0.4672, "step": 203},
    {"epoch": 2.6112, "grad_norm": 0.43331599376613517, "learning_rate": 2.1876819788063586e-05, "loss": 0.4769, "step": 204},
    {"epoch": 2.624, "grad_norm": 0.34250275398477475, "learning_rate": 2.16985287912813e-05, "loss": 0.4793, "step": 205},
    {"epoch": 2.6368, "grad_norm": 0.4153769866085747, "learning_rate": 2.1520101726647922e-05, "loss": 0.4749, "step": 206},
    {"epoch": 2.6496, "grad_norm": 0.3640796467211484, "learning_rate": 2.1341552887820048e-05, "loss": 0.4641, "step": 207},
    {"epoch": 2.6624, "grad_norm": 0.4605091312839679, "learning_rate": 2.1162896578209517e-05, "loss": 0.4588, "step": 208},
    {"epoch": 2.6752000000000002, "grad_norm": 0.3841760161085162, "learning_rate": 2.0984147109837564e-05, "loss": 0.4569, "step": 209},
    {"epoch": 2.6879999999999997, "grad_norm": 0.4297762401237527, "learning_rate": 2.0805318802188307e-05, "loss": 0.4826, "step": 210},
    {"epoch": 2.7008, "grad_norm": 0.3989050544628582, "learning_rate": 2.0626425981061608e-05, "loss": 0.4773, "step": 211},
    {"epoch": 2.7136, "grad_norm": 0.4148902267262383, "learning_rate": 2.0447482977425465e-05, "loss": 0.4631, "step": 212},
    {"epoch": 2.7264, "grad_norm": 0.37315236923456857, "learning_rate": 2.0268504126267952e-05, "loss": 0.4557, "step": 213},
    {"epoch": 2.7392, "grad_norm": 0.47822952527855817, "learning_rate": 2.008950376544887e-05, "loss": 0.4505, "step": 214},
    {"epoch": 2.752, "grad_norm": 0.38122602440566117, "learning_rate": 1.9910496234551132e-05, "loss": 0.4637, "step": 215},
    {"epoch": 2.7648, "grad_norm": 0.44550611997452944, "learning_rate": 1.9731495873732055e-05, "loss": 0.4711, "step": 216},
    {"epoch": 2.7776, "grad_norm": 0.4927928131481893, "learning_rate": 1.9552517022574542e-05, "loss": 0.4796, "step": 217},
    {"epoch": 2.7904, "grad_norm": 0.3965958103156703, "learning_rate": 1.93735740189384e-05, "loss": 0.4559, "step": 218},
    {"epoch": 2.8032, "grad_norm": 0.34579615803746744, "learning_rate": 1.9194681197811703e-05, "loss": 0.465, "step": 219},
    {"epoch": 2.816, "grad_norm": 0.3993263463900034, "learning_rate": 1.901585289016244e-05, "loss": 0.4679, "step": 220},
    {"epoch": 2.8288, "grad_norm": 0.4029764066771043, "learning_rate": 1.8837103421790486e-05, "loss": 0.4523, "step": 221},
    {"epoch": 2.8416, "grad_norm": 0.3419221200613061, "learning_rate": 1.8658447112179952e-05, "loss": 0.4592, "step": 222},
    {"epoch": 2.8544, "grad_norm": 0.3595384240299151, "learning_rate": 1.8479898273352084e-05, "loss": 0.458, "step": 223},
    {"epoch": 2.8672, "grad_norm": 0.3620413876501833, "learning_rate": 1.83014712087187e-05, "loss": 0.4756, "step": 224},
    {"epoch": 2.88, "grad_norm": 0.3270882856212691, "learning_rate": 1.8123180211936417e-05, "loss": 0.457, "step": 225},
    {"epoch": 2.8928000000000003, "grad_norm": 0.3446593100809947, "learning_rate": 1.794503956576152e-05, "loss": 0.4533, "step": 226},
    {"epoch": 2.9055999999999997, "grad_norm": 0.40571435960375596, "learning_rate": 1.776706354090585e-05, "loss": 0.4418, "step": 227},
    {"epoch": 2.9184, "grad_norm": 0.3635415788213531, "learning_rate": 1.758926639489354e-05, "loss": 0.4523, "step": 228},
    {"epoch": 2.9312, "grad_norm": 0.412269187518228, "learning_rate": 1.7411662370918893e-05, "loss": 0.4694, "step": 229},
    {"epoch": 2.944, "grad_norm": 0.4445468290789392, "learning_rate": 1.7234265696705344e-05, "loss": 0.4578, "step": 230},
    {"epoch": 2.9568, "grad_norm": 0.3394327173424959, "learning_rate": 1.7057090583365678e-05, "loss": 0.4361, "step": 231},
    {"epoch": 2.9696, "grad_norm": 0.3947782619670345, "learning_rate": 1.6880151224263646e-05, "loss": 0.47, "step": 232},
    {"epoch": 2.9824, "grad_norm": 0.35576225634334796, "learning_rate": 1.6703461793876876e-05, "loss": 0.4509, "step": 233},
    {"epoch": 2.9952, "grad_norm": 0.3991955857455497, "learning_rate": 1.6527036446661396e-05, "loss": 0.451, "step": 234},
    {"epoch": 3.008, "grad_norm": 0.4852272293028313, "learning_rate": 1.635088931591775e-05, "loss": 0.4085, "step": 235},
    {"epoch": 3.0208, "grad_norm": 0.44316609201957585, "learning_rate": 1.6175034512658753e-05, "loss": 0.4021, "step": 236},
    {"epoch": 3.0336, "grad_norm": 0.6804901436503166, "learning_rate": 1.5999486124479115e-05, "loss": 0.4229, "step": 237},
    {"epoch": 3.0464, "grad_norm": 0.46064626578777224, "learning_rate": 1.5824258214426833e-05, "loss": 0.419, "step": 238},
    {"epoch": 3.0592, "grad_norm": 0.5156640732343791, "learning_rate": 1.5649364819876655e-05, "loss": 0.4002, "step": 239},
    {"epoch": 3.072, "grad_norm": 0.5124400542001356, "learning_rate": 1.547481995140556e-05, "loss": 0.381, "step": 240},
    {"epoch": 3.0848, "grad_norm": 0.4545025150296528, "learning_rate": 1.5300637591670357e-05, "loss": 0.3719, "step": 241},
    {"epoch": 3.0976, "grad_norm": 0.4764983476665866, "learning_rate": 1.5126831694287564e-05, "loss": 0.3951, "step": 242},
    {"epoch": 3.1104, "grad_norm": 0.45902267574729466, "learning_rate": 1.4953416182715566e-05, "loss": 0.4014, "step": 243},
    {"epoch": 3.1232, "grad_norm": 0.42287998012354977, "learning_rate": 1.478040494913926e-05, "loss": 0.3915, "step": 244},
    {"epoch": 3.136, "grad_norm": 0.40844017593827425, "learning_rate": 1.460781185335713e-05, "loss": 0.3705, "step": 245},
    {"epoch": 3.1488, "grad_norm": 0.45596793756186865, "learning_rate": 1.443565072167095e-05, "loss": 0.3649, "step": 246},
    {"epoch": 3.1616, "grad_norm": 0.38766625246386077, "learning_rate": 1.4263935345778202e-05, "loss": 0.3708, "step": 247},
    {"epoch": 3.1744, "grad_norm": 0.4769480173172975, "learning_rate": 1.409267948166718e-05, "loss": 0.3881, "step": 248},
    {"epoch": 3.1872, "grad_norm": 0.4092396043545385, "learning_rate": 1.3921896848515064e-05, "loss": 0.3545, "step": 249},
    {"epoch": 3.2, "grad_norm": 0.448234453310902, "learning_rate": 1.3751601127588849e-05, "loss": 0.3927, "step": 250},
    {"epoch": 3.2128, "grad_norm": 0.44359469507401983, "learning_rate": 1.3581805961149371e-05, "loss": 0.4055, "step": 251},
    {"epoch": 3.2256, "grad_norm": 0.4088719915070779, "learning_rate": 1.341252495135841e-05, "loss": 0.3856, "step": 252},
    {"epoch": 3.2384, "grad_norm": 0.3674312784017301, "learning_rate": 1.324377165918906e-05, "loss": 0.3791, "step": 253},
    {"epoch": 3.2512, "grad_norm": 0.3938521997358805, "learning_rate": 1.3075559603339354e-05, "loss": 0.3925, "step": 254},
    {"epoch": 3.2640000000000002, "grad_norm": 0.3623453990180961, "learning_rate": 1.2907902259149287e-05, "loss": 0.3924, "step": 255},
    {"epoch": 3.2768, "grad_norm": 0.3600044504780135, "learning_rate": 1.274081305752135e-05, "loss": 0.4029, "step": 256},
    {"epoch": 3.2896, "grad_norm": 0.3520606689385732, "learning_rate": 1.2574305383844528e-05, "loss": 0.3656, "step": 257},
    {"epoch": 3.3024, "grad_norm": 0.3327793130489215, "learning_rate": 1.2408392576922075e-05, "loss": 0.3894, "step": 258},
    {"epoch": 3.3152, "grad_norm": 0.39965934167160516, "learning_rate": 1.2243087927902905e-05, "loss": 0.3945, "step": 259},
    {"epoch": 3.328, "grad_norm": 0.3161346715660535, "learning_rate": 1.2078404679216864e-05, "loss": 0.3747, "step": 260},
    {"epoch": 3.3407999999999998, "grad_norm": 0.3313302846658303, "learning_rate": 1.1914356023513904e-05, "loss": 0.3812, "step": 261},
    {"epoch": 3.3536, "grad_norm": 0.32079435244947657, "learning_rate": 1.1750955102607193e-05, "loss": 0.3837, "step": 262},
    {"epoch": 3.3664, "grad_norm": 0.29381848255351123, "learning_rate": 1.1588215006420374e-05, "loss": 0.3955, "step": 263},
    {"epoch": 3.3792, "grad_norm": 0.32575149113643387, "learning_rate": 1.1426148771938915e-05, "loss": 0.3803, "step": 264},
    {"epoch": 3.392, "grad_norm": 0.29157539527768916, "learning_rate": 1.1264769382165748e-05, "loss": 0.4054, "step": 265},
    {"epoch": 3.4048, "grad_norm": 0.2988204292744495, "learning_rate": 1.110408976508118e-05, "loss": 0.3828, "step": 266},
    {"epoch": 3.4176, "grad_norm": 0.29731376217801975, "learning_rate": 1.094412279260726e-05, "loss": 0.387, "step": 267},
    {"epoch": 3.4304, "grad_norm": 0.27018508655208406, "learning_rate": 1.0784881279576635e-05, "loss": 0.3829, "step": 268},
    {"epoch": 3.4432, "grad_norm": 0.286383510499009, "learning_rate": 1.0626377982705929e-05, "loss": 0.3813, "step": 269},
    {"epoch": 3.456, "grad_norm": 0.30871337744485106, "learning_rate": 1.0468625599573842e-05, "loss": 0.3849, "step": 270},
    {"epoch": 3.4688, "grad_norm": 0.2923862683828382, "learning_rate": 1.0311636767603952e-05, "loss": 0.3922, "step": 271},
    {"epoch": 3.4816, "grad_norm": 0.31472617862809404, "learning_rate": 1.0155424063052306e-05, "loss": 0.3712, "step": 272},
    {"epoch": 3.4944, "grad_norm": 0.2822166054269409, "learning_rate": 1.0000000000000006e-05, "loss": 0.3909, "step": 273},
    {"epoch": 3.5072, "grad_norm": 0.28500555437532055, "learning_rate": 9.84537702935065e-06, "loss": 0.3998, "step": 274},
    {"epoch": 3.52, "grad_norm": 0.30256608738210994, "learning_rate": 9.691567537832964e-06, "loss": 0.3951, "step": 275},
    {"epoch": 3.5328, "grad_norm": 0.27877028351239436, "learning_rate": 9.538583847008452e-06, "loss": 0.404, "step": 276},
    {"epoch": 3.5456, "grad_norm": 0.29405001535468067, "learning_rate": 9.386438212284372e-06, "loss": 0.4283, "step": 277},
    {"epoch": 3.5584, "grad_norm": 0.2846756333337361, "learning_rate": 9.235142821931928e-06, "loss": 0.396, "step": 278},
    {"epoch": 3.5712, "grad_norm": 0.25839177653428463, "learning_rate": 9.084709796109907e-06, "loss": 0.4037, "step": 279},
    {"epoch": 3.584, "grad_norm": 0.3170799093435875, "learning_rate": 8.93515118589373e-06, "loss": 0.4064, "step": 280},
    {"epoch": 3.5968, "grad_norm": 0.29846928577818926, "learning_rate": 8.786478972310023e-06, "loss": 0.3801, "step": 281},
    {"epoch": 3.6096, "grad_norm": 0.28359666691655633, "learning_rate": 8.638705065376887e-06, "loss": 0.4006, "step": 282},
    {"epoch": 3.6224, "grad_norm": 0.2853900373223349, "learning_rate": 8.491841303149728e-06, "loss": 0.3978, "step": 283},
    {"epoch": 3.6352, "grad_norm": 0.30183351823204946, "learning_rate": 8.345899450772975e-06, "loss": 0.3809, "step": 284},
    {"epoch": 3.648, "grad_norm": 0.26513412424846494, "learning_rate": 8.200891199537549e-06, "loss": 0.3837, "step": 285},
    {"epoch": 3.6608, "grad_norm": 0.28491958067646267, "learning_rate": 8.056828165944282e-06, "loss": 0.3924, "step": 286},
    {"epoch": 3.6736, "grad_norm": 0.2847449046866254, "learning_rate": 7.913721890773354e-06, "loss": 0.406, "step": 287},
    {"epoch": 3.6864, "grad_norm": 0.29049536394008524, "learning_rate": 7.771583838159756e-06, "loss": 0.3777, "step": 288},
    {"epoch": 3.6992000000000003, "grad_norm": 0.26668171489061726, "learning_rate": 7.630425394674903e-06, "loss": 0.3921, "step": 289},
    {"epoch": 3.7119999999999997, "grad_norm": 0.2824867701723897, "learning_rate": 7.49025786841445e-06, "loss": 0.4009, "step": 290},
    {"epoch": 3.7248, "grad_norm": 0.2837737298294316, "learning_rate": 7.3510924880924575e-06, "loss": 0.3792, "step": 291},
    {"epoch": 3.7376, "grad_norm": 0.2908414313566709, "learning_rate": 7.212940402141808e-06, "loss": 0.384, "step": 292},
    {"epoch": 3.7504, "grad_norm": 0.2878412441106774, "learning_rate": 7.075812677821145e-06, "loss": 0.3564, "step": 293},
    {"epoch": 3.7632, "grad_norm": 0.30353853584727125, "learning_rate": 6.939720300328303e-06, "loss": 0.3678, "step": 294},
    {"epoch": 3.776, "grad_norm": 0.28504135457069835, "learning_rate": 6.8046741719202385e-06, "loss": 0.3811, "step": 295},
    {"epoch": 3.7888, "grad_norm": 0.2791963655781665, "learning_rate": 6.67068511103971e-06, "loss": 0.395, "step": 296},
    {"epoch": 3.8016, "grad_norm": 0.28620457724872916, "learning_rate": 6.537763851448593e-06, "loss": 0.3818, "step": 297},
    {"epoch": 3.8144, "grad_norm": 0.26750728763761783, "learning_rate": 6.4059210413680175e-06, "loss": 0.3944, "step": 298},
    {"epoch": 3.8272, "grad_norm": 0.27577656010569185, "learning_rate": 6.275167242625331e-06, "loss": 0.3967, "step": 299},
    {"epoch": 3.84, "grad_norm": 0.2831731684757878, "learning_rate": 6.145512929808013e-06, "loss": 0.4102, "step": 300},
    {"epoch": 3.8528000000000002, "grad_norm": 0.27543329146697637, "learning_rate": 6.016968489424572e-06, "loss": 0.3753, "step": 301},
    {"epoch": 3.8656, "grad_norm": 0.28971938443452644, "learning_rate": 5.889544219072465e-06, "loss": 0.3692, "step": 302},
    {"epoch": 3.8784, "grad_norm": 0.2852417011416498, "learning_rate": 5.7632503266131925e-06, "loss": 0.3899, "step": 303},
    {"epoch": 3.8912, "grad_norm": 0.5772779677453664, "learning_rate": 5.638096929354522e-06, "loss": 0.4008, "step": 304},
    {"epoch": 3.904, "grad_norm": 0.2964062916907281, "learning_rate": 5.514094053240035e-06, "loss": 0.365, "step": 305},
    {"epoch": 3.9168, "grad_norm": 0.27420611921751054, "learning_rate": 5.39125163204594e-06, "loss": 0.3847, "step": 306},
    {"epoch": 3.9295999999999998, "grad_norm": 0.2735906025995398, "learning_rate": 5.269579506585259e-06, "loss": 0.3824, "step": 307},
    {"epoch": 3.9424, "grad_norm": 0.29961039877972634, "learning_rate": 5.149087423919541e-06, "loss": 0.3903, "step": 308},
    {"epoch": 3.9552, "grad_norm": 0.27504216393149944, "learning_rate": 5.029785036577976e-06, "loss": 0.3994, "step": 309},
    {"epoch": 3.968, "grad_norm": 0.2574123710825498, "learning_rate": 4.911681901784198e-06, "loss": 0.3868, "step": 310},
    {"epoch": 3.9808, "grad_norm": 0.27208360795264636, "learning_rate": 4.794787480690597e-06, "loss": 0.368, "step": 311},
    {"epoch": 3.9936, "grad_norm": 0.28666725143337507, "learning_rate": 4.679111137620442e-06, "loss": 0.3788, "step": 312},
    {"epoch": 4.0064, "grad_norm": 0.3756949491808068, "learning_rate": 4.5646621393177e-06, "loss": 0.3429, "step": 313},
    {"epoch": 4.0192, "grad_norm": 0.3675268483300072, "learning_rate": 4.451449654204685e-06, "loss": 0.3471, "step": 314},
    {"epoch": 4.032, "grad_norm": 0.28792739863501965, "learning_rate": 4.339482751647557e-06, "loss": 0.3582, "step": 315},
    {"epoch": 4.0448, "grad_norm": 0.32875207421348396, "learning_rate": 4.228770401229824e-06, "loss": 0.3564, "step": 316},
    {"epoch": 4.0576, "grad_norm": 0.3782587360480824, "learning_rate": 4.119321472033779e-06, "loss": 0.3321, "step": 317},
    {"epoch": 4.0704, "grad_norm": 0.3872900145340423, "learning_rate": 4.011144731929981e-06, "loss": 0.3325, "step": 318},
    {"epoch": 4.0832, "grad_norm": 0.3223359591690514, "learning_rate": 3.904248846874894e-06, "loss": 0.3276, "step": 319},
    {"epoch": 4.096, "grad_norm": 0.33574973758010285, "learning_rate": 3.7986423802166705e-06, "loss": 0.3245, "step": 320},
    {"epoch": 4.1088, "grad_norm": 0.32491634246657564, "learning_rate": 3.694333792009115e-06, "loss": 0.3351, "step": 321},
    {"epoch": 4.1216, "grad_norm": 0.3299263674417641, "learning_rate": 3.5913314383339937e-06, "loss": 0.3528, "step": 322},
    {"epoch": 4.1344, "grad_norm": 0.2878382459323737, "learning_rate": 3.4896435706316e-06, "loss": 0.351, "step": 323},
    {"epoch": 4.1472, "grad_norm": 0.3031903041525479, "learning_rate": 3.3892783350397675e-06, "loss": 0.309, "step": 324},
    {"epoch": 4.16, "grad_norm": 0.3184646973612724, "learning_rate": 3.290243771741275e-06, "loss": 0.3427, "step": 325},
    {"epoch": 4.1728, "grad_norm": 0.3016381771830608, "learning_rate": 3.1925478143197418e-06, "loss": 0.3617, "step": 326},
    {"epoch": 4.1856, "grad_norm": 0.3030051610528341, "learning_rate": 3.0961982891241083e-06, "loss": 0.3619, "step": 327},
    {"epoch": 4.1984, "grad_norm": 0.29765952101137166, "learning_rate": 3.001202914641628e-06, "loss": 0.3471, "step": 328},
    {"epoch": 4.2112, "grad_norm": 0.2954796322341825, "learning_rate": 2.907569300879596e-06, "loss": 0.3311, "step": 329},
    {"epoch": 4.224, "grad_norm": 0.2607902733239937, "learning_rate": 2.815304948755664e-06, "loss": 0.3817, "step": 330},
    {"epoch": 4.2368, "grad_norm": 0.259352438077992, "learning_rate": 2.7244172494969978e-06, "loss": 0.3361, "step": 331},
    {"epoch": 4.2496, "grad_norm": 0.27425914878006435, "learning_rate": 2.6349134840481294e-06, "loss": 0.3233, "step": 332},
    {"epoch": 4.2624, "grad_norm": 0.24715987907218878, "learning_rate": 2.546800822487714e-06, "loss": 0.347, "step": 333},
    {"epoch": 4.2752, "grad_norm": 0.25636824736040525, "learning_rate": 2.4600863234541338e-06, "loss": 0.3553, "step": 334},
    {"epoch": 4.288, "grad_norm": 0.24870904351333006, "learning_rate": 2.374776933580025e-06, "loss": 0.3484, "step": 335},
    {"epoch": 4.3008, "grad_norm": 0.23835917870398024, "learning_rate": 2.2908794869358044e-06, "loss": 0.3388, "step": 336},
    {"epoch": 4.3136, "grad_norm": 0.22920888616083185, "learning_rate": 2.2084007044821764e-06, "loss": 0.3554, "step": 337},
    {"epoch": 4.3264, "grad_norm": 0.25631081382757875, "learning_rate": 2.127347193531757e-06, "loss": 0.3124, "step": 338},
    {"epoch": 4.3392, "grad_norm": 0.2583359373347175, "learning_rate": 2.0477254472197237e-06, "loss": 0.3389, "step": 339},
    {"epoch": 4.352, "grad_norm": 0.2396511474610029, "learning_rate": 1.96954184398368e-06, "loss": 0.3627, "step": 340},
    {"epoch": 4.3648, "grad_norm": 0.24716904810583543, "learning_rate": 1.8928026470526917e-06, "loss": 0.32, "step": 341},
    {"epoch": 4.3776, "grad_norm": 0.2326197051401394, "learning_rate": 1.817514003945524e-06, "loss": 0.3464, "step": 342},
    {"epoch": 4.3904, "grad_norm": 0.2421341744892754, "learning_rate": 1.743681945978184e-06, "loss": 0.3678, "step": 343},
    {"epoch": 4.4032, "grad_norm": 0.24025637141781805, "learning_rate": 1.6713123877807413e-06, "loss": 0.3263, "step": 344},
    {"epoch": 4.416, "grad_norm": 0.24097437823375367, "learning_rate": 1.6004111268235156e-06, "loss": 0.3458, "step": 345},
    {"epoch": 4.4288, "grad_norm": 0.2263409591200407, "learning_rate": 1.5309838429526714e-06, "loss": 0.3465, "step": 346},
    {"epoch": 4.4416, "grad_norm": 0.22904641123840114, "learning_rate": 1.4630360979351644e-06, "loss": 0.3572, "step": 347},
    {"epoch": 4.4544, "grad_norm": 0.24350592466909948, "learning_rate": 1.396573335013236e-06, "loss": 0.348, "step": 348},
    {"epoch": 4.4672, "grad_norm": 0.23435074168615438, "learning_rate": 1.3316008784683265e-06, "loss": 0.3435, "step": 349},
    {"epoch": 4.48, "grad_norm": 0.23783811548314193, "learning_rate": 1.2681239331945695e-06, "loss": 0.3479, "step": 350},
    {"epoch": 4.4928, "grad_norm": 0.2861107243391282, "learning_rate": 1.2061475842818337e-06, "loss": 0.3378, "step": 351},
    {"epoch": 4.5056, "grad_norm": 0.23253504506937483, "learning_rate": 1.1456767966083393e-06, "loss": 0.3552, "step": 352},
    {"epoch": 4.5184, "grad_norm": 0.2246628160130799, "learning_rate": 1.086716414442952e-06, "loss": 0.3679, "step": 353},
    {"epoch": 4.5312, "grad_norm": 0.21692743863919423, "learning_rate": 1.0292711610570904e-06, "loss": 0.3625, "step": 354},
    {"epoch": 4.5440000000000005, "grad_norm": 0.22424168871915062, "learning_rate": 9.733456383463658e-07, "loss": 0.3352, "step": 355},
    {"epoch": 4.5568, "grad_norm": 0.21387043655972612, "learning_rate": 9.189443264619102e-07, "loss": 0.3598, "step": 356},
    {"epoch": 4.5696, "grad_norm": 0.22191490435125394, "learning_rate": 8.660715834514977e-07, "loss": 0.3536, "step": 357},
    {"epoch": 4.5824, "grad_norm": 0.23050300947239208, "learning_rate": 8.147316449103959e-07, "loss": 0.3488, "step": 358},
    {"epoch": 4.5952, "grad_norm": 0.2361544030495828, "learning_rate": 7.649286236420806e-07, "loss": 0.3383, "step": 359},
    {"epoch": 4.608, "grad_norm": 0.2250713610904457, "learning_rate": 7.166665093287539e-07, "loss": 0.3362, "step": 360},
    {"epoch": 4.6208, "grad_norm": 0.22925307838835346, "learning_rate": 6.69949168211721e-07, "loss": 0.3325, "step": 361},
    {"epoch": 4.6336, "grad_norm": 0.23543085842037606, "learning_rate": 6.247803427816945e-07, "loss": 0.3331, "step": 362},
    {"epoch": 4.6464, "grad_norm": 0.23295996119795018, "learning_rate": 5.811636514789598e-07, "loss": 0.3366, "step": 363},
    {"epoch": 4.6592, "grad_norm": 0.23330266394800053, "learning_rate": 5.391025884035239e-07, "loss": 0.3307, "step": 364},
    {"epoch": 4.672, "grad_norm": 0.22636221127553688, "learning_rate": 4.986005230351954e-07, "loss": 0.354, "step": 365},
    {"epoch": 4.6848, "grad_norm": 0.21858831782914198, "learning_rate": 4.5966069996365993e-07, "loss": 0.337, "step": 366},
    {"epoch": 4.6975999999999996, "grad_norm": 0.22700372130796417, "learning_rate": 4.22286238628562e-07, "loss": 0.3379, "step": 367},
    {"epoch": 4.7104, "grad_norm": 0.2234230664222341, "learning_rate": 3.8648013306960664e-07, "loss": 0.3454, "step": 368},
    {"epoch": 4.7232, "grad_norm": 0.23696571797831542, "learning_rate": 3.522452516867048e-07, "loss": 0.3378, "step": 369},
    {"epoch": 4.736, "grad_norm": 0.22904682230998671, "learning_rate": 3.1958433701019697e-07, "loss": 0.3434, "step": 370},
    {"epoch": 4.7488, "grad_norm": 0.22497057374402313, "learning_rate": 2.8850000548115155e-07, "loss": 0.3522, "step": 371},
    {"epoch": 4.7616, "grad_norm": 0.2258287682376544, "learning_rate": 2.5899474724174313e-07, "loss": 0.34, "step": 372},
    {"epoch": 4.7744, "grad_norm": 0.21724452205355713, "learning_rate": 2.3107092593579905e-07, "loss": 0.3521, "step": 373},
    {"epoch": 4.7872, "grad_norm": 0.2426189978091419, "learning_rate": 2.0473077851942858e-07, "loss": 0.3176, "step": 374},
    {"epoch": 4.8, "grad_norm": 0.2293805302238564, "learning_rate": 1.799764150818306e-07, "loss": 0.3345, "step": 375},
    {"epoch": 4.8128, "grad_norm": 0.22779320333802724, "learning_rate": 1.5680981867625566e-07, "loss": 0.3617, "step": 376},
    {"epoch": 4.8256, "grad_norm": 0.22437051418026907, "learning_rate": 1.3523284516113955e-07, "loss": 0.3259, "step": 377},
    {"epoch": 4.8384, "grad_norm": 0.23954190278447945, "learning_rate": 1.1524722305144231e-07, "loss": 0.3434, "step": 378},
    {"epoch": 4.8512, "grad_norm": 0.2424599078756838, "learning_rate": 9.685455338016347e-08, "loss": 0.332, "step": 379},
    {"epoch": 4.864, "grad_norm": 0.22993119820564792, "learning_rate": 8.005630957010014e-08, "loss": 0.3262, "step": 380},
    {"epoch": 4.8768, "grad_norm": 0.22123537274119692, "learning_rate": 6.485383731580142e-08, "loss": 0.3182, "step": 381},
    {"epoch": 4.8896, "grad_norm": 0.21660597193602085, "learning_rate": 5.1248354475768034e-08, "loss": 0.3352, "step": 382},
    {"epoch": 4.9024, "grad_norm": 0.22493613651849081, "learning_rate": 3.924095097489922e-08, "loss": 0.3525, "step": 383},
    {"epoch": 4.9152000000000005, "grad_norm": 0.22343841499976613, "learning_rate": 2.8832588717164766e-08, "loss": 0.3381, "step": 384},
    {"epoch": 4.928, "grad_norm": 0.23852936531830982, "learning_rate": 2.0024101508555604e-08, "loss": 0.3303, "step": 385},
    {"epoch": 4.9408, "grad_norm": 0.22543115265579133, "learning_rate": 1.281619499029274e-08, "loss": 0.3179, "step": 386},
    {"epoch": 4.9536, "grad_norm": 0.21691606376951345, "learning_rate": 7.209446582292501e-09, "loss": 0.3524, "step": 387},
    {"epoch": 4.9664, "grad_norm": 0.22676220521180954, "learning_rate": 3.2043054369057523e-09, "loss": 0.3182, "step": 388},
    {"epoch": 4.9792, "grad_norm": 0.22371650819974173, "learning_rate": 8.010924029533406e-10, "loss": 0.3207, "step": 389},
    {"epoch": 4.992, "grad_norm": 0.22001504712277892, "learning_rate": 0.0, "loss": 0.3288, "step": 390},
    {"epoch": 4.992, "step": 390, "total_flos": 1.6438208865301955e+18, "train_loss": 0.49524986499395124, "train_runtime": 58207.9475, "train_samples_per_second": 0.859, "train_steps_per_second": 0.007}
  ],
  "logging_steps": 1,
  "max_steps": 390,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6438208865301955e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}