{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 784,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012755102040816326,
      "grad_norm": 226.4242401123047,
      "learning_rate": 3.7974683544303794e-06,
      "loss": 48.9032,
      "step": 1
    },
    {
      "epoch": 0.006377551020408163,
      "grad_norm": 208.31333923339844,
      "learning_rate": 1.89873417721519e-05,
      "loss": 48.5429,
      "step": 5
    },
    {
      "epoch": 0.012755102040816327,
      "grad_norm": 113.34983825683594,
      "learning_rate": 3.79746835443038e-05,
      "loss": 39.6906,
      "step": 10
    },
    {
      "epoch": 0.01913265306122449,
      "grad_norm": 62.39444351196289,
      "learning_rate": 5.696202531645569e-05,
      "loss": 31.8222,
      "step": 15
    },
    {
      "epoch": 0.025510204081632654,
      "grad_norm": 16.312618255615234,
      "learning_rate": 7.59493670886076e-05,
      "loss": 24.5018,
      "step": 20
    },
    {
      "epoch": 0.03188775510204082,
      "grad_norm": 11.158844947814941,
      "learning_rate": 9.493670886075949e-05,
      "loss": 21.551,
      "step": 25
    },
    {
      "epoch": 0.03826530612244898,
      "grad_norm": 7.093152046203613,
      "learning_rate": 0.00011392405063291139,
      "loss": 20.5881,
      "step": 30
    },
    {
      "epoch": 0.044642857142857144,
      "grad_norm": 4.10010290145874,
      "learning_rate": 0.00013291139240506327,
      "loss": 19.0191,
      "step": 35
    },
    {
      "epoch": 0.05102040816326531,
      "grad_norm": 5.54206657409668,
      "learning_rate": 0.0001518987341772152,
      "loss": 18.1907,
      "step": 40
    },
    {
      "epoch": 0.05739795918367347,
      "grad_norm": 10.086193084716797,
      "learning_rate": 0.00017088607594936708,
      "loss": 17.5312,
      "step": 45
    },
    {
      "epoch": 0.06377551020408163,
      "grad_norm": 19.117507934570312,
      "learning_rate": 0.00018987341772151899,
      "loss": 15.1473,
      "step": 50
    },
    {
      "epoch": 0.07015306122448979,
      "grad_norm": 30.6151180267334,
      "learning_rate": 0.0002088607594936709,
      "loss": 11.3497,
      "step": 55
    },
    {
      "epoch": 0.07653061224489796,
      "grad_norm": 21.005615234375,
      "learning_rate": 0.00022784810126582277,
      "loss": 6.1773,
      "step": 60
    },
    {
      "epoch": 0.08290816326530612,
      "grad_norm": 11.67768383026123,
      "learning_rate": 0.0002468354430379747,
      "loss": 3.0207,
      "step": 65
    },
    {
      "epoch": 0.08928571428571429,
      "grad_norm": 6.371170520782471,
      "learning_rate": 0.00026582278481012653,
      "loss": 2.478,
      "step": 70
    },
    {
      "epoch": 0.09566326530612244,
      "grad_norm": 2.4198906421661377,
      "learning_rate": 0.00028481012658227844,
      "loss": 2.1055,
      "step": 75
    },
    {
      "epoch": 0.10204081632653061,
      "grad_norm": 1.157726764678955,
      "learning_rate": 0.00029999851070045734,
      "loss": 1.8032,
      "step": 80
    },
    {
      "epoch": 0.10841836734693877,
      "grad_norm": 1.53150475025177,
      "learning_rate": 0.00029994638832161753,
      "loss": 1.6379,
      "step": 85
    },
    {
      "epoch": 0.11479591836734694,
      "grad_norm": 4.24367618560791,
      "learning_rate": 0.0002998198305365145,
      "loss": 1.5582,
      "step": 90
    },
    {
      "epoch": 0.1211734693877551,
      "grad_norm": 0.8776500821113586,
      "learning_rate": 0.000299618900170137,
      "loss": 1.5178,
      "step": 95
    },
    {
      "epoch": 0.12755102040816327,
      "grad_norm": 1.7082425355911255,
      "learning_rate": 0.00029934369696702433,
      "loss": 1.4593,
      "step": 100
    },
    {
      "epoch": 0.13392857142857142,
      "grad_norm": 2.545419931411743,
      "learning_rate": 0.0002989943575417525,
      "loss": 1.4022,
      "step": 105
    },
    {
      "epoch": 0.14030612244897958,
      "grad_norm": 0.9073415994644165,
      "learning_rate": 0.0002985710553111161,
      "loss": 1.391,
      "step": 110
    },
    {
      "epoch": 0.14668367346938777,
      "grad_norm": 0.7289413809776306,
      "learning_rate": 0.0002980740004080424,
      "loss": 1.3611,
      "step": 115
    },
    {
      "epoch": 0.15306122448979592,
      "grad_norm": 0.6918051838874817,
      "learning_rate": 0.0002975034395772784,
      "loss": 1.3474,
      "step": 120
    },
    {
      "epoch": 0.15943877551020408,
      "grad_norm": 1.0997295379638672,
      "learning_rate": 0.00029685965605290337,
      "loss": 1.3001,
      "step": 125
    },
    {
      "epoch": 0.16581632653061223,
      "grad_norm": 4.529347896575928,
      "learning_rate": 0.0002961429694177276,
      "loss": 1.3039,
      "step": 130
    },
    {
      "epoch": 0.17219387755102042,
      "grad_norm": 2.3405444622039795,
      "learning_rate": 0.0002953537354446474,
      "loss": 1.2888,
      "step": 135
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 2.904238224029541,
      "learning_rate": 0.00029449234592003436,
      "loss": 1.2796,
      "step": 140
    },
    {
      "epoch": 0.18494897959183673,
      "grad_norm": 1.2537798881530762,
      "learning_rate": 0.0002935592284492476,
      "loss": 1.2941,
      "step": 145
    },
    {
      "epoch": 0.1913265306122449,
      "grad_norm": 1.00596284866333,
      "learning_rate": 0.0002925548462443644,
      "loss": 1.2674,
      "step": 150
    },
    {
      "epoch": 0.19770408163265307,
      "grad_norm": 1.229261040687561,
      "learning_rate": 0.0002914796978942358,
      "loss": 1.2493,
      "step": 155
    },
    {
      "epoch": 0.20408163265306123,
      "grad_norm": 5.567780494689941,
      "learning_rate": 0.0002903343171169803,
      "loss": 1.2494,
      "step": 160
    },
    {
      "epoch": 0.21045918367346939,
      "grad_norm": 2.096792221069336,
      "learning_rate": 0.00028911927249503864,
      "loss": 1.2534,
      "step": 165
    },
    {
      "epoch": 0.21683673469387754,
      "grad_norm": 2.6790685653686523,
      "learning_rate": 0.0002878351671929224,
      "loss": 1.2544,
      "step": 170
    },
    {
      "epoch": 0.22321428571428573,
      "grad_norm": 1.817202091217041,
      "learning_rate": 0.0002864826386577947,
      "loss": 1.2283,
      "step": 175
    },
    {
      "epoch": 0.22959183673469388,
      "grad_norm": 1.452048420906067,
      "learning_rate": 0.00028506235830303274,
      "loss": 1.2416,
      "step": 180
    },
    {
      "epoch": 0.23596938775510204,
      "grad_norm": 2.810974359512329,
      "learning_rate": 0.00028357503117492944,
      "loss": 1.2204,
      "step": 185
    },
    {
      "epoch": 0.2423469387755102,
      "grad_norm": 0.8295219540596008,
      "learning_rate": 0.0002820213956026987,
      "loss": 1.2267,
      "step": 190
    },
    {
      "epoch": 0.24872448979591838,
      "grad_norm": 1.347506046295166,
      "learning_rate": 0.0002804022228319591,
      "loss": 1.2054,
      "step": 195
    },
    {
      "epoch": 0.25510204081632654,
      "grad_norm": 1.5803207159042358,
      "learning_rate": 0.000278718316641877,
      "loss": 1.1688,
      "step": 200
    },
    {
      "epoch": 0.2614795918367347,
      "grad_norm": 0.6454965472221375,
      "learning_rate": 0.00027697051294615964,
      "loss": 1.2095,
      "step": 205
    },
    {
      "epoch": 0.26785714285714285,
      "grad_norm": 0.8256592154502869,
      "learning_rate": 0.0002751596793780968,
      "loss": 1.1808,
      "step": 210
    },
    {
      "epoch": 0.274234693877551,
      "grad_norm": 1.8195074796676636,
      "learning_rate": 0.00027328671485985515,
      "loss": 1.1963,
      "step": 215
    },
    {
      "epoch": 0.28061224489795916,
      "grad_norm": 1.3873289823532104,
      "learning_rate": 0.0002713525491562421,
      "loss": 1.1795,
      "step": 220
    },
    {
      "epoch": 0.2869897959183674,
      "grad_norm": 2.0349373817443848,
      "learning_rate": 0.00026935814241315736,
      "loss": 1.1802,
      "step": 225
    },
    {
      "epoch": 0.29336734693877553,
      "grad_norm": 1.6414597034454346,
      "learning_rate": 0.00026730448468096377,
      "loss": 1.173,
      "step": 230
    },
    {
      "epoch": 0.2997448979591837,
      "grad_norm": 1.2609707117080688,
      "learning_rate": 0.00026519259542301317,
      "loss": 1.1561,
      "step": 235
    },
    {
      "epoch": 0.30612244897959184,
      "grad_norm": 4.516489028930664,
      "learning_rate": 0.0002630235230095708,
      "loss": 1.1573,
      "step": 240
    },
    {
      "epoch": 0.3125,
      "grad_norm": 1.8223940134048462,
      "learning_rate": 0.0002607983441973903,
      "loss": 1.1619,
      "step": 245
    },
    {
      "epoch": 0.31887755102040816,
      "grad_norm": 1.7611159086227417,
      "learning_rate": 0.0002585181635951971,
      "loss": 1.1654,
      "step": 250
    },
    {
      "epoch": 0.3252551020408163,
      "grad_norm": 1.7671295404434204,
      "learning_rate": 0.0002561841131153459,
      "loss": 1.1701,
      "step": 255
    },
    {
      "epoch": 0.33163265306122447,
      "grad_norm": 1.1895421743392944,
      "learning_rate": 0.00025379735141192397,
      "loss": 1.1517,
      "step": 260
    },
    {
      "epoch": 0.3380102040816326,
      "grad_norm": 2.26123046875,
      "learning_rate": 0.0002513590633055798,
      "loss": 1.1304,
      "step": 265
    },
    {
      "epoch": 0.34438775510204084,
      "grad_norm": 2.0483951568603516,
      "learning_rate": 0.00024887045919536207,
      "loss": 1.1403,
      "step": 270
    },
    {
      "epoch": 0.350765306122449,
      "grad_norm": 1.611738681793213,
      "learning_rate": 0.00024633277445786143,
      "loss": 1.1499,
      "step": 275
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 2.4385955333709717,
      "learning_rate": 0.00024374726883395273,
      "loss": 1.1424,
      "step": 280
    },
    {
      "epoch": 0.3635204081632653,
      "grad_norm": 1.5102245807647705,
      "learning_rate": 0.00024111522580344265,
      "loss": 1.1135,
      "step": 285
    },
    {
      "epoch": 0.36989795918367346,
      "grad_norm": 0.8719022870063782,
      "learning_rate": 0.00023843795194793336,
      "loss": 1.1236,
      "step": 290
    },
    {
      "epoch": 0.3762755102040816,
      "grad_norm": 0.9534187316894531,
      "learning_rate": 0.0002357167763022175,
      "loss": 1.1253,
      "step": 295
    },
    {
      "epoch": 0.3826530612244898,
      "grad_norm": 1.8618340492248535,
      "learning_rate": 0.00023295304969452748,
      "loss": 1.1296,
      "step": 300
    },
    {
      "epoch": 0.38903061224489793,
      "grad_norm": 0.7363711595535278,
      "learning_rate": 0.00023014814407596628,
      "loss": 1.1298,
      "step": 305
    },
    {
      "epoch": 0.39540816326530615,
      "grad_norm": 8.200729370117188,
      "learning_rate": 0.00022730345183945248,
      "loss": 1.1155,
      "step": 310
    },
    {
      "epoch": 0.4017857142857143,
      "grad_norm": 1.8119416236877441,
      "learning_rate": 0.0002244203851285173,
      "loss": 1.1161,
      "step": 315
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 3.4057934284210205,
      "learning_rate": 0.000221500375136298,
      "loss": 1.1257,
      "step": 320
    },
    {
      "epoch": 0.4145408163265306,
      "grad_norm": 0.9975801706314087,
      "learning_rate": 0.00021854487139507398,
      "loss": 1.1356,
      "step": 325
    },
    {
      "epoch": 0.42091836734693877,
      "grad_norm": 1.9435389041900635,
      "learning_rate": 0.00021555534105669976,
      "loss": 1.1142,
      "step": 330
    },
    {
      "epoch": 0.4272959183673469,
      "grad_norm": 1.204506516456604,
      "learning_rate": 0.00021253326816429076,
      "loss": 1.1174,
      "step": 335
    },
    {
      "epoch": 0.4336734693877551,
      "grad_norm": 2.5877513885498047,
      "learning_rate": 0.0002094801529155244,
      "loss": 1.0873,
      "step": 340
    },
    {
      "epoch": 0.44005102040816324,
      "grad_norm": 0.7226624488830566,
      "learning_rate": 0.00020639751091792188,
      "loss": 1.1018,
      "step": 345
    },
    {
      "epoch": 0.44642857142857145,
      "grad_norm": 1.762408971786499,
      "learning_rate": 0.00020328687243648,
      "loss": 1.1056,
      "step": 350
    },
    {
      "epoch": 0.4528061224489796,
      "grad_norm": 4.760770320892334,
      "learning_rate": 0.00020014978163402725,
      "loss": 1.0828,
      "step": 355
    },
    {
      "epoch": 0.45918367346938777,
      "grad_norm": 0.723813533782959,
      "learning_rate": 0.0001969877958046804,
      "loss": 1.1015,
      "step": 360
    },
    {
      "epoch": 0.4655612244897959,
      "grad_norm": 1.1869585514068604,
      "learning_rate": 0.0001938024846007828,
      "loss": 1.0955,
      "step": 365
    },
    {
      "epoch": 0.4719387755102041,
      "grad_norm": 0.7096392512321472,
      "learning_rate": 0.00019059542925370783,
      "loss": 1.0949,
      "step": 370
    },
    {
      "epoch": 0.47831632653061223,
      "grad_norm": 2.3162615299224854,
      "learning_rate": 0.0001873682217889143,
      "loss": 1.0943,
      "step": 375
    },
    {
      "epoch": 0.4846938775510204,
      "grad_norm": 0.7215905785560608,
      "learning_rate": 0.0001841224642356436,
      "loss": 1.0957,
      "step": 380
    },
    {
      "epoch": 0.49107142857142855,
      "grad_norm": 1.3136285543441772,
      "learning_rate": 0.0001808597678316509,
      "loss": 1.0865,
      "step": 385
    },
    {
      "epoch": 0.49744897959183676,
      "grad_norm": 2.323671817779541,
      "learning_rate": 0.00017758175222336496,
      "loss": 1.0752,
      "step": 390
    },
    {
      "epoch": 0.5038265306122449,
      "grad_norm": 1.1084301471710205,
      "learning_rate": 0.00017429004466187377,
      "loss": 1.0892,
      "step": 395
    },
    {
      "epoch": 0.5102040816326531,
      "grad_norm": 2.052607297897339,
      "learning_rate": 0.00017098627919513528,
      "loss": 1.1036,
      "step": 400
    },
    {
      "epoch": 0.5165816326530612,
      "grad_norm": 1.0028424263000488,
      "learning_rate": 0.00016767209585681394,
      "loss": 1.0913,
      "step": 405
    },
    {
      "epoch": 0.5229591836734694,
      "grad_norm": 0.9611196517944336,
      "learning_rate": 0.00016434913985214597,
      "loss": 1.0825,
      "step": 410
    },
    {
      "epoch": 0.5293367346938775,
      "grad_norm": 2.9150185585021973,
      "learning_rate": 0.0001610190607412373,
      "loss": 1.085,
      "step": 415
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 2.879117250442505,
      "learning_rate": 0.00015768351162019975,
      "loss": 1.0889,
      "step": 420
    },
    {
      "epoch": 0.5420918367346939,
      "grad_norm": 1.0958130359649658,
      "learning_rate": 0.0001543441483005318,
      "loss": 1.0833,
      "step": 425
    },
    {
      "epoch": 0.548469387755102,
      "grad_norm": 0.9718190431594849,
      "learning_rate": 0.00015100262848715133,
      "loss": 1.0813,
      "step": 430
    },
    {
      "epoch": 0.5548469387755102,
      "grad_norm": 1.2847260236740112,
      "learning_rate": 0.000147660610955489,
      "loss": 1.0684,
      "step": 435
    },
    {
      "epoch": 0.5612244897959183,
      "grad_norm": 1.039225459098816,
      "learning_rate": 0.00014431975472804936,
      "loss": 1.0797,
      "step": 440
    },
    {
      "epoch": 0.5676020408163265,
      "grad_norm": 1.2738494873046875,
      "learning_rate": 0.00014098171825084995,
      "loss": 1.0839,
      "step": 445
    },
    {
      "epoch": 0.5739795918367347,
      "grad_norm": 1.06371009349823,
      "learning_rate": 0.00013764815857014636,
      "loss": 1.0649,
      "step": 450
    },
    {
      "epoch": 0.5803571428571429,
      "grad_norm": 1.394904375076294,
      "learning_rate": 0.000134320730509852,
      "loss": 1.0798,
      "step": 455
    },
    {
      "epoch": 0.5867346938775511,
      "grad_norm": 0.6322318315505981,
      "learning_rate": 0.00013100108585006146,
      "loss": 1.0702,
      "step": 460
    },
    {
      "epoch": 0.5931122448979592,
      "grad_norm": 1.5734529495239258,
      "learning_rate": 0.0001276908725070845,
      "loss": 1.0571,
      "step": 465
    },
    {
      "epoch": 0.5994897959183674,
      "grad_norm": 1.5619932413101196,
      "learning_rate": 0.00012439173371539818,
      "loss": 1.0714,
      "step": 470
    },
    {
      "epoch": 0.6058673469387755,
      "grad_norm": 1.8657833337783813,
      "learning_rate": 0.00012110530721192319,
      "loss": 1.0727,
      "step": 475
    },
    {
      "epoch": 0.6122448979591837,
      "grad_norm": 2.461625337600708,
      "learning_rate": 0.00011783322442302915,
      "loss": 1.0472,
      "step": 480
    },
    {
      "epoch": 0.6186224489795918,
      "grad_norm": 1.1668888330459595,
      "learning_rate": 0.00011457710965467266,
      "loss": 1.0482,
      "step": 485
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.11457359790802,
      "learning_rate": 0.00011133857928606992,
      "loss": 1.0618,
      "step": 490
    },
    {
      "epoch": 0.6313775510204082,
      "grad_norm": 1.9202951192855835,
      "learning_rate": 0.00010811924096730416,
      "loss": 1.0494,
      "step": 495
    },
    {
      "epoch": 0.6377551020408163,
      "grad_norm": 0.7689357995986938,
      "learning_rate": 0.00010492069282126685,
      "loss": 1.0443,
      "step": 500
    },
    {
      "epoch": 0.6441326530612245,
      "grad_norm": 2.5137813091278076,
      "learning_rate": 0.00010174452265032769,
      "loss": 1.0689,
      "step": 505
    },
    {
      "epoch": 0.6505102040816326,
      "grad_norm": 0.6587682962417603,
      "learning_rate": 9.859230714812812e-05,
      "loss": 1.0482,
      "step": 510
    },
    {
      "epoch": 0.6568877551020408,
      "grad_norm": 1.251999020576477,
      "learning_rate": 9.546561111688958e-05,
      "loss": 1.0439,
      "step": 515
    },
    {
      "epoch": 0.6632653061224489,
      "grad_norm": 0.9404116272926331,
      "learning_rate": 9.236598669062443e-05,
      "loss": 1.042,
      "step": 520
    },
    {
      "epoch": 0.6696428571428571,
      "grad_norm": 2.4428858757019043,
      "learning_rate": 8.929497256463586e-05,
      "loss": 1.0399,
      "step": 525
    },
    {
      "epoch": 0.6760204081632653,
      "grad_norm": 0.9080251455307007,
      "learning_rate": 8.625409323168858e-05,
      "loss": 1.0655,
      "step": 530
    },
    {
      "epoch": 0.6823979591836735,
      "grad_norm": 0.9034568071365356,
      "learning_rate": 8.32448582252305e-05,
      "loss": 1.0342,
      "step": 535
    },
    {
      "epoch": 0.6887755102040817,
      "grad_norm": 0.7829791307449341,
      "learning_rate": 8.026876137003956e-05,
      "loss": 1.0468,
      "step": 540
    },
    {
      "epoch": 0.6951530612244898,
      "grad_norm": 0.9003449082374573,
      "learning_rate": 7.732728004066927e-05,
      "loss": 1.0516,
      "step": 545
    },
    {
      "epoch": 0.701530612244898,
      "grad_norm": 1.5128637552261353,
      "learning_rate": 7.442187442805997e-05,
      "loss": 1.0486,
      "step": 550
    },
    {
      "epoch": 0.7079081632653061,
      "grad_norm": 0.843595027923584,
      "learning_rate": 7.155398681468041e-05,
      "loss": 1.035,
      "step": 555
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 1.120501160621643,
      "learning_rate": 6.872504085855947e-05,
      "loss": 1.0468,
      "step": 560
    },
    {
      "epoch": 0.7206632653061225,
      "grad_norm": 0.6691803932189941,
      "learning_rate": 6.593644088656277e-05,
      "loss": 1.032,
      "step": 565
    },
    {
      "epoch": 0.7270408163265306,
      "grad_norm": 1.066981315612793,
      "learning_rate": 6.318957119726636e-05,
      "loss": 1.0366,
      "step": 570
    },
    {
      "epoch": 0.7334183673469388,
      "grad_norm": 0.9314639568328857,
      "learning_rate": 6.048579537377189e-05,
      "loss": 1.0258,
      "step": 575
    },
    {
      "epoch": 0.7397959183673469,
      "grad_norm": 2.091320753097534,
      "learning_rate": 5.782645560680571e-05,
      "loss": 1.0328,
      "step": 580
    },
    {
      "epoch": 0.7461734693877551,
      "grad_norm": 0.6987316012382507,
      "learning_rate": 5.5212872028437564e-05,
      "loss": 1.0588,
      "step": 585
    },
    {
      "epoch": 0.7525510204081632,
      "grad_norm": 0.9516999125480652,
      "learning_rate": 5.264634205674884e-05,
      "loss": 1.036,
      "step": 590
    },
    {
      "epoch": 0.7589285714285714,
      "grad_norm": 0.9213479161262512,
      "learning_rate": 5.012813975177708e-05,
      "loss": 1.0275,
      "step": 595
    },
    {
      "epoch": 0.7653061224489796,
      "grad_norm": 1.1120542287826538,
      "learning_rate": 4.765951518305524e-05,
      "loss": 1.0286,
      "step": 600
    },
    {
      "epoch": 0.7716836734693877,
      "grad_norm": 1.146470308303833,
      "learning_rate": 4.524169380906042e-05,
      "loss": 1.0306,
      "step": 605
    },
    {
      "epoch": 0.7780612244897959,
      "grad_norm": 0.7507238388061523,
      "learning_rate": 4.287587586887991e-05,
      "loss": 1.0358,
      "step": 610
    },
    {
      "epoch": 0.7844387755102041,
      "grad_norm": 0.7958881258964539,
      "learning_rate": 4.056323578639599e-05,
      "loss": 1.0522,
      "step": 615
    },
    {
      "epoch": 0.7908163265306123,
      "grad_norm": 1.0853825807571411,
      "learning_rate": 3.83049215872865e-05,
      "loss": 1.0318,
      "step": 620
    },
    {
      "epoch": 0.7971938775510204,
      "grad_norm": 0.6897711157798767,
      "learning_rate": 3.610205432912914e-05,
      "loss": 1.0253,
      "step": 625
    },
    {
      "epoch": 0.8035714285714286,
      "grad_norm": 1.08404541015625,
      "learning_rate": 3.395572754489349e-05,
      "loss": 1.0536,
      "step": 630
    },
    {
      "epoch": 0.8099489795918368,
      "grad_norm": 0.9480798244476318,
      "learning_rate": 3.1867006700096625e-05,
      "loss": 1.0363,
      "step": 635
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 1.013339877128601,
      "learning_rate": 2.9836928663891517e-05,
      "loss": 1.0286,
      "step": 640
    },
    {
      "epoch": 0.8227040816326531,
      "grad_norm": 0.8233193755149841,
      "learning_rate": 2.7866501194351253e-05,
      "loss": 1.047,
      "step": 645
    },
    {
      "epoch": 0.8290816326530612,
      "grad_norm": 0.8080177903175354,
      "learning_rate": 2.595670243820461e-05,
      "loss": 1.0383,
      "step": 650
    },
    {
      "epoch": 0.8354591836734694,
      "grad_norm": 0.6761140823364258,
      "learning_rate": 2.4108480445270346e-05,
      "loss": 1.032,
      "step": 655
    },
    {
      "epoch": 0.8418367346938775,
      "grad_norm": 1.0132781267166138,
      "learning_rate": 2.2322752697833045e-05,
      "loss": 1.0381,
      "step": 660
    },
    {
      "epoch": 0.8482142857142857,
      "grad_norm": 2.3679418563842773,
      "learning_rate": 2.0600405655192166e-05,
      "loss": 1.035,
      "step": 665
    },
    {
      "epoch": 0.8545918367346939,
      "grad_norm": 1.0122100114822388,
      "learning_rate": 1.8942294313611974e-05,
      "loss": 1.0523,
      "step": 670
    },
    {
      "epoch": 0.860969387755102,
      "grad_norm": 0.797031819820404,
      "learning_rate": 1.7349241781889654e-05,
      "loss": 1.0225,
      "step": 675
    },
    {
      "epoch": 0.8673469387755102,
      "grad_norm": 0.6979250907897949,
      "learning_rate": 1.5822038872753166e-05,
      "loss": 1.023,
      "step": 680
    },
    {
      "epoch": 0.8737244897959183,
      "grad_norm": 0.5788841843605042,
      "learning_rate": 1.4361443710291149e-05,
      "loss": 1.0388,
      "step": 685
    },
    {
      "epoch": 0.8801020408163265,
      "grad_norm": 0.7020506858825684,
      "learning_rate": 1.2968181353609852e-05,
      "loss": 1.0148,
      "step": 690
    },
    {
      "epoch": 0.8864795918367347,
      "grad_norm": 0.6345797181129456,
      "learning_rate": 1.1642943436904111e-05,
      "loss": 1.0269,
      "step": 695
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.7500460147857666,
      "learning_rate": 1.038638782612073e-05,
      "loss": 1.0271,
      "step": 700
    },
    {
      "epoch": 0.8992346938775511,
      "grad_norm": 0.6654278635978699,
      "learning_rate": 9.199138292385006e-06,
      "loss": 1.0241,
      "step": 705
    },
    {
      "epoch": 0.9056122448979592,
      "grad_norm": 0.6786679029464722,
      "learning_rate": 8.081784202352404e-06,
      "loss": 1.0301,
      "step": 710
    },
    {
      "epoch": 0.9119897959183674,
      "grad_norm": 0.712835967540741,
      "learning_rate": 7.034880225638878e-06,
      "loss": 1.0355,
      "step": 715
    },
    {
      "epoch": 0.9183673469387755,
      "grad_norm": 0.691426694393158,
      "learning_rate": 6.058946059475595e-06,
      "loss": 1.0374,
      "step": 720
    },
    {
      "epoch": 0.9247448979591837,
      "grad_norm": 0.6497547626495361,
      "learning_rate": 5.154466170724031e-06,
      "loss": 1.03,
      "step": 725
    },
    {
      "epoch": 0.9311224489795918,
      "grad_norm": 0.7832013964653015,
      "learning_rate": 4.32188955538022e-06,
      "loss": 1.0222,
      "step": 730
    },
    {
      "epoch": 0.9375,
      "grad_norm": 1.1313321590423584,
      "learning_rate": 3.5616295156870754e-06,
      "loss": 1.0005,
      "step": 735
    },
    {
      "epoch": 0.9438775510204082,
      "grad_norm": 0.7574149966239929,
      "learning_rate": 2.874063454965558e-06,
      "loss": 1.0293,
      "step": 740
    },
    {
      "epoch": 0.9502551020408163,
      "grad_norm": 0.6553170680999756,
      "learning_rate": 2.2595326902666412e-06,
      "loss": 1.027,
      "step": 745
    },
    {
      "epoch": 0.9566326530612245,
      "grad_norm": 0.7936610579490662,
      "learning_rate": 1.7183422829368409e-06,
      "loss": 1.0301,
      "step": 750
    },
    {
      "epoch": 0.9630102040816326,
      "grad_norm": 0.6665956974029541,
      "learning_rate": 1.2507608871817566e-06,
      "loss": 1.0299,
      "step": 755
    },
    {
      "epoch": 0.9693877551020408,
      "grad_norm": 0.6638586521148682,
      "learning_rate": 8.570206167024807e-07,
      "loss": 1.0108,
      "step": 760
    },
    {
      "epoch": 0.9757653061224489,
      "grad_norm": 0.6911438703536987,
      "learning_rate": 5.37316929471121e-07,
      "loss": 1.0219,
      "step": 765
    },
    {
      "epoch": 0.9821428571428571,
      "grad_norm": 0.6120124459266663,
      "learning_rate": 2.9180853070303044e-07,
      "loss": 1.0279,
      "step": 770
    },
    {
      "epoch": 0.9885204081632653,
      "grad_norm": 0.6324964165687561,
      "learning_rate": 1.2061729407324416e-07,
      "loss": 1.0428,
      "step": 775
    },
    {
      "epoch": 0.9948979591836735,
      "grad_norm": 0.5991183519363403,
      "learning_rate": 2.3828201216774533e-08,
      "loss": 1.01,
      "step": 780
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.388667106628418,
      "eval_runtime": 1.6706,
      "eval_samples_per_second": 5.986,
      "eval_steps_per_second": 1.197,
      "step": 784
    },
    {
      "epoch": 1.0,
      "step": 784,
      "total_flos": 5.976388653306348e+17,
      "train_loss": 2.8103741881798725,
      "train_runtime": 4265.0133,
      "train_samples_per_second": 2.94,
      "train_steps_per_second": 0.184
    }
  ],
  "logging_steps": 5,
  "max_steps": 784,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.976388653306348e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}