{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9865470852017937,
  "eval_steps": 500,
  "global_step": 55,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017937219730941704,
      "grad_norm": 0.6536183953285217,
      "learning_rate": 4.995922759815339e-05,
      "loss": 0.8371,
      "num_input_tokens_seen": 2097152,
      "step": 1
    },
    {
      "epoch": 0.03587443946188341,
      "grad_norm": 0.517680823802948,
      "learning_rate": 4.9837043383713753e-05,
      "loss": 0.7804,
      "num_input_tokens_seen": 4194304,
      "step": 2
    },
    {
      "epoch": 0.053811659192825115,
      "grad_norm": 0.4423481225967407,
      "learning_rate": 4.963384589619233e-05,
      "loss": 0.7695,
      "num_input_tokens_seen": 6291456,
      "step": 3
    },
    {
      "epoch": 0.07174887892376682,
      "grad_norm": 0.39828750491142273,
      "learning_rate": 4.935029792355834e-05,
      "loss": 0.7419,
      "num_input_tokens_seen": 8388608,
      "step": 4
    },
    {
      "epoch": 0.08968609865470852,
      "grad_norm": 0.31201115250587463,
      "learning_rate": 4.898732434036244e-05,
      "loss": 0.7166,
      "num_input_tokens_seen": 10485760,
      "step": 5
    },
    {
      "epoch": 0.10762331838565023,
      "grad_norm": 0.2536958158016205,
      "learning_rate": 4.854610909098812e-05,
      "loss": 0.7194,
      "num_input_tokens_seen": 12582912,
      "step": 6
    },
    {
      "epoch": 0.12556053811659193,
      "grad_norm": 0.2193588763475418,
      "learning_rate": 4.802809132787125e-05,
      "loss": 0.6975,
      "num_input_tokens_seen": 14680064,
      "step": 7
    },
    {
      "epoch": 0.14349775784753363,
      "grad_norm": 0.18916621804237366,
      "learning_rate": 4.743496071728396e-05,
      "loss": 0.7168,
      "num_input_tokens_seen": 16777216,
      "step": 8
    },
    {
      "epoch": 0.16143497757847533,
      "grad_norm": 0.1561172604560852,
      "learning_rate": 4.6768651927994434e-05,
      "loss": 0.6707,
      "num_input_tokens_seen": 18874368,
      "step": 9
    },
    {
      "epoch": 0.17937219730941703,
      "grad_norm": 0.12857139110565186,
      "learning_rate": 4.6031338320779534e-05,
      "loss": 0.6769,
      "num_input_tokens_seen": 20971520,
      "step": 10
    },
    {
      "epoch": 0.19730941704035873,
      "grad_norm": 0.11340289562940598,
      "learning_rate": 4.522542485937369e-05,
      "loss": 0.6873,
      "num_input_tokens_seen": 23068672,
      "step": 11
    },
    {
      "epoch": 0.21524663677130046,
      "grad_norm": 0.10658581554889679,
      "learning_rate": 4.4353540265977064e-05,
      "loss": 0.6643,
      "num_input_tokens_seen": 25165824,
      "step": 12
    },
    {
      "epoch": 0.23318385650224216,
      "grad_norm": 0.08937722444534302,
      "learning_rate": 4.341852844691012e-05,
      "loss": 0.6849,
      "num_input_tokens_seen": 27262976,
      "step": 13
    },
    {
      "epoch": 0.25112107623318386,
      "grad_norm": 0.07756289094686508,
      "learning_rate": 4.242343921638234e-05,
      "loss": 0.6461,
      "num_input_tokens_seen": 29360128,
      "step": 14
    },
    {
      "epoch": 0.26905829596412556,
      "grad_norm": 0.07581546157598495,
      "learning_rate": 4.137151834863213e-05,
      "loss": 0.6623,
      "num_input_tokens_seen": 31457280,
      "step": 15
    },
    {
      "epoch": 0.28699551569506726,
      "grad_norm": 0.07386067509651184,
      "learning_rate": 4.0266196990885955e-05,
      "loss": 0.6751,
      "num_input_tokens_seen": 33554432,
      "step": 16
    },
    {
      "epoch": 0.30493273542600896,
      "grad_norm": 0.06293580681085587,
      "learning_rate": 3.911108047166924e-05,
      "loss": 0.6472,
      "num_input_tokens_seen": 35651584,
      "step": 17
    },
    {
      "epoch": 0.32286995515695066,
      "grad_norm": 0.06199085712432861,
      "learning_rate": 3.790993654097405e-05,
      "loss": 0.6728,
      "num_input_tokens_seen": 37748736,
      "step": 18
    },
    {
      "epoch": 0.34080717488789236,
      "grad_norm": 0.060734592378139496,
      "learning_rate": 3.6666683080641846e-05,
      "loss": 0.7017,
      "num_input_tokens_seen": 39845888,
      "step": 19
    },
    {
      "epoch": 0.35874439461883406,
      "grad_norm": 0.05623164027929306,
      "learning_rate": 3.5385375325047166e-05,
      "loss": 0.6502,
      "num_input_tokens_seen": 41943040,
      "step": 20
    },
    {
      "epoch": 0.37668161434977576,
      "grad_norm": 0.0574677549302578,
      "learning_rate": 3.4070192633766025e-05,
      "loss": 0.6476,
      "num_input_tokens_seen": 44040192,
      "step": 21
    },
    {
      "epoch": 0.39461883408071746,
      "grad_norm": 0.05185185372829437,
      "learning_rate": 3.272542485937369e-05,
      "loss": 0.6411,
      "num_input_tokens_seen": 46137344,
      "step": 22
    },
    {
      "epoch": 0.4125560538116592,
      "grad_norm": 0.05139186978340149,
      "learning_rate": 3.135545835483718e-05,
      "loss": 0.6428,
      "num_input_tokens_seen": 48234496,
      "step": 23
    },
    {
      "epoch": 0.4304932735426009,
      "grad_norm": 0.050159115344285965,
      "learning_rate": 2.996476166614364e-05,
      "loss": 0.6661,
      "num_input_tokens_seen": 50331648,
      "step": 24
    },
    {
      "epoch": 0.4484304932735426,
      "grad_norm": 0.04851464927196503,
      "learning_rate": 2.8557870956832132e-05,
      "loss": 0.6378,
      "num_input_tokens_seen": 52428800,
      "step": 25
    },
    {
      "epoch": 0.4663677130044843,
      "grad_norm": 0.04896726831793785,
      "learning_rate": 2.7139375211970996e-05,
      "loss": 0.6532,
      "num_input_tokens_seen": 54525952,
      "step": 26
    },
    {
      "epoch": 0.484304932735426,
      "grad_norm": 0.04698600620031357,
      "learning_rate": 2.5713901269842404e-05,
      "loss": 0.6403,
      "num_input_tokens_seen": 56623104,
      "step": 27
    },
    {
      "epoch": 0.5022421524663677,
      "grad_norm": 0.048034097999334335,
      "learning_rate": 2.42860987301576e-05,
      "loss": 0.6248,
      "num_input_tokens_seen": 58720256,
      "step": 28
    },
    {
      "epoch": 0.5201793721973094,
      "grad_norm": 0.044828303158283234,
      "learning_rate": 2.2860624788029013e-05,
      "loss": 0.6583,
      "num_input_tokens_seen": 60817408,
      "step": 29
    },
    {
      "epoch": 0.5381165919282511,
      "grad_norm": 0.04563640430569649,
      "learning_rate": 2.1442129043167874e-05,
      "loss": 0.6579,
      "num_input_tokens_seen": 62914560,
      "step": 30
    },
    {
      "epoch": 0.5560538116591929,
      "grad_norm": 0.044318560510873795,
      "learning_rate": 2.003523833385637e-05,
      "loss": 0.6659,
      "num_input_tokens_seen": 65011712,
      "step": 31
    },
    {
      "epoch": 0.5739910313901345,
      "grad_norm": 0.04331167787313461,
      "learning_rate": 1.8644541645162834e-05,
      "loss": 0.6423,
      "num_input_tokens_seen": 67108864,
      "step": 32
    },
    {
      "epoch": 0.5919282511210763,
      "grad_norm": 0.04475367069244385,
      "learning_rate": 1.7274575140626318e-05,
      "loss": 0.6509,
      "num_input_tokens_seen": 69206016,
      "step": 33
    },
    {
      "epoch": 0.6098654708520179,
      "grad_norm": 0.045547887682914734,
      "learning_rate": 1.5929807366233977e-05,
      "loss": 0.6551,
      "num_input_tokens_seen": 71303168,
      "step": 34
    },
    {
      "epoch": 0.6278026905829597,
      "grad_norm": 0.043985530734062195,
      "learning_rate": 1.4614624674952842e-05,
      "loss": 0.6232,
      "num_input_tokens_seen": 73400320,
      "step": 35
    },
    {
      "epoch": 0.6457399103139013,
      "grad_norm": 0.0414094403386116,
      "learning_rate": 1.3333316919358157e-05,
      "loss": 0.6137,
      "num_input_tokens_seen": 75497472,
      "step": 36
    },
    {
      "epoch": 0.6636771300448431,
      "grad_norm": 0.041019294410943985,
      "learning_rate": 1.2090063459025955e-05,
      "loss": 0.6426,
      "num_input_tokens_seen": 77594624,
      "step": 37
    },
    {
      "epoch": 0.6816143497757847,
      "grad_norm": 0.04383592680096626,
      "learning_rate": 1.0888919528330777e-05,
      "loss": 0.6512,
      "num_input_tokens_seen": 79691776,
      "step": 38
    },
    {
      "epoch": 0.6995515695067265,
      "grad_norm": 0.040539514273405075,
      "learning_rate": 9.733803009114045e-06,
      "loss": 0.6269,
      "num_input_tokens_seen": 81788928,
      "step": 39
    },
    {
      "epoch": 0.7174887892376681,
      "grad_norm": 0.04238974675536156,
      "learning_rate": 8.628481651367876e-06,
      "loss": 0.6201,
      "num_input_tokens_seen": 83886080,
      "step": 40
    },
    {
      "epoch": 0.7354260089686099,
      "grad_norm": 0.04115669056773186,
      "learning_rate": 7.576560783617668e-06,
      "loss": 0.642,
      "num_input_tokens_seen": 85983232,
      "step": 41
    },
    {
      "epoch": 0.7533632286995515,
      "grad_norm": 0.04178008437156677,
      "learning_rate": 6.5814715530898745e-06,
      "loss": 0.648,
      "num_input_tokens_seen": 88080384,
      "step": 42
    },
    {
      "epoch": 0.7713004484304933,
      "grad_norm": 0.04329155012965202,
      "learning_rate": 5.646459734022938e-06,
      "loss": 0.6442,
      "num_input_tokens_seen": 90177536,
      "step": 43
    },
    {
      "epoch": 0.7892376681614349,
      "grad_norm": 0.043740272521972656,
      "learning_rate": 4.7745751406263165e-06,
      "loss": 0.6488,
      "num_input_tokens_seen": 92274688,
      "step": 44
    },
    {
      "epoch": 0.8071748878923767,
      "grad_norm": 0.04263562709093094,
      "learning_rate": 3.968661679220468e-06,
      "loss": 0.65,
      "num_input_tokens_seen": 94371840,
      "step": 45
    },
    {
      "epoch": 0.8251121076233184,
      "grad_norm": 0.041693028062582016,
      "learning_rate": 3.2313480720055745e-06,
      "loss": 0.6584,
      "num_input_tokens_seen": 96468992,
      "step": 46
    },
    {
      "epoch": 0.8430493273542601,
      "grad_norm": 0.04151754826307297,
      "learning_rate": 2.565039282716045e-06,
      "loss": 0.6392,
      "num_input_tokens_seen": 98566144,
      "step": 47
    },
    {
      "epoch": 0.8609865470852018,
      "grad_norm": 0.04260968416929245,
      "learning_rate": 1.97190867212875e-06,
      "loss": 0.6524,
      "num_input_tokens_seen": 100663296,
      "step": 48
    },
    {
      "epoch": 0.8789237668161435,
      "grad_norm": 0.04022514820098877,
      "learning_rate": 1.4538909090118846e-06,
      "loss": 0.6276,
      "num_input_tokens_seen": 102760448,
      "step": 49
    },
    {
      "epoch": 0.8968609865470852,
      "grad_norm": 0.039072513580322266,
      "learning_rate": 1.0126756596375686e-06,
      "loss": 0.6282,
      "num_input_tokens_seen": 104857600,
      "step": 50
    },
    {
      "epoch": 0.9147982062780269,
      "grad_norm": 0.03952722251415253,
      "learning_rate": 6.497020764416633e-07,
      "loss": 0.6344,
      "num_input_tokens_seen": 106954752,
      "step": 51
    },
    {
      "epoch": 0.9327354260089686,
      "grad_norm": 0.04045777767896652,
      "learning_rate": 3.6615410380767544e-07,
      "loss": 0.6464,
      "num_input_tokens_seen": 109051904,
      "step": 52
    },
    {
      "epoch": 0.9506726457399103,
      "grad_norm": 0.03984501212835312,
      "learning_rate": 1.6295661628624447e-07,
      "loss": 0.6253,
      "num_input_tokens_seen": 111149056,
      "step": 53
    },
    {
      "epoch": 0.968609865470852,
      "grad_norm": 0.040761884301900864,
      "learning_rate": 4.07724018466088e-08,
      "loss": 0.6375,
      "num_input_tokens_seen": 113246208,
      "step": 54
    },
    {
      "epoch": 0.9865470852017937,
      "grad_norm": 0.04142209142446518,
      "learning_rate": 0.0,
      "loss": 0.6419,
      "num_input_tokens_seen": 115343360,
      "step": 55
    },
    {
      "epoch": 0.9865470852017937,
      "num_input_tokens_seen": 115343360,
      "step": 55,
      "total_flos": 5.104238176512246e+18,
      "train_loss": 0.6637221011248502,
      "train_runtime": 9208.1472,
      "train_samples_per_second": 3.097,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 55,
  "num_input_tokens_seen": 115343360,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.104238176512246e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}