|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.127659574468085, |
|
"eval_steps": 200, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010638297872340425, |
|
"eval_loss": 2.7702033519744873, |
|
"eval_runtime": 30.6604, |
|
"eval_samples_per_second": 48.956, |
|
"eval_steps_per_second": 6.132, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"grad_norm": 5.40625, |
|
"learning_rate": 8e-05, |
|
"loss": 2.0559, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.00016, |
|
"loss": 2.0743, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 5.75, |
|
"learning_rate": 0.00019994532573409262, |
|
"loss": 2.4587, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 0.00019950829025450114, |
|
"loss": 2.6661, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"grad_norm": 6.75, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 2.8731, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 0.0001973326597248006, |
|
"loss": 2.8111, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7446808510638298, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 0.00019560357815343577, |
|
"loss": 2.8978, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 0.0001934564464599461, |
|
"loss": 2.9503, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"grad_norm": 10.125, |
|
"learning_rate": 0.00019090065350491626, |
|
"loss": 2.9875, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 2.1433, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 0.00018460952524209355, |
|
"loss": 1.4928, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 1.6588, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3829787234042552, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 0.00017684011108568592, |
|
"loss": 1.5762, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 0.00017244252047910892, |
|
"loss": 1.6862, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 1.6834, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 0.0001627176358473537, |
|
"loss": 1.6762, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.8085106382978724, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 0.00015743286626829437, |
|
"loss": 1.7259, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 0.00015189695737812152, |
|
"loss": 1.8411, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.021276595744681, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 0.0001461341162978688, |
|
"loss": 1.5208, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"grad_norm": 2.75, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 0.6512, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"eval_loss": 3.396796941757202, |
|
"eval_runtime": 28.9515, |
|
"eval_samples_per_second": 51.845, |
|
"eval_steps_per_second": 6.494, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4755282835996672e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|