neginr committed on
Commit 72828bc · verified · 1 Parent(s): 3223b16

End of training
Files changed (5)
  1. README.md +2 -1
  2. all_results.json +8 -0
  3. train_results.json +8 -0
  4. trainer_state.json +1057 -0
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: Qwen/Qwen2.5-7B-Instruct
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: no_pipeline_science_100k
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # no_pipeline_science_100k
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/no_pipeline_science_100k dataset.
 
 ## Model description
 
all_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 4.937369519832985,
+    "total_flos": 3.738141667979428e+18,
+    "train_loss": 0.2130410626016814,
+    "train_runtime": 6079.0514,
+    "train_samples_per_second": 12.591,
+    "train_steps_per_second": 0.024
+}
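As a quick cross-check (an editor's sketch, not part of the commit), the aggregate metrics above are internally consistent: runtime times steps-per-second recovers the 145 optimizer steps recorded in trainer_state.json, and runtime times samples-per-second gives the total samples seen, which in turn implies the effective samples per optimizer step:

```python
# Cross-check of the aggregate metrics reported in all_results.json.
metrics = {
    "train_runtime": 6079.0514,           # seconds
    "train_samples_per_second": 12.591,
    "train_steps_per_second": 0.024,
}

# Optimizer steps implied by throughput; trainer_state.json logs 145 steps
# (the reported train_steps_per_second is rounded, so this is approximate).
steps = metrics["train_runtime"] * metrics["train_steps_per_second"]
print(round(steps))  # 146, matching the 145 logged steps up to rounding

# Total samples seen during training, and the implied effective number of
# samples consumed per optimizer step.
samples = metrics["train_runtime"] * metrics["train_samples_per_second"]
print(round(samples / steps))
```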
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 4.937369519832985,
+    "total_flos": 3.738141667979428e+18,
+    "train_loss": 0.2130410626016814,
+    "train_runtime": 6079.0514,
+    "train_samples_per_second": 12.591,
+    "train_steps_per_second": 0.024
+}
trainer_state.json ADDED
@@ -0,0 +1,1057 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.937369519832985,
+  "eval_steps": 500,
+  "global_step": 145,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.033402922755741124,
+      "grad_norm": 7.154098089649519,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 1.2049,
+      "step": 1
+    },
+    {
+      "epoch": 0.06680584551148225,
+      "grad_norm": 7.224671814367719,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 1.2046,
+      "step": 2
+    },
+    {
+      "epoch": 0.10020876826722339,
+      "grad_norm": 5.112010906482035,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.1276,
+      "step": 3
+    },
+    {
+      "epoch": 0.1336116910229645,
+      "grad_norm": 5.240191362417293,
+      "learning_rate": 2.1333333333333335e-05,
+      "loss": 1.0958,
+      "step": 4
+    },
+    {
+      "epoch": 0.16701461377870563,
+      "grad_norm": 4.398708169023894,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.0242,
+      "step": 5
+    },
+    {
+      "epoch": 0.20041753653444677,
+      "grad_norm": 4.9473080352678895,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.0277,
+      "step": 6
+    },
+    {
+      "epoch": 0.23382045929018788,
+      "grad_norm": 3.8837845230573835,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 0.9755,
+      "step": 7
+    },
+    {
+      "epoch": 0.267223382045929,
+      "grad_norm": 2.957475416120971,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 0.9334,
+      "step": 8
+    },
+    {
+      "epoch": 0.30062630480167013,
+      "grad_norm": 2.187999537773017,
+      "learning_rate": 4.8e-05,
+      "loss": 0.9211,
+      "step": 9
+    },
+    {
+      "epoch": 0.33402922755741127,
+      "grad_norm": 2.3155958654718614,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 0.8983,
+      "step": 10
+    },
+    {
+      "epoch": 0.3674321503131524,
+      "grad_norm": 2.8444701447464436,
+      "learning_rate": 5.8666666666666665e-05,
+      "loss": 0.8975,
+      "step": 11
+    },
+    {
+      "epoch": 0.40083507306889354,
+      "grad_norm": 3.0982586639870213,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 0.8856,
+      "step": 12
+    },
+    {
+      "epoch": 0.4342379958246347,
+      "grad_norm": 1.8874954111166966,
+      "learning_rate": 6.933333333333334e-05,
+      "loss": 0.872,
+      "step": 13
+    },
+    {
+      "epoch": 0.46764091858037576,
+      "grad_norm": 3.038189077699479,
+      "learning_rate": 7.466666666666667e-05,
+      "loss": 0.8892,
+      "step": 14
+    },
+    {
+      "epoch": 0.5010438413361169,
+      "grad_norm": 1.7381979961116139,
+      "learning_rate": 8e-05,
+      "loss": 0.8535,
+      "step": 15
+    },
+    {
+      "epoch": 0.534446764091858,
+      "grad_norm": 2188.323143153608,
+      "learning_rate": 7.998832056320773e-05,
+      "loss": 1.0923,
+      "step": 16
+    },
+    {
+      "epoch": 0.5678496868475992,
+      "grad_norm": 5.568753243553315,
+      "learning_rate": 7.995328907329308e-05,
+      "loss": 0.9433,
+      "step": 17
+    },
+    {
+      "epoch": 0.6012526096033403,
+      "grad_norm": 3.003707178203899,
+      "learning_rate": 7.989492598765966e-05,
+      "loss": 0.8783,
+      "step": 18
+    },
+    {
+      "epoch": 0.6346555323590815,
+      "grad_norm": 3.3787729580945367,
+      "learning_rate": 7.981326538870596e-05,
+      "loss": 0.8657,
+      "step": 19
+    },
+    {
+      "epoch": 0.6680584551148225,
+      "grad_norm": 3.0096085489081923,
+      "learning_rate": 7.970835496392216e-05,
+      "loss": 0.8705,
+      "step": 20
+    },
+    {
+      "epoch": 0.7014613778705637,
+      "grad_norm": 2.2251853165136186,
+      "learning_rate": 7.958025597804205e-05,
+      "loss": 0.8591,
+      "step": 21
+    },
+    {
+      "epoch": 0.7348643006263048,
+      "grad_norm": 1.393655838420696,
+      "learning_rate": 7.942904323726604e-05,
+      "loss": 0.8202,
+      "step": 22
+    },
+    {
+      "epoch": 0.7682672233820459,
+      "grad_norm": 1.709595774468825,
+      "learning_rate": 7.925480504557654e-05,
+      "loss": 0.8239,
+      "step": 23
+    },
+    {
+      "epoch": 0.8016701461377871,
+      "grad_norm": 1.04923819217978,
+      "learning_rate": 7.90576431531709e-05,
+      "loss": 0.8236,
+      "step": 24
+    },
+    {
+      "epoch": 0.8350730688935282,
+      "grad_norm": 1.4676306786694173,
+      "learning_rate": 7.883767269704209e-05,
+      "loss": 0.8083,
+      "step": 25
+    },
+    {
+      "epoch": 0.8684759916492694,
+      "grad_norm": 417.78005391043996,
+      "learning_rate": 7.859502213374207e-05,
+      "loss": 1.1719,
+      "step": 26
+    },
+    {
+      "epoch": 0.9018789144050104,
+      "grad_norm": 12.203128331470229,
+      "learning_rate": 7.832983316436666e-05,
+      "loss": 0.8597,
+      "step": 27
+    },
+    {
+      "epoch": 0.9352818371607515,
+      "grad_norm": 4.957392164656034,
+      "learning_rate": 7.804226065180615e-05,
+      "loss": 0.9382,
+      "step": 28
+    },
+    {
+      "epoch": 0.9686847599164927,
+      "grad_norm": 10.61998316626802,
+      "learning_rate": 7.773247253030973e-05,
+      "loss": 0.9939,
+      "step": 29
+    },
+    {
+      "epoch": 1.0083507306889352,
+      "grad_norm": 54.87377316789775,
+      "learning_rate": 7.740064970741661e-05,
+      "loss": 0.8724,
+      "step": 30
+    },
+    {
+      "epoch": 1.0417536534446765,
+      "grad_norm": 88.85368950052036,
+      "learning_rate": 7.704698595831107e-05,
+      "loss": 0.9805,
+      "step": 31
+    },
+    {
+      "epoch": 1.0751565762004176,
+      "grad_norm": 12.07917493536492,
+      "learning_rate": 7.667168781266331e-05,
+      "loss": 0.9689,
+      "step": 32
+    },
+    {
+      "epoch": 1.1085594989561587,
+      "grad_norm": 30.730151095536375,
+      "learning_rate": 7.627497443402182e-05,
+      "loss": 1.0908,
+      "step": 33
+    },
+    {
+      "epoch": 1.1419624217118998,
+      "grad_norm": 4.303465706734318,
+      "learning_rate": 7.585707749182816e-05,
+      "loss": 0.8883,
+      "step": 34
+    },
+    {
+      "epoch": 1.1753653444676408,
+      "grad_norm": 1.3597826636785535,
+      "learning_rate": 7.541824102612839e-05,
+      "loss": 0.8376,
+      "step": 35
+    },
+    {
+      "epoch": 1.2087682672233822,
+      "grad_norm": 2.028071696746282,
+      "learning_rate": 7.495872130506072e-05,
+      "loss": 0.8018,
+      "step": 36
+    },
+    {
+      "epoch": 1.2421711899791232,
+      "grad_norm": 1.5668710744698326,
+      "learning_rate": 7.447878667520198e-05,
+      "loss": 0.7901,
+      "step": 37
+    },
+    {
+      "epoch": 1.2755741127348643,
+      "grad_norm": 0.9166659557892114,
+      "learning_rate": 7.397871740486085e-05,
+      "loss": 0.7699,
+      "step": 38
+    },
+    {
+      "epoch": 1.3089770354906054,
+      "grad_norm": 8.207728871032339,
+      "learning_rate": 7.345880552040907e-05,
+      "loss": 0.7735,
+      "step": 39
+    },
+    {
+      "epoch": 1.3423799582463465,
+      "grad_norm": 2.508524585534657,
+      "learning_rate": 7.291935463574626e-05,
+      "loss": 0.8447,
+      "step": 40
+    },
+    {
+      "epoch": 1.3757828810020878,
+      "grad_norm": 1.364586902470756,
+      "learning_rate": 7.236067977499791e-05,
+      "loss": 0.7856,
+      "step": 41
+    },
+    {
+      "epoch": 1.4091858037578289,
+      "grad_norm": 1.8720653087352772,
+      "learning_rate": 7.178310718855018e-05,
+      "loss": 0.7829,
+      "step": 42
+    },
+    {
+      "epoch": 1.44258872651357,
+      "grad_norm": 1.8178558573775088,
+      "learning_rate": 7.11869741625289e-05,
+      "loss": 0.7737,
+      "step": 43
+    },
+    {
+      "epoch": 1.475991649269311,
+      "grad_norm": 1.6638629849138615,
+      "learning_rate": 7.057262882183393e-05,
+      "loss": 0.7737,
+      "step": 44
+    },
+    {
+      "epoch": 1.5093945720250521,
+      "grad_norm": 1.1958951695778888,
+      "learning_rate": 6.994042992684406e-05,
+      "loss": 0.7499,
+      "step": 45
+    },
+    {
+      "epoch": 1.5427974947807934,
+      "grad_norm": 1.1237749762548175,
+      "learning_rate": 6.929074666391095e-05,
+      "loss": 0.7457,
+      "step": 46
+    },
+    {
+      "epoch": 1.5762004175365343,
+      "grad_norm": 0.9523363043794499,
+      "learning_rate": 6.862395842976484e-05,
+      "loss": 0.7449,
+      "step": 47
+    },
+    {
+      "epoch": 1.6096033402922756,
+      "grad_norm": 0.7794493625394828,
+      "learning_rate": 6.79404546099575e-05,
+      "loss": 0.7471,
+      "step": 48
+    },
+    {
+      "epoch": 1.6430062630480167,
+      "grad_norm": 2.1007938107258224,
+      "learning_rate": 6.724063435147189e-05,
+      "loss": 0.738,
+      "step": 49
+    },
+    {
+      "epoch": 1.6764091858037578,
+      "grad_norm": 0.8397590052899939,
+      "learning_rate": 6.652490632963182e-05,
+      "loss": 0.7366,
+      "step": 50
+    },
+    {
+      "epoch": 1.709812108559499,
+      "grad_norm": 1.5244169148841136,
+      "learning_rate": 6.579368850944683e-05,
+      "loss": 0.7518,
+      "step": 51
+    },
+    {
+      "epoch": 1.7432150313152401,
+      "grad_norm": 0.97500410064134,
+      "learning_rate": 6.504740790153255e-05,
+      "loss": 0.7365,
+      "step": 52
+    },
+    {
+      "epoch": 1.7766179540709812,
+      "grad_norm": 1.833109071141926,
+      "learning_rate": 6.428650031274845e-05,
+      "loss": 0.7327,
+      "step": 53
+    },
+    {
+      "epoch": 1.8100208768267223,
+      "grad_norm": 1.4707510085946327,
+      "learning_rate": 6.351141009169893e-05,
+      "loss": 0.7227,
+      "step": 54
+    },
+    {
+      "epoch": 1.8434237995824634,
+      "grad_norm": 1.2363917202252765,
+      "learning_rate": 6.272258986924624e-05,
+      "loss": 0.7405,
+      "step": 55
+    },
+    {
+      "epoch": 1.8768267223382047,
+      "grad_norm": 1.0298920741813498,
+      "learning_rate": 6.192050029418682e-05,
+      "loss": 0.7241,
+      "step": 56
+    },
+    {
+      "epoch": 1.9102296450939458,
+      "grad_norm": 0.9097363351471279,
+      "learning_rate": 6.110560976424531e-05,
+      "loss": 0.7167,
+      "step": 57
+    },
+    {
+      "epoch": 1.9436325678496869,
+      "grad_norm": 0.8471695952793523,
+      "learning_rate": 6.027839415254362e-05,
+      "loss": 0.7181,
+      "step": 58
+    },
+    {
+      "epoch": 1.977035490605428,
+      "grad_norm": 0.6602662698524506,
+      "learning_rate": 5.943933652970424e-05,
+      "loss": 0.7088,
+      "step": 59
+    },
+    {
+      "epoch": 2.0167014613778704,
+      "grad_norm": 0.624041177687339,
+      "learning_rate": 5.858892688175075e-05,
+      "loss": 0.6922,
+      "step": 60
+    },
+    {
+      "epoch": 2.0501043841336117,
+      "grad_norm": 0.731560229530671,
+      "learning_rate": 5.772766182396966e-05,
+      "loss": 0.6655,
+      "step": 61
+    },
+    {
+      "epoch": 2.083507306889353,
+      "grad_norm": 0.5160825456760252,
+      "learning_rate": 5.685604431090117e-05,
+      "loss": 0.6624,
+      "step": 62
+    },
+    {
+      "epoch": 2.116910229645094,
+      "grad_norm": 0.6466642583190281,
+      "learning_rate": 5.597458334262782e-05,
+      "loss": 0.6474,
+      "step": 63
+    },
+    {
+      "epoch": 2.150313152400835,
+      "grad_norm": 0.6905839273768964,
+      "learning_rate": 5.508379366753282e-05,
+      "loss": 0.6512,
+      "step": 64
+    },
+    {
+      "epoch": 2.183716075156576,
+      "grad_norm": 0.3760316450742919,
+      "learning_rate": 5.4184195481701425e-05,
+      "loss": 0.6523,
+      "step": 65
+    },
+    {
+      "epoch": 2.2171189979123174,
+      "grad_norm": 0.606234562718693,
+      "learning_rate": 5.3276314125141144e-05,
+      "loss": 0.6487,
+      "step": 66
+    },
+    {
+      "epoch": 2.2505219206680582,
+      "grad_norm": 0.44809718292050676,
+      "learning_rate": 5.23606797749979e-05,
+      "loss": 0.649,
+      "step": 67
+    },
+    {
+      "epoch": 2.2839248434237995,
+      "grad_norm": 0.40244410097202155,
+      "learning_rate": 5.1437827135947566e-05,
+      "loss": 0.6468,
+      "step": 68
+    },
+    {
+      "epoch": 2.317327766179541,
+      "grad_norm": 0.359719180741915,
+      "learning_rate": 5.050829512794348e-05,
+      "loss": 0.6409,
+      "step": 69
+    },
+    {
+      "epoch": 2.3507306889352817,
+      "grad_norm": 0.40415638024369727,
+      "learning_rate": 4.9572626571502316e-05,
+      "loss": 0.639,
+      "step": 70
+    },
+    {
+      "epoch": 2.384133611691023,
+      "grad_norm": 0.3340843503248373,
+      "learning_rate": 4.8631367870712254e-05,
+      "loss": 0.6326,
+      "step": 71
+    },
+    {
+      "epoch": 2.4175365344467643,
+      "grad_norm": 0.3262882595570267,
+      "learning_rate": 4.768506869414834e-05,
+      "loss": 0.6298,
+      "step": 72
+    },
+    {
+      "epoch": 2.450939457202505,
+      "grad_norm": 0.3253891492249243,
+      "learning_rate": 4.6734281653881536e-05,
+      "loss": 0.6326,
+      "step": 73
+    },
+    {
+      "epoch": 2.4843423799582465,
+      "grad_norm": 0.35311540573233735,
+      "learning_rate": 4.577956198276886e-05,
+      "loss": 0.6291,
+      "step": 74
+    },
+    {
+      "epoch": 2.5177453027139873,
+      "grad_norm": 0.3440383701499157,
+      "learning_rate": 4.4821467210212924e-05,
+      "loss": 0.6332,
+      "step": 75
+    },
+    {
+      "epoch": 2.5511482254697286,
+      "grad_norm": 0.30978369513311105,
+      "learning_rate": 4.386055683658061e-05,
+      "loss": 0.6408,
+      "step": 76
+    },
+    {
+      "epoch": 2.5845511482254695,
+      "grad_norm": 0.3823149004105222,
+      "learning_rate": 4.2897392006470503e-05,
+      "loss": 0.6246,
+      "step": 77
+    },
+    {
+      "epoch": 2.617954070981211,
+      "grad_norm": 0.2810880790539587,
+      "learning_rate": 4.1932535181020286e-05,
+      "loss": 0.6293,
+      "step": 78
+    },
+    {
+      "epoch": 2.651356993736952,
+      "grad_norm": 0.2835535239751324,
+      "learning_rate": 4.096654980944529e-05,
+      "loss": 0.6252,
+      "step": 79
+    },
+    {
+      "epoch": 2.684759916492693,
+      "grad_norm": 0.336833154001104,
+      "learning_rate": 4e-05,
+      "loss": 0.6305,
+      "step": 80
+    },
+    {
+      "epoch": 2.7181628392484343,
+      "grad_norm": 0.23274589850456745,
+      "learning_rate": 3.903345019055472e-05,
+      "loss": 0.6298,
+      "step": 81
+    },
+    {
+      "epoch": 2.7515657620041756,
+      "grad_norm": 0.2420684628004819,
+      "learning_rate": 3.806746481897973e-05,
+      "loss": 0.6241,
+      "step": 82
+    },
+    {
+      "epoch": 2.7849686847599164,
+      "grad_norm": 0.23622928619950834,
+      "learning_rate": 3.710260799352951e-05,
+      "loss": 0.6167,
+      "step": 83
+    },
+    {
+      "epoch": 2.8183716075156577,
+      "grad_norm": 0.21286687906297902,
+      "learning_rate": 3.6139443163419394e-05,
+      "loss": 0.6268,
+      "step": 84
+    },
+    {
+      "epoch": 2.8517745302713986,
+      "grad_norm": 0.20113400910479923,
+      "learning_rate": 3.517853278978708e-05,
+      "loss": 0.622,
+      "step": 85
+    },
+    {
+      "epoch": 2.88517745302714,
+      "grad_norm": 0.19296938971649688,
+      "learning_rate": 3.422043801723116e-05,
+      "loss": 0.6167,
+      "step": 86
+    },
+    {
+      "epoch": 2.9185803757828808,
+      "grad_norm": 0.17640926051127553,
+      "learning_rate": 3.3265718346118464e-05,
+      "loss": 0.6251,
+      "step": 87
+    },
+    {
+      "epoch": 2.951983298538622,
+      "grad_norm": 0.17760201524918323,
+      "learning_rate": 3.231493130585167e-05,
+      "loss": 0.6195,
+      "step": 88
+    },
+    {
+      "epoch": 2.9853862212943634,
+      "grad_norm": 0.18267169419590248,
+      "learning_rate": 3.136863212928776e-05,
+      "loss": 0.6214,
+      "step": 89
+    },
+    {
+      "epoch": 3.1002087682672235,
+      "grad_norm": 0.2479134339023779,
+      "learning_rate": 3.0427373428497704e-05,
+      "loss": 0.5792,
+      "step": 90
+    },
+    {
+      "epoch": 3.1336116910229643,
+      "grad_norm": 0.17829804990091588,
+      "learning_rate": 2.9491704872056525e-05,
+      "loss": 0.571,
+      "step": 91
+    },
+    {
+      "epoch": 3.1670146137787056,
+      "grad_norm": 0.2102957726786887,
+      "learning_rate": 2.8562172864052437e-05,
+      "loss": 0.5665,
+      "step": 92
+    },
+    {
+      "epoch": 3.200417536534447,
+      "grad_norm": 0.18138996143773695,
+      "learning_rate": 2.7639320225002108e-05,
+      "loss": 0.5734,
+      "step": 93
+    },
+    {
+      "epoch": 3.233820459290188,
+      "grad_norm": 0.18231114685106467,
+      "learning_rate": 2.6723685874858873e-05,
+      "loss": 0.5665,
+      "step": 94
+    },
+    {
+      "epoch": 3.267223382045929,
+      "grad_norm": 0.1891068826294468,
+      "learning_rate": 2.5815804518298575e-05,
+      "loss": 0.5649,
+      "step": 95
+    },
+    {
+      "epoch": 3.30062630480167,
+      "grad_norm": 0.1449193634467542,
+      "learning_rate": 2.4916206332467184e-05,
+      "loss": 0.5626,
+      "step": 96
+    },
+    {
+      "epoch": 3.3340292275574113,
+      "grad_norm": 0.17521384625415576,
+      "learning_rate": 2.4025416657372186e-05,
+      "loss": 0.5672,
+      "step": 97
+    },
+    {
+      "epoch": 3.3674321503131526,
+      "grad_norm": 0.17060274829594732,
+      "learning_rate": 2.3143955689098844e-05,
+      "loss": 0.5701,
+      "step": 98
+    },
+    {
+      "epoch": 3.4008350730688934,
+      "grad_norm": 0.16427792254004098,
+      "learning_rate": 2.2272338176030354e-05,
+      "loss": 0.5648,
+      "step": 99
+    },
+    {
+      "epoch": 3.4342379958246347,
+      "grad_norm": 0.16851785214921267,
+      "learning_rate": 2.141107311824926e-05,
+      "loss": 0.5637,
+      "step": 100
+    },
+    {
+      "epoch": 3.4676409185803756,
+      "grad_norm": 0.1647295715319099,
+      "learning_rate": 2.056066347029576e-05,
+      "loss": 0.5698,
+      "step": 101
+    },
+    {
+      "epoch": 3.501043841336117,
+      "grad_norm": 0.14383360405355872,
+      "learning_rate": 1.9721605847456397e-05,
+      "loss": 0.5678,
+      "step": 102
+    },
+    {
+      "epoch": 3.534446764091858,
+      "grad_norm": 0.16369393007489977,
+      "learning_rate": 1.8894390235754686e-05,
+      "loss": 0.5687,
+      "step": 103
+    },
+    {
+      "epoch": 3.567849686847599,
+      "grad_norm": 0.1484533364671656,
+      "learning_rate": 1.807949970581321e-05,
+      "loss": 0.5612,
+      "step": 104
+    },
+    {
+      "epoch": 3.6012526096033404,
+      "grad_norm": 0.13327395767499348,
+      "learning_rate": 1.7277410130753775e-05,
+      "loss": 0.5621,
+      "step": 105
+    },
+    {
+      "epoch": 3.6346555323590817,
+      "grad_norm": 0.14483989970799924,
+      "learning_rate": 1.648858990830108e-05,
+      "loss": 0.5602,
+      "step": 106
+    },
+    {
+      "epoch": 3.6680584551148225,
+      "grad_norm": 0.11501467177953302,
+      "learning_rate": 1.5713499687251554e-05,
+      "loss": 0.5625,
+      "step": 107
+    },
+    {
+      "epoch": 3.701461377870564,
+      "grad_norm": 0.12471522633663724,
+      "learning_rate": 1.4952592098467453e-05,
+      "loss": 0.5566,
+      "step": 108
+    },
+    {
+      "epoch": 3.7348643006263047,
+      "grad_norm": 0.12841415317626956,
+      "learning_rate": 1.4206311490553187e-05,
+      "loss": 0.5563,
+      "step": 109
+    },
+    {
+      "epoch": 3.768267223382046,
+      "grad_norm": 0.13024977809323665,
+      "learning_rate": 1.3475093670368202e-05,
+      "loss": 0.5642,
+      "step": 110
+    },
+    {
+      "epoch": 3.801670146137787,
+      "grad_norm": 0.12141142140280577,
+      "learning_rate": 1.275936564852811e-05,
+      "loss": 0.5619,
+      "step": 111
+    },
+    {
+      "epoch": 3.835073068893528,
+      "grad_norm": 0.1189681977822036,
+      "learning_rate": 1.2059545390042526e-05,
+      "loss": 0.5627,
+      "step": 112
+    },
+    {
+      "epoch": 3.8684759916492695,
+      "grad_norm": 0.11637565722872692,
+      "learning_rate": 1.1376041570235162e-05,
+      "loss": 0.5597,
+      "step": 113
+    },
+    {
+      "epoch": 3.9018789144050103,
+      "grad_norm": 0.11126444342562675,
+      "learning_rate": 1.070925333608907e-05,
+      "loss": 0.5646,
+      "step": 114
+    },
+    {
+      "epoch": 3.9352818371607516,
+      "grad_norm": 0.11144727795080511,
+      "learning_rate": 1.0059570073155953e-05,
+      "loss": 0.5663,
+      "step": 115
+    },
+    {
+      "epoch": 3.968684759916493,
+      "grad_norm": 0.11568625785765184,
+      "learning_rate": 9.427371178166065e-06,
+      "loss": 0.5628,
+      "step": 116
+    },
+    {
+      "epoch": 4.002087682672234,
+      "grad_norm": 0.1172051146855964,
+      "learning_rate": 8.81302583747111e-06,
+      "loss": 0.5657,
+      "step": 117
+    },
+    {
+      "epoch": 4.035490605427975,
+      "grad_norm": 0.1386962246589997,
+      "learning_rate": 8.216892811449834e-06,
+      "loss": 0.5431,
+      "step": 118
+    },
+    {
+      "epoch": 4.068893528183716,
+      "grad_norm": 0.12227065820674828,
+      "learning_rate": 7.639320225002106e-06,
+      "loss": 0.5386,
+      "step": 119
+    },
+    {
+      "epoch": 4.102296450939457,
+      "grad_norm": 0.11676189189940173,
+      "learning_rate": 7.080645364253747e-06,
+      "loss": 0.5341,
+      "step": 120
+    },
+    {
+      "epoch": 4.135699373695198,
+      "grad_norm": 0.1086671391408473,
+      "learning_rate": 6.541194479590931e-06,
+      "loss": 0.5472,
+      "step": 121
+    },
+    {
+      "epoch": 4.16910229645094,
+      "grad_norm": 0.11780635228612878,
+      "learning_rate": 6.021282595139167e-06,
+      "loss": 0.5376,
+      "step": 122
+    },
+    {
+      "epoch": 4.202505219206681,
+      "grad_norm": 0.11122819389457546,
+      "learning_rate": 5.521213324798029e-06,
+      "loss": 0.5405,
+      "step": 123
+    },
+    {
+      "epoch": 4.235908141962422,
+      "grad_norm": 0.11302950909483094,
+      "learning_rate": 5.0412786949392845e-06,
+      "loss": 0.5389,
+      "step": 124
+    },
+    {
+      "epoch": 4.2693110647181625,
+      "grad_norm": 0.10897006989347469,
+      "learning_rate": 4.581758973871609e-06,
+      "loss": 0.5443,
+      "step": 125
+    },
+    {
+      "epoch": 4.302713987473904,
+      "grad_norm": 0.10317913683812792,
+      "learning_rate": 4.142922508171849e-06,
+      "loss": 0.5363,
+      "step": 126
+    },
+    {
+      "epoch": 4.336116910229645,
+      "grad_norm": 0.10003486708202455,
+      "learning_rate": 3.7250255659781844e-06,
+      "loss": 0.5364,
+      "step": 127
+    },
+    {
+      "epoch": 4.369519832985386,
+      "grad_norm": 0.1071731871255614,
+      "learning_rate": 3.3283121873367043e-06,
+      "loss": 0.5432,
+      "step": 128
+    },
+    {
+      "epoch": 4.402922755741128,
+      "grad_norm": 0.10539727253992291,
+      "learning_rate": 2.9530140416889465e-06,
+      "loss": 0.5373,
+      "step": 129
+    },
+    {
+      "epoch": 4.4363256784968685,
+      "grad_norm": 0.09606764766200912,
+      "learning_rate": 2.5993502925834115e-06,
+      "loss": 0.5333,
+      "step": 130
+    },
+    {
+      "epoch": 4.469728601252609,
+      "grad_norm": 0.09219554498498256,
+      "learning_rate": 2.2675274696902737e-06,
+      "loss": 0.5315,
+      "step": 131
+    },
+    {
+      "epoch": 4.503131524008351,
+      "grad_norm": 0.08960509523269163,
+      "learning_rate": 1.957739348193859e-06,
+      "loss": 0.5334,
+      "step": 132
+    },
+    {
+      "epoch": 4.536534446764092,
+      "grad_norm": 0.09160224849384657,
+      "learning_rate": 1.670166835633351e-06,
+      "loss": 0.5384,
+      "step": 133
+    },
+    {
+      "epoch": 4.569937369519833,
+      "grad_norm": 0.08852713488345453,
+      "learning_rate": 1.4049778662579462e-06,
+      "loss": 0.53,
+      "step": 134
+    },
+    {
+      "epoch": 4.603340292275574,
+      "grad_norm": 0.09132315616256415,
+      "learning_rate": 1.1623273029579195e-06,
+      "loss": 0.538,
+      "step": 135
+    },
+    {
+      "epoch": 4.6367432150313155,
+      "grad_norm": 0.09194633813127549,
+      "learning_rate": 9.423568468291156e-07,
+      "loss": 0.541,
+      "step": 136
+    },
+    {
+      "epoch": 4.670146137787056,
+      "grad_norm": 0.09237238332398756,
+      "learning_rate": 7.451949544234627e-07,
+      "loss": 0.5379,
+      "step": 137
+    },
+    {
+      "epoch": 4.703549060542797,
+      "grad_norm": 0.08974432368849375,
+      "learning_rate": 5.709567627339674e-07,
+      "loss": 0.5443,
+      "step": 138
+    },
+    {
+      "epoch": 4.736951983298539,
+      "grad_norm": 0.09013165116820136,
+      "learning_rate": 4.1974402195795514e-07,
+      "loss": 0.535,
+      "step": 139
+    },
+    {
+      "epoch": 4.77035490605428,
+      "grad_norm": 0.09022158123863006,
+      "learning_rate": 2.916450360778411e-07,
+      "loss": 0.5333,
+      "step": 140
+    },
+    {
+      "epoch": 4.803757828810021,
+      "grad_norm": 0.08981410735542258,
+      "learning_rate": 1.867346112940549e-07,
+      "loss": 0.5462,
+      "step": 141
+    },
+    {
+      "epoch": 4.8371607515657615,
+      "grad_norm": 0.09141559902413697,
+      "learning_rate": 1.0507401234035819e-07,
+      "loss": 0.5377,
+      "step": 142
+    },
+    {
+      "epoch": 4.870563674321503,
+      "grad_norm": 0.08951832053531815,
+      "learning_rate": 4.6710926706934336e-08,
+      "loss": 0.5305,
+      "step": 143
+    },
+    {
+      "epoch": 4.903966597077244,
+      "grad_norm": 0.08926873218925392,
+      "learning_rate": 1.1679436792282339e-08,
+      "loss": 0.54,
+      "step": 144
+    },
+    {
+      "epoch": 4.937369519832985,
+      "grad_norm": 0.08723888890212035,
+      "learning_rate": 0.0,
+      "loss": 0.54,
+      "step": 145
+    },
+    {
+      "epoch": 4.937369519832985,
+      "step": 145,
+      "total_flos": 3.738141667979428e+18,
+      "train_loss": 0.2130410626016814,
+      "train_runtime": 6079.0514,
+      "train_samples_per_second": 12.591,
+      "train_steps_per_second": 0.024
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 145,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.738141667979428e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
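The learning_rate column in the log above is consistent with linear warmup over the first 15 steps to a peak of 8e-05, followed by cosine decay to zero at step 145. This is a fit inferred from the logged values, not a setting read from the commit; a sketch that reproduces them:

```python
import math

# Inferred schedule (an assumption fitted to the log, not a committed config):
# linear warmup for 15 steps to a peak of 8e-05, then cosine decay to zero
# at step 145.
PEAK, WARMUP_STEPS, TOTAL_STEPS = 8e-05, 15, 145

def lr_at(step: int) -> float:
    if step <= WARMUP_STEPS:
        return PEAK * step / WARMUP_STEPS
    progress = (step - WARMUP_STEPS) / (TOTAL_STEPS - WARMUP_STEPS)
    return PEAK * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(1))    # 5.333...e-06, as logged at step 1
print(lr_at(16))   # ~7.99883e-05, matching the step-16 entry
print(lr_at(145))  # decays to (numerically) zero, as in the final entry
```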
training_loss.png ADDED
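training_loss.png is the rendered loss curve for this run. A curve like it can be rebuilt from the log_history entries in trainer_state.json; below is a minimal sketch, where the inline JSON is a two-entry excerpt standing in for the full file and the matplotlib lines are an assumption about how the plot was produced:

```python
import json

# A short excerpt of trainer_state.json's log_history, standing in for the
# full file (which would normally be read with json.load(open(path))).
state = json.loads("""
{
  "global_step": 145,
  "log_history": [
    {"epoch": 0.0334, "grad_norm": 7.15, "learning_rate": 5.33e-06, "loss": 1.2049, "step": 1},
    {"epoch": 4.9374, "grad_norm": 0.087, "learning_rate": 0.0, "loss": 0.54, "step": 145},
    {"epoch": 4.9374, "step": 145, "train_loss": 0.213, "train_runtime": 6079.05}
  ]
}
""")

# Keep only per-step entries; the trailing summary entry carries no "loss" key.
points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
steps, losses = zip(*points)
print(points)

# To render the curve as an image (assumed, not part of the commit):
#   import matplotlib.pyplot as plt
#   plt.plot(steps, losses)
#   plt.xlabel("step"); plt.ylabel("training loss")
#   plt.savefig("training_loss.png")
```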