no_pipeline_science_100k / trainer_state.json
neginr's picture
End of training
72828bc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.937369519832985,
"eval_steps": 500,
"global_step": 145,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.033402922755741124,
"grad_norm": 7.154098089649519,
"learning_rate": 5.333333333333334e-06,
"loss": 1.2049,
"step": 1
},
{
"epoch": 0.06680584551148225,
"grad_norm": 7.224671814367719,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.2046,
"step": 2
},
{
"epoch": 0.10020876826722339,
"grad_norm": 5.112010906482035,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.1276,
"step": 3
},
{
"epoch": 0.1336116910229645,
"grad_norm": 5.240191362417293,
"learning_rate": 2.1333333333333335e-05,
"loss": 1.0958,
"step": 4
},
{
"epoch": 0.16701461377870563,
"grad_norm": 4.398708169023894,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.0242,
"step": 5
},
{
"epoch": 0.20041753653444677,
"grad_norm": 4.9473080352678895,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.0277,
"step": 6
},
{
"epoch": 0.23382045929018788,
"grad_norm": 3.8837845230573835,
"learning_rate": 3.733333333333334e-05,
"loss": 0.9755,
"step": 7
},
{
"epoch": 0.267223382045929,
"grad_norm": 2.957475416120971,
"learning_rate": 4.266666666666667e-05,
"loss": 0.9334,
"step": 8
},
{
"epoch": 0.30062630480167013,
"grad_norm": 2.187999537773017,
"learning_rate": 4.8e-05,
"loss": 0.9211,
"step": 9
},
{
"epoch": 0.33402922755741127,
"grad_norm": 2.3155958654718614,
"learning_rate": 5.333333333333333e-05,
"loss": 0.8983,
"step": 10
},
{
"epoch": 0.3674321503131524,
"grad_norm": 2.8444701447464436,
"learning_rate": 5.8666666666666665e-05,
"loss": 0.8975,
"step": 11
},
{
"epoch": 0.40083507306889354,
"grad_norm": 3.0982586639870213,
"learning_rate": 6.400000000000001e-05,
"loss": 0.8856,
"step": 12
},
{
"epoch": 0.4342379958246347,
"grad_norm": 1.8874954111166966,
"learning_rate": 6.933333333333334e-05,
"loss": 0.872,
"step": 13
},
{
"epoch": 0.46764091858037576,
"grad_norm": 3.038189077699479,
"learning_rate": 7.466666666666667e-05,
"loss": 0.8892,
"step": 14
},
{
"epoch": 0.5010438413361169,
"grad_norm": 1.7381979961116139,
"learning_rate": 8e-05,
"loss": 0.8535,
"step": 15
},
{
"epoch": 0.534446764091858,
"grad_norm": 2188.323143153608,
"learning_rate": 7.998832056320773e-05,
"loss": 1.0923,
"step": 16
},
{
"epoch": 0.5678496868475992,
"grad_norm": 5.568753243553315,
"learning_rate": 7.995328907329308e-05,
"loss": 0.9433,
"step": 17
},
{
"epoch": 0.6012526096033403,
"grad_norm": 3.003707178203899,
"learning_rate": 7.989492598765966e-05,
"loss": 0.8783,
"step": 18
},
{
"epoch": 0.6346555323590815,
"grad_norm": 3.3787729580945367,
"learning_rate": 7.981326538870596e-05,
"loss": 0.8657,
"step": 19
},
{
"epoch": 0.6680584551148225,
"grad_norm": 3.0096085489081923,
"learning_rate": 7.970835496392216e-05,
"loss": 0.8705,
"step": 20
},
{
"epoch": 0.7014613778705637,
"grad_norm": 2.2251853165136186,
"learning_rate": 7.958025597804205e-05,
"loss": 0.8591,
"step": 21
},
{
"epoch": 0.7348643006263048,
"grad_norm": 1.393655838420696,
"learning_rate": 7.942904323726604e-05,
"loss": 0.8202,
"step": 22
},
{
"epoch": 0.7682672233820459,
"grad_norm": 1.709595774468825,
"learning_rate": 7.925480504557654e-05,
"loss": 0.8239,
"step": 23
},
{
"epoch": 0.8016701461377871,
"grad_norm": 1.04923819217978,
"learning_rate": 7.90576431531709e-05,
"loss": 0.8236,
"step": 24
},
{
"epoch": 0.8350730688935282,
"grad_norm": 1.4676306786694173,
"learning_rate": 7.883767269704209e-05,
"loss": 0.8083,
"step": 25
},
{
"epoch": 0.8684759916492694,
"grad_norm": 417.78005391043996,
"learning_rate": 7.859502213374207e-05,
"loss": 1.1719,
"step": 26
},
{
"epoch": 0.9018789144050104,
"grad_norm": 12.203128331470229,
"learning_rate": 7.832983316436666e-05,
"loss": 0.8597,
"step": 27
},
{
"epoch": 0.9352818371607515,
"grad_norm": 4.957392164656034,
"learning_rate": 7.804226065180615e-05,
"loss": 0.9382,
"step": 28
},
{
"epoch": 0.9686847599164927,
"grad_norm": 10.61998316626802,
"learning_rate": 7.773247253030973e-05,
"loss": 0.9939,
"step": 29
},
{
"epoch": 1.0083507306889352,
"grad_norm": 54.87377316789775,
"learning_rate": 7.740064970741661e-05,
"loss": 0.8724,
"step": 30
},
{
"epoch": 1.0417536534446765,
"grad_norm": 88.85368950052036,
"learning_rate": 7.704698595831107e-05,
"loss": 0.9805,
"step": 31
},
{
"epoch": 1.0751565762004176,
"grad_norm": 12.07917493536492,
"learning_rate": 7.667168781266331e-05,
"loss": 0.9689,
"step": 32
},
{
"epoch": 1.1085594989561587,
"grad_norm": 30.730151095536375,
"learning_rate": 7.627497443402182e-05,
"loss": 1.0908,
"step": 33
},
{
"epoch": 1.1419624217118998,
"grad_norm": 4.303465706734318,
"learning_rate": 7.585707749182816e-05,
"loss": 0.8883,
"step": 34
},
{
"epoch": 1.1753653444676408,
"grad_norm": 1.3597826636785535,
"learning_rate": 7.541824102612839e-05,
"loss": 0.8376,
"step": 35
},
{
"epoch": 1.2087682672233822,
"grad_norm": 2.028071696746282,
"learning_rate": 7.495872130506072e-05,
"loss": 0.8018,
"step": 36
},
{
"epoch": 1.2421711899791232,
"grad_norm": 1.5668710744698326,
"learning_rate": 7.447878667520198e-05,
"loss": 0.7901,
"step": 37
},
{
"epoch": 1.2755741127348643,
"grad_norm": 0.9166659557892114,
"learning_rate": 7.397871740486085e-05,
"loss": 0.7699,
"step": 38
},
{
"epoch": 1.3089770354906054,
"grad_norm": 8.207728871032339,
"learning_rate": 7.345880552040907e-05,
"loss": 0.7735,
"step": 39
},
{
"epoch": 1.3423799582463465,
"grad_norm": 2.508524585534657,
"learning_rate": 7.291935463574626e-05,
"loss": 0.8447,
"step": 40
},
{
"epoch": 1.3757828810020878,
"grad_norm": 1.364586902470756,
"learning_rate": 7.236067977499791e-05,
"loss": 0.7856,
"step": 41
},
{
"epoch": 1.4091858037578289,
"grad_norm": 1.8720653087352772,
"learning_rate": 7.178310718855018e-05,
"loss": 0.7829,
"step": 42
},
{
"epoch": 1.44258872651357,
"grad_norm": 1.8178558573775088,
"learning_rate": 7.11869741625289e-05,
"loss": 0.7737,
"step": 43
},
{
"epoch": 1.475991649269311,
"grad_norm": 1.6638629849138615,
"learning_rate": 7.057262882183393e-05,
"loss": 0.7737,
"step": 44
},
{
"epoch": 1.5093945720250521,
"grad_norm": 1.1958951695778888,
"learning_rate": 6.994042992684406e-05,
"loss": 0.7499,
"step": 45
},
{
"epoch": 1.5427974947807934,
"grad_norm": 1.1237749762548175,
"learning_rate": 6.929074666391095e-05,
"loss": 0.7457,
"step": 46
},
{
"epoch": 1.5762004175365343,
"grad_norm": 0.9523363043794499,
"learning_rate": 6.862395842976484e-05,
"loss": 0.7449,
"step": 47
},
{
"epoch": 1.6096033402922756,
"grad_norm": 0.7794493625394828,
"learning_rate": 6.79404546099575e-05,
"loss": 0.7471,
"step": 48
},
{
"epoch": 1.6430062630480167,
"grad_norm": 2.1007938107258224,
"learning_rate": 6.724063435147189e-05,
"loss": 0.738,
"step": 49
},
{
"epoch": 1.6764091858037578,
"grad_norm": 0.8397590052899939,
"learning_rate": 6.652490632963182e-05,
"loss": 0.7366,
"step": 50
},
{
"epoch": 1.709812108559499,
"grad_norm": 1.5244169148841136,
"learning_rate": 6.579368850944683e-05,
"loss": 0.7518,
"step": 51
},
{
"epoch": 1.7432150313152401,
"grad_norm": 0.97500410064134,
"learning_rate": 6.504740790153255e-05,
"loss": 0.7365,
"step": 52
},
{
"epoch": 1.7766179540709812,
"grad_norm": 1.833109071141926,
"learning_rate": 6.428650031274845e-05,
"loss": 0.7327,
"step": 53
},
{
"epoch": 1.8100208768267223,
"grad_norm": 1.4707510085946327,
"learning_rate": 6.351141009169893e-05,
"loss": 0.7227,
"step": 54
},
{
"epoch": 1.8434237995824634,
"grad_norm": 1.2363917202252765,
"learning_rate": 6.272258986924624e-05,
"loss": 0.7405,
"step": 55
},
{
"epoch": 1.8768267223382047,
"grad_norm": 1.0298920741813498,
"learning_rate": 6.192050029418682e-05,
"loss": 0.7241,
"step": 56
},
{
"epoch": 1.9102296450939458,
"grad_norm": 0.9097363351471279,
"learning_rate": 6.110560976424531e-05,
"loss": 0.7167,
"step": 57
},
{
"epoch": 1.9436325678496869,
"grad_norm": 0.8471695952793523,
"learning_rate": 6.027839415254362e-05,
"loss": 0.7181,
"step": 58
},
{
"epoch": 1.977035490605428,
"grad_norm": 0.6602662698524506,
"learning_rate": 5.943933652970424e-05,
"loss": 0.7088,
"step": 59
},
{
"epoch": 2.0167014613778704,
"grad_norm": 0.624041177687339,
"learning_rate": 5.858892688175075e-05,
"loss": 0.6922,
"step": 60
},
{
"epoch": 2.0501043841336117,
"grad_norm": 0.731560229530671,
"learning_rate": 5.772766182396966e-05,
"loss": 0.6655,
"step": 61
},
{
"epoch": 2.083507306889353,
"grad_norm": 0.5160825456760252,
"learning_rate": 5.685604431090117e-05,
"loss": 0.6624,
"step": 62
},
{
"epoch": 2.116910229645094,
"grad_norm": 0.6466642583190281,
"learning_rate": 5.597458334262782e-05,
"loss": 0.6474,
"step": 63
},
{
"epoch": 2.150313152400835,
"grad_norm": 0.6905839273768964,
"learning_rate": 5.508379366753282e-05,
"loss": 0.6512,
"step": 64
},
{
"epoch": 2.183716075156576,
"grad_norm": 0.3760316450742919,
"learning_rate": 5.4184195481701425e-05,
"loss": 0.6523,
"step": 65
},
{
"epoch": 2.2171189979123174,
"grad_norm": 0.606234562718693,
"learning_rate": 5.3276314125141144e-05,
"loss": 0.6487,
"step": 66
},
{
"epoch": 2.2505219206680582,
"grad_norm": 0.44809718292050676,
"learning_rate": 5.23606797749979e-05,
"loss": 0.649,
"step": 67
},
{
"epoch": 2.2839248434237995,
"grad_norm": 0.40244410097202155,
"learning_rate": 5.1437827135947566e-05,
"loss": 0.6468,
"step": 68
},
{
"epoch": 2.317327766179541,
"grad_norm": 0.359719180741915,
"learning_rate": 5.050829512794348e-05,
"loss": 0.6409,
"step": 69
},
{
"epoch": 2.3507306889352817,
"grad_norm": 0.40415638024369727,
"learning_rate": 4.9572626571502316e-05,
"loss": 0.639,
"step": 70
},
{
"epoch": 2.384133611691023,
"grad_norm": 0.3340843503248373,
"learning_rate": 4.8631367870712254e-05,
"loss": 0.6326,
"step": 71
},
{
"epoch": 2.4175365344467643,
"grad_norm": 0.3262882595570267,
"learning_rate": 4.768506869414834e-05,
"loss": 0.6298,
"step": 72
},
{
"epoch": 2.450939457202505,
"grad_norm": 0.3253891492249243,
"learning_rate": 4.6734281653881536e-05,
"loss": 0.6326,
"step": 73
},
{
"epoch": 2.4843423799582465,
"grad_norm": 0.35311540573233735,
"learning_rate": 4.577956198276886e-05,
"loss": 0.6291,
"step": 74
},
{
"epoch": 2.5177453027139873,
"grad_norm": 0.3440383701499157,
"learning_rate": 4.4821467210212924e-05,
"loss": 0.6332,
"step": 75
},
{
"epoch": 2.5511482254697286,
"grad_norm": 0.30978369513311105,
"learning_rate": 4.386055683658061e-05,
"loss": 0.6408,
"step": 76
},
{
"epoch": 2.5845511482254695,
"grad_norm": 0.3823149004105222,
"learning_rate": 4.2897392006470503e-05,
"loss": 0.6246,
"step": 77
},
{
"epoch": 2.617954070981211,
"grad_norm": 0.2810880790539587,
"learning_rate": 4.1932535181020286e-05,
"loss": 0.6293,
"step": 78
},
{
"epoch": 2.651356993736952,
"grad_norm": 0.2835535239751324,
"learning_rate": 4.096654980944529e-05,
"loss": 0.6252,
"step": 79
},
{
"epoch": 2.684759916492693,
"grad_norm": 0.336833154001104,
"learning_rate": 4e-05,
"loss": 0.6305,
"step": 80
},
{
"epoch": 2.7181628392484343,
"grad_norm": 0.23274589850456745,
"learning_rate": 3.903345019055472e-05,
"loss": 0.6298,
"step": 81
},
{
"epoch": 2.7515657620041756,
"grad_norm": 0.2420684628004819,
"learning_rate": 3.806746481897973e-05,
"loss": 0.6241,
"step": 82
},
{
"epoch": 2.7849686847599164,
"grad_norm": 0.23622928619950834,
"learning_rate": 3.710260799352951e-05,
"loss": 0.6167,
"step": 83
},
{
"epoch": 2.8183716075156577,
"grad_norm": 0.21286687906297902,
"learning_rate": 3.6139443163419394e-05,
"loss": 0.6268,
"step": 84
},
{
"epoch": 2.8517745302713986,
"grad_norm": 0.20113400910479923,
"learning_rate": 3.517853278978708e-05,
"loss": 0.622,
"step": 85
},
{
"epoch": 2.88517745302714,
"grad_norm": 0.19296938971649688,
"learning_rate": 3.422043801723116e-05,
"loss": 0.6167,
"step": 86
},
{
"epoch": 2.9185803757828808,
"grad_norm": 0.17640926051127553,
"learning_rate": 3.3265718346118464e-05,
"loss": 0.6251,
"step": 87
},
{
"epoch": 2.951983298538622,
"grad_norm": 0.17760201524918323,
"learning_rate": 3.231493130585167e-05,
"loss": 0.6195,
"step": 88
},
{
"epoch": 2.9853862212943634,
"grad_norm": 0.18267169419590248,
"learning_rate": 3.136863212928776e-05,
"loss": 0.6214,
"step": 89
},
{
"epoch": 3.1002087682672235,
"grad_norm": 0.2479134339023779,
"learning_rate": 3.0427373428497704e-05,
"loss": 0.5792,
"step": 90
},
{
"epoch": 3.1336116910229643,
"grad_norm": 0.17829804990091588,
"learning_rate": 2.9491704872056525e-05,
"loss": 0.571,
"step": 91
},
{
"epoch": 3.1670146137787056,
"grad_norm": 0.2102957726786887,
"learning_rate": 2.8562172864052437e-05,
"loss": 0.5665,
"step": 92
},
{
"epoch": 3.200417536534447,
"grad_norm": 0.18138996143773695,
"learning_rate": 2.7639320225002108e-05,
"loss": 0.5734,
"step": 93
},
{
"epoch": 3.233820459290188,
"grad_norm": 0.18231114685106467,
"learning_rate": 2.6723685874858873e-05,
"loss": 0.5665,
"step": 94
},
{
"epoch": 3.267223382045929,
"grad_norm": 0.1891068826294468,
"learning_rate": 2.5815804518298575e-05,
"loss": 0.5649,
"step": 95
},
{
"epoch": 3.30062630480167,
"grad_norm": 0.1449193634467542,
"learning_rate": 2.4916206332467184e-05,
"loss": 0.5626,
"step": 96
},
{
"epoch": 3.3340292275574113,
"grad_norm": 0.17521384625415576,
"learning_rate": 2.4025416657372186e-05,
"loss": 0.5672,
"step": 97
},
{
"epoch": 3.3674321503131526,
"grad_norm": 0.17060274829594732,
"learning_rate": 2.3143955689098844e-05,
"loss": 0.5701,
"step": 98
},
{
"epoch": 3.4008350730688934,
"grad_norm": 0.16427792254004098,
"learning_rate": 2.2272338176030354e-05,
"loss": 0.5648,
"step": 99
},
{
"epoch": 3.4342379958246347,
"grad_norm": 0.16851785214921267,
"learning_rate": 2.141107311824926e-05,
"loss": 0.5637,
"step": 100
},
{
"epoch": 3.4676409185803756,
"grad_norm": 0.1647295715319099,
"learning_rate": 2.056066347029576e-05,
"loss": 0.5698,
"step": 101
},
{
"epoch": 3.501043841336117,
"grad_norm": 0.14383360405355872,
"learning_rate": 1.9721605847456397e-05,
"loss": 0.5678,
"step": 102
},
{
"epoch": 3.534446764091858,
"grad_norm": 0.16369393007489977,
"learning_rate": 1.8894390235754686e-05,
"loss": 0.5687,
"step": 103
},
{
"epoch": 3.567849686847599,
"grad_norm": 0.1484533364671656,
"learning_rate": 1.807949970581321e-05,
"loss": 0.5612,
"step": 104
},
{
"epoch": 3.6012526096033404,
"grad_norm": 0.13327395767499348,
"learning_rate": 1.7277410130753775e-05,
"loss": 0.5621,
"step": 105
},
{
"epoch": 3.6346555323590817,
"grad_norm": 0.14483989970799924,
"learning_rate": 1.648858990830108e-05,
"loss": 0.5602,
"step": 106
},
{
"epoch": 3.6680584551148225,
"grad_norm": 0.11501467177953302,
"learning_rate": 1.5713499687251554e-05,
"loss": 0.5625,
"step": 107
},
{
"epoch": 3.701461377870564,
"grad_norm": 0.12471522633663724,
"learning_rate": 1.4952592098467453e-05,
"loss": 0.5566,
"step": 108
},
{
"epoch": 3.7348643006263047,
"grad_norm": 0.12841415317626956,
"learning_rate": 1.4206311490553187e-05,
"loss": 0.5563,
"step": 109
},
{
"epoch": 3.768267223382046,
"grad_norm": 0.13024977809323665,
"learning_rate": 1.3475093670368202e-05,
"loss": 0.5642,
"step": 110
},
{
"epoch": 3.801670146137787,
"grad_norm": 0.12141142140280577,
"learning_rate": 1.275936564852811e-05,
"loss": 0.5619,
"step": 111
},
{
"epoch": 3.835073068893528,
"grad_norm": 0.1189681977822036,
"learning_rate": 1.2059545390042526e-05,
"loss": 0.5627,
"step": 112
},
{
"epoch": 3.8684759916492695,
"grad_norm": 0.11637565722872692,
"learning_rate": 1.1376041570235162e-05,
"loss": 0.5597,
"step": 113
},
{
"epoch": 3.9018789144050103,
"grad_norm": 0.11126444342562675,
"learning_rate": 1.070925333608907e-05,
"loss": 0.5646,
"step": 114
},
{
"epoch": 3.9352818371607516,
"grad_norm": 0.11144727795080511,
"learning_rate": 1.0059570073155953e-05,
"loss": 0.5663,
"step": 115
},
{
"epoch": 3.968684759916493,
"grad_norm": 0.11568625785765184,
"learning_rate": 9.427371178166065e-06,
"loss": 0.5628,
"step": 116
},
{
"epoch": 4.002087682672234,
"grad_norm": 0.1172051146855964,
"learning_rate": 8.81302583747111e-06,
"loss": 0.5657,
"step": 117
},
{
"epoch": 4.035490605427975,
"grad_norm": 0.1386962246589997,
"learning_rate": 8.216892811449834e-06,
"loss": 0.5431,
"step": 118
},
{
"epoch": 4.068893528183716,
"grad_norm": 0.12227065820674828,
"learning_rate": 7.639320225002106e-06,
"loss": 0.5386,
"step": 119
},
{
"epoch": 4.102296450939457,
"grad_norm": 0.11676189189940173,
"learning_rate": 7.080645364253747e-06,
"loss": 0.5341,
"step": 120
},
{
"epoch": 4.135699373695198,
"grad_norm": 0.1086671391408473,
"learning_rate": 6.541194479590931e-06,
"loss": 0.5472,
"step": 121
},
{
"epoch": 4.16910229645094,
"grad_norm": 0.11780635228612878,
"learning_rate": 6.021282595139167e-06,
"loss": 0.5376,
"step": 122
},
{
"epoch": 4.202505219206681,
"grad_norm": 0.11122819389457546,
"learning_rate": 5.521213324798029e-06,
"loss": 0.5405,
"step": 123
},
{
"epoch": 4.235908141962422,
"grad_norm": 0.11302950909483094,
"learning_rate": 5.0412786949392845e-06,
"loss": 0.5389,
"step": 124
},
{
"epoch": 4.2693110647181625,
"grad_norm": 0.10897006989347469,
"learning_rate": 4.581758973871609e-06,
"loss": 0.5443,
"step": 125
},
{
"epoch": 4.302713987473904,
"grad_norm": 0.10317913683812792,
"learning_rate": 4.142922508171849e-06,
"loss": 0.5363,
"step": 126
},
{
"epoch": 4.336116910229645,
"grad_norm": 0.10003486708202455,
"learning_rate": 3.7250255659781844e-06,
"loss": 0.5364,
"step": 127
},
{
"epoch": 4.369519832985386,
"grad_norm": 0.1071731871255614,
"learning_rate": 3.3283121873367043e-06,
"loss": 0.5432,
"step": 128
},
{
"epoch": 4.402922755741128,
"grad_norm": 0.10539727253992291,
"learning_rate": 2.9530140416889465e-06,
"loss": 0.5373,
"step": 129
},
{
"epoch": 4.4363256784968685,
"grad_norm": 0.09606764766200912,
"learning_rate": 2.5993502925834115e-06,
"loss": 0.5333,
"step": 130
},
{
"epoch": 4.469728601252609,
"grad_norm": 0.09219554498498256,
"learning_rate": 2.2675274696902737e-06,
"loss": 0.5315,
"step": 131
},
{
"epoch": 4.503131524008351,
"grad_norm": 0.08960509523269163,
"learning_rate": 1.957739348193859e-06,
"loss": 0.5334,
"step": 132
},
{
"epoch": 4.536534446764092,
"grad_norm": 0.09160224849384657,
"learning_rate": 1.670166835633351e-06,
"loss": 0.5384,
"step": 133
},
{
"epoch": 4.569937369519833,
"grad_norm": 0.08852713488345453,
"learning_rate": 1.4049778662579462e-06,
"loss": 0.53,
"step": 134
},
{
"epoch": 4.603340292275574,
"grad_norm": 0.09132315616256415,
"learning_rate": 1.1623273029579195e-06,
"loss": 0.538,
"step": 135
},
{
"epoch": 4.6367432150313155,
"grad_norm": 0.09194633813127549,
"learning_rate": 9.423568468291156e-07,
"loss": 0.541,
"step": 136
},
{
"epoch": 4.670146137787056,
"grad_norm": 0.09237238332398756,
"learning_rate": 7.451949544234627e-07,
"loss": 0.5379,
"step": 137
},
{
"epoch": 4.703549060542797,
"grad_norm": 0.08974432368849375,
"learning_rate": 5.709567627339674e-07,
"loss": 0.5443,
"step": 138
},
{
"epoch": 4.736951983298539,
"grad_norm": 0.09013165116820136,
"learning_rate": 4.1974402195795514e-07,
"loss": 0.535,
"step": 139
},
{
"epoch": 4.77035490605428,
"grad_norm": 0.09022158123863006,
"learning_rate": 2.916450360778411e-07,
"loss": 0.5333,
"step": 140
},
{
"epoch": 4.803757828810021,
"grad_norm": 0.08981410735542258,
"learning_rate": 1.867346112940549e-07,
"loss": 0.5462,
"step": 141
},
{
"epoch": 4.8371607515657615,
"grad_norm": 0.09141559902413697,
"learning_rate": 1.0507401234035819e-07,
"loss": 0.5377,
"step": 142
},
{
"epoch": 4.870563674321503,
"grad_norm": 0.08951832053531815,
"learning_rate": 4.6710926706934336e-08,
"loss": 0.5305,
"step": 143
},
{
"epoch": 4.903966597077244,
"grad_norm": 0.08926873218925392,
"learning_rate": 1.1679436792282339e-08,
"loss": 0.54,
"step": 144
},
{
"epoch": 4.937369519832985,
"grad_norm": 0.08723888890212035,
"learning_rate": 0.0,
"loss": 0.54,
"step": 145
},
{
"epoch": 4.937369519832985,
"step": 145,
"total_flos": 3.738141667979428e+18,
"train_loss": 0.2130410626016814,
"train_runtime": 6079.0514,
"train_samples_per_second": 12.591,
"train_steps_per_second": 0.024
}
],
"logging_steps": 1,
"max_steps": 145,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.738141667979428e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}