{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.937369519832985,
  "eval_steps": 500,
  "global_step": 145,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.033402922755741124,
      "grad_norm": 7.154098089649519,
      "learning_rate": 5.333333333333334e-06,
      "loss": 1.2049,
      "step": 1
    },
    {
      "epoch": 0.06680584551148225,
      "grad_norm": 7.224671814367719,
      "learning_rate": 1.0666666666666667e-05,
      "loss": 1.2046,
      "step": 2
    },
    {
      "epoch": 0.10020876826722339,
      "grad_norm": 5.112010906482035,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.1276,
      "step": 3
    },
    {
      "epoch": 0.1336116910229645,
      "grad_norm": 5.240191362417293,
      "learning_rate": 2.1333333333333335e-05,
      "loss": 1.0958,
      "step": 4
    },
    {
      "epoch": 0.16701461377870563,
      "grad_norm": 4.398708169023894,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 1.0242,
      "step": 5
    },
    {
      "epoch": 0.20041753653444677,
      "grad_norm": 4.9473080352678895,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.0277,
      "step": 6
    },
    {
      "epoch": 0.23382045929018788,
      "grad_norm": 3.8837845230573835,
      "learning_rate": 3.733333333333334e-05,
      "loss": 0.9755,
      "step": 7
    },
    {
      "epoch": 0.267223382045929,
      "grad_norm": 2.957475416120971,
      "learning_rate": 4.266666666666667e-05,
      "loss": 0.9334,
      "step": 8
    },
    {
      "epoch": 0.30062630480167013,
      "grad_norm": 2.187999537773017,
      "learning_rate": 4.8e-05,
      "loss": 0.9211,
      "step": 9
    },
    {
      "epoch": 0.33402922755741127,
      "grad_norm": 2.3155958654718614,
      "learning_rate": 5.333333333333333e-05,
      "loss": 0.8983,
      "step": 10
    },
    {
      "epoch": 0.3674321503131524,
      "grad_norm": 2.8444701447464436,
      "learning_rate": 5.8666666666666665e-05,
      "loss": 0.8975,
      "step": 11
    },
    {
      "epoch": 0.40083507306889354,
      "grad_norm": 3.0982586639870213,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.8856,
      "step": 12
    },
    {
      "epoch": 0.4342379958246347,
      "grad_norm": 1.8874954111166966,
      "learning_rate": 6.933333333333334e-05,
      "loss": 0.872,
      "step": 13
    },
    {
      "epoch": 0.46764091858037576,
      "grad_norm": 3.038189077699479,
      "learning_rate": 7.466666666666667e-05,
      "loss": 0.8892,
      "step": 14
    },
    {
      "epoch": 0.5010438413361169,
      "grad_norm": 1.7381979961116139,
      "learning_rate": 8e-05,
      "loss": 0.8535,
      "step": 15
    },
    {
      "epoch": 0.534446764091858,
      "grad_norm": 2188.323143153608,
      "learning_rate": 7.998832056320773e-05,
      "loss": 1.0923,
      "step": 16
    },
    {
      "epoch": 0.5678496868475992,
      "grad_norm": 5.568753243553315,
      "learning_rate": 7.995328907329308e-05,
      "loss": 0.9433,
      "step": 17
    },
    {
      "epoch": 0.6012526096033403,
      "grad_norm": 3.003707178203899,
      "learning_rate": 7.989492598765966e-05,
      "loss": 0.8783,
      "step": 18
    },
    {
      "epoch": 0.6346555323590815,
      "grad_norm": 3.3787729580945367,
      "learning_rate": 7.981326538870596e-05,
      "loss": 0.8657,
      "step": 19
    },
    {
      "epoch": 0.6680584551148225,
      "grad_norm": 3.0096085489081923,
      "learning_rate": 7.970835496392216e-05,
      "loss": 0.8705,
      "step": 20
    },
    {
      "epoch": 0.7014613778705637,
      "grad_norm": 2.2251853165136186,
      "learning_rate": 7.958025597804205e-05,
      "loss": 0.8591,
      "step": 21
    },
    {
      "epoch": 0.7348643006263048,
      "grad_norm": 1.393655838420696,
      "learning_rate": 7.942904323726604e-05,
      "loss": 0.8202,
      "step": 22
    },
    {
      "epoch": 0.7682672233820459,
      "grad_norm": 1.709595774468825,
      "learning_rate": 7.925480504557654e-05,
      "loss": 0.8239,
      "step": 23
    },
    {
      "epoch": 0.8016701461377871,
      "grad_norm": 1.04923819217978,
      "learning_rate": 7.90576431531709e-05,
      "loss": 0.8236,
      "step": 24
    },
    {
      "epoch": 0.8350730688935282,
      "grad_norm": 1.4676306786694173,
      "learning_rate": 7.883767269704209e-05,
      "loss": 0.8083,
      "step": 25
    },
    {
      "epoch": 0.8684759916492694,
      "grad_norm": 417.78005391043996,
      "learning_rate": 7.859502213374207e-05,
      "loss": 1.1719,
      "step": 26
    },
    {
      "epoch": 0.9018789144050104,
      "grad_norm": 12.203128331470229,
      "learning_rate": 7.832983316436666e-05,
      "loss": 0.8597,
      "step": 27
    },
    {
      "epoch": 0.9352818371607515,
      "grad_norm": 4.957392164656034,
      "learning_rate": 7.804226065180615e-05,
      "loss": 0.9382,
      "step": 28
    },
    {
      "epoch": 0.9686847599164927,
      "grad_norm": 10.61998316626802,
      "learning_rate": 7.773247253030973e-05,
      "loss": 0.9939,
      "step": 29
    },
    {
      "epoch": 1.0083507306889352,
      "grad_norm": 54.87377316789775,
      "learning_rate": 7.740064970741661e-05,
      "loss": 0.8724,
      "step": 30
    },
    {
      "epoch": 1.0417536534446765,
      "grad_norm": 88.85368950052036,
      "learning_rate": 7.704698595831107e-05,
      "loss": 0.9805,
      "step": 31
    },
    {
      "epoch": 1.0751565762004176,
      "grad_norm": 12.07917493536492,
      "learning_rate": 7.667168781266331e-05,
      "loss": 0.9689,
      "step": 32
    },
    {
      "epoch": 1.1085594989561587,
      "grad_norm": 30.730151095536375,
      "learning_rate": 7.627497443402182e-05,
      "loss": 1.0908,
      "step": 33
    },
    {
      "epoch": 1.1419624217118998,
      "grad_norm": 4.303465706734318,
      "learning_rate": 7.585707749182816e-05,
      "loss": 0.8883,
      "step": 34
    },
    {
      "epoch": 1.1753653444676408,
      "grad_norm": 1.3597826636785535,
      "learning_rate": 7.541824102612839e-05,
      "loss": 0.8376,
      "step": 35
    },
    {
      "epoch": 1.2087682672233822,
      "grad_norm": 2.028071696746282,
      "learning_rate": 7.495872130506072e-05,
      "loss": 0.8018,
      "step": 36
    },
    {
      "epoch": 1.2421711899791232,
      "grad_norm": 1.5668710744698326,
      "learning_rate": 7.447878667520198e-05,
      "loss": 0.7901,
      "step": 37
    },
    {
      "epoch": 1.2755741127348643,
      "grad_norm": 0.9166659557892114,
      "learning_rate": 7.397871740486085e-05,
      "loss": 0.7699,
      "step": 38
    },
    {
      "epoch": 1.3089770354906054,
      "grad_norm": 8.207728871032339,
      "learning_rate": 7.345880552040907e-05,
      "loss": 0.7735,
      "step": 39
    },
    {
      "epoch": 1.3423799582463465,
      "grad_norm": 2.508524585534657,
      "learning_rate": 7.291935463574626e-05,
      "loss": 0.8447,
      "step": 40
    },
    {
      "epoch": 1.3757828810020878,
      "grad_norm": 1.364586902470756,
      "learning_rate": 7.236067977499791e-05,
      "loss": 0.7856,
      "step": 41
    },
    {
      "epoch": 1.4091858037578289,
      "grad_norm": 1.8720653087352772,
      "learning_rate": 7.178310718855018e-05,
      "loss": 0.7829,
      "step": 42
    },
    {
      "epoch": 1.44258872651357,
      "grad_norm": 1.8178558573775088,
      "learning_rate": 7.11869741625289e-05,
      "loss": 0.7737,
      "step": 43
    },
    {
      "epoch": 1.475991649269311,
      "grad_norm": 1.6638629849138615,
      "learning_rate": 7.057262882183393e-05,
      "loss": 0.7737,
      "step": 44
    },
    {
      "epoch": 1.5093945720250521,
      "grad_norm": 1.1958951695778888,
      "learning_rate": 6.994042992684406e-05,
      "loss": 0.7499,
      "step": 45
    },
    {
      "epoch": 1.5427974947807934,
      "grad_norm": 1.1237749762548175,
      "learning_rate": 6.929074666391095e-05,
      "loss": 0.7457,
      "step": 46
    },
    {
      "epoch": 1.5762004175365343,
      "grad_norm": 0.9523363043794499,
      "learning_rate": 6.862395842976484e-05,
      "loss": 0.7449,
      "step": 47
    },
    {
      "epoch": 1.6096033402922756,
      "grad_norm": 0.7794493625394828,
      "learning_rate": 6.79404546099575e-05,
      "loss": 0.7471,
      "step": 48
    },
    {
      "epoch": 1.6430062630480167,
      "grad_norm": 2.1007938107258224,
      "learning_rate": 6.724063435147189e-05,
      "loss": 0.738,
      "step": 49
    },
    {
      "epoch": 1.6764091858037578,
      "grad_norm": 0.8397590052899939,
      "learning_rate": 6.652490632963182e-05,
      "loss": 0.7366,
      "step": 50
    },
    {
      "epoch": 1.709812108559499,
      "grad_norm": 1.5244169148841136,
      "learning_rate": 6.579368850944683e-05,
      "loss": 0.7518,
      "step": 51
    },
    {
      "epoch": 1.7432150313152401,
      "grad_norm": 0.97500410064134,
      "learning_rate": 6.504740790153255e-05,
      "loss": 0.7365,
      "step": 52
    },
    {
      "epoch": 1.7766179540709812,
      "grad_norm": 1.833109071141926,
      "learning_rate": 6.428650031274845e-05,
      "loss": 0.7327,
      "step": 53
    },
    {
      "epoch": 1.8100208768267223,
      "grad_norm": 1.4707510085946327,
      "learning_rate": 6.351141009169893e-05,
      "loss": 0.7227,
      "step": 54
    },
    {
      "epoch": 1.8434237995824634,
      "grad_norm": 1.2363917202252765,
      "learning_rate": 6.272258986924624e-05,
      "loss": 0.7405,
      "step": 55
    },
    {
      "epoch": 1.8768267223382047,
      "grad_norm": 1.0298920741813498,
      "learning_rate": 6.192050029418682e-05,
      "loss": 0.7241,
      "step": 56
    },
    {
      "epoch": 1.9102296450939458,
      "grad_norm": 0.9097363351471279,
      "learning_rate": 6.110560976424531e-05,
      "loss": 0.7167,
      "step": 57
    },
    {
      "epoch": 1.9436325678496869,
      "grad_norm": 0.8471695952793523,
      "learning_rate": 6.027839415254362e-05,
      "loss": 0.7181,
      "step": 58
    },
    {
      "epoch": 1.977035490605428,
      "grad_norm": 0.6602662698524506,
      "learning_rate": 5.943933652970424e-05,
      "loss": 0.7088,
      "step": 59
    },
    {
      "epoch": 2.0167014613778704,
      "grad_norm": 0.624041177687339,
      "learning_rate": 5.858892688175075e-05,
      "loss": 0.6922,
      "step": 60
    },
    {
      "epoch": 2.0501043841336117,
      "grad_norm": 0.731560229530671,
      "learning_rate": 5.772766182396966e-05,
      "loss": 0.6655,
      "step": 61
    },
    {
      "epoch": 2.083507306889353,
      "grad_norm": 0.5160825456760252,
      "learning_rate": 5.685604431090117e-05,
      "loss": 0.6624,
      "step": 62
    },
    {
      "epoch": 2.116910229645094,
      "grad_norm": 0.6466642583190281,
      "learning_rate": 5.597458334262782e-05,
      "loss": 0.6474,
      "step": 63
    },
    {
      "epoch": 2.150313152400835,
      "grad_norm": 0.6905839273768964,
      "learning_rate": 5.508379366753282e-05,
      "loss": 0.6512,
      "step": 64
    },
    {
      "epoch": 2.183716075156576,
      "grad_norm": 0.3760316450742919,
      "learning_rate": 5.4184195481701425e-05,
      "loss": 0.6523,
      "step": 65
    },
    {
      "epoch": 2.2171189979123174,
      "grad_norm": 0.606234562718693,
      "learning_rate": 5.3276314125141144e-05,
      "loss": 0.6487,
      "step": 66
    },
    {
      "epoch": 2.2505219206680582,
      "grad_norm": 0.44809718292050676,
      "learning_rate": 5.23606797749979e-05,
      "loss": 0.649,
      "step": 67
    },
    {
      "epoch": 2.2839248434237995,
      "grad_norm": 0.40244410097202155,
      "learning_rate": 5.1437827135947566e-05,
      "loss": 0.6468,
      "step": 68
    },
    {
      "epoch": 2.317327766179541,
      "grad_norm": 0.359719180741915,
      "learning_rate": 5.050829512794348e-05,
      "loss": 0.6409,
      "step": 69
    },
    {
      "epoch": 2.3507306889352817,
      "grad_norm": 0.40415638024369727,
      "learning_rate": 4.9572626571502316e-05,
      "loss": 0.639,
      "step": 70
    },
    {
      "epoch": 2.384133611691023,
      "grad_norm": 0.3340843503248373,
      "learning_rate": 4.8631367870712254e-05,
      "loss": 0.6326,
      "step": 71
    },
    {
      "epoch": 2.4175365344467643,
      "grad_norm": 0.3262882595570267,
      "learning_rate": 4.768506869414834e-05,
      "loss": 0.6298,
      "step": 72
    },
    {
      "epoch": 2.450939457202505,
      "grad_norm": 0.3253891492249243,
      "learning_rate": 4.6734281653881536e-05,
      "loss": 0.6326,
      "step": 73
    },
    {
      "epoch": 2.4843423799582465,
      "grad_norm": 0.35311540573233735,
      "learning_rate": 4.577956198276886e-05,
      "loss": 0.6291,
      "step": 74
    },
    {
      "epoch": 2.5177453027139873,
      "grad_norm": 0.3440383701499157,
      "learning_rate": 4.4821467210212924e-05,
      "loss": 0.6332,
      "step": 75
    },
    {
      "epoch": 2.5511482254697286,
      "grad_norm": 0.30978369513311105,
      "learning_rate": 4.386055683658061e-05,
      "loss": 0.6408,
      "step": 76
    },
    {
      "epoch": 2.5845511482254695,
      "grad_norm": 0.3823149004105222,
      "learning_rate": 4.2897392006470503e-05,
      "loss": 0.6246,
      "step": 77
    },
    {
      "epoch": 2.617954070981211,
      "grad_norm": 0.2810880790539587,
      "learning_rate": 4.1932535181020286e-05,
      "loss": 0.6293,
      "step": 78
    },
    {
      "epoch": 2.651356993736952,
      "grad_norm": 0.2835535239751324,
      "learning_rate": 4.096654980944529e-05,
      "loss": 0.6252,
      "step": 79
    },
    {
      "epoch": 2.684759916492693,
      "grad_norm": 0.336833154001104,
      "learning_rate": 4e-05,
      "loss": 0.6305,
      "step": 80
    },
    {
      "epoch": 2.7181628392484343,
      "grad_norm": 0.23274589850456745,
      "learning_rate": 3.903345019055472e-05,
      "loss": 0.6298,
      "step": 81
    },
    {
      "epoch": 2.7515657620041756,
      "grad_norm": 0.2420684628004819,
      "learning_rate": 3.806746481897973e-05,
      "loss": 0.6241,
      "step": 82
    },
    {
      "epoch": 2.7849686847599164,
      "grad_norm": 0.23622928619950834,
      "learning_rate": 3.710260799352951e-05,
      "loss": 0.6167,
      "step": 83
    },
    {
      "epoch": 2.8183716075156577,
      "grad_norm": 0.21286687906297902,
      "learning_rate": 3.6139443163419394e-05,
      "loss": 0.6268,
      "step": 84
    },
    {
      "epoch": 2.8517745302713986,
      "grad_norm": 0.20113400910479923,
      "learning_rate": 3.517853278978708e-05,
      "loss": 0.622,
      "step": 85
    },
    {
      "epoch": 2.88517745302714,
      "grad_norm": 0.19296938971649688,
      "learning_rate": 3.422043801723116e-05,
      "loss": 0.6167,
      "step": 86
    },
    {
      "epoch": 2.9185803757828808,
      "grad_norm": 0.17640926051127553,
      "learning_rate": 3.3265718346118464e-05,
      "loss": 0.6251,
      "step": 87
    },
    {
      "epoch": 2.951983298538622,
      "grad_norm": 0.17760201524918323,
      "learning_rate": 3.231493130585167e-05,
      "loss": 0.6195,
      "step": 88
    },
    {
      "epoch": 2.9853862212943634,
      "grad_norm": 0.18267169419590248,
      "learning_rate": 3.136863212928776e-05,
      "loss": 0.6214,
      "step": 89
    },
    {
      "epoch": 3.1002087682672235,
      "grad_norm": 0.2479134339023779,
      "learning_rate": 3.0427373428497704e-05,
      "loss": 0.5792,
      "step": 90
    },
    {
      "epoch": 3.1336116910229643,
      "grad_norm": 0.17829804990091588,
      "learning_rate": 2.9491704872056525e-05,
      "loss": 0.571,
      "step": 91
    },
    {
      "epoch": 3.1670146137787056,
      "grad_norm": 0.2102957726786887,
      "learning_rate": 2.8562172864052437e-05,
      "loss": 0.5665,
      "step": 92
    },
    {
      "epoch": 3.200417536534447,
      "grad_norm": 0.18138996143773695,
      "learning_rate": 2.7639320225002108e-05,
      "loss": 0.5734,
      "step": 93
    },
    {
      "epoch": 3.233820459290188,
      "grad_norm": 0.18231114685106467,
      "learning_rate": 2.6723685874858873e-05,
      "loss": 0.5665,
      "step": 94
    },
    {
      "epoch": 3.267223382045929,
      "grad_norm": 0.1891068826294468,
      "learning_rate": 2.5815804518298575e-05,
      "loss": 0.5649,
      "step": 95
    },
    {
      "epoch": 3.30062630480167,
      "grad_norm": 0.1449193634467542,
      "learning_rate": 2.4916206332467184e-05,
      "loss": 0.5626,
      "step": 96
    },
    {
      "epoch": 3.3340292275574113,
      "grad_norm": 0.17521384625415576,
      "learning_rate": 2.4025416657372186e-05,
      "loss": 0.5672,
      "step": 97
    },
    {
      "epoch": 3.3674321503131526,
      "grad_norm": 0.17060274829594732,
      "learning_rate": 2.3143955689098844e-05,
      "loss": 0.5701,
      "step": 98
    },
    {
      "epoch": 3.4008350730688934,
      "grad_norm": 0.16427792254004098,
      "learning_rate": 2.2272338176030354e-05,
      "loss": 0.5648,
      "step": 99
    },
    {
      "epoch": 3.4342379958246347,
      "grad_norm": 0.16851785214921267,
      "learning_rate": 2.141107311824926e-05,
      "loss": 0.5637,
      "step": 100
    },
    {
      "epoch": 3.4676409185803756,
      "grad_norm": 0.1647295715319099,
      "learning_rate": 2.056066347029576e-05,
      "loss": 0.5698,
      "step": 101
    },
    {
      "epoch": 3.501043841336117,
      "grad_norm": 0.14383360405355872,
      "learning_rate": 1.9721605847456397e-05,
      "loss": 0.5678,
      "step": 102
    },
    {
      "epoch": 3.534446764091858,
      "grad_norm": 0.16369393007489977,
      "learning_rate": 1.8894390235754686e-05,
      "loss": 0.5687,
      "step": 103
    },
    {
      "epoch": 3.567849686847599,
      "grad_norm": 0.1484533364671656,
      "learning_rate": 1.807949970581321e-05,
      "loss": 0.5612,
      "step": 104
    },
    {
      "epoch": 3.6012526096033404,
      "grad_norm": 0.13327395767499348,
      "learning_rate": 1.7277410130753775e-05,
      "loss": 0.5621,
      "step": 105
    },
    {
      "epoch": 3.6346555323590817,
      "grad_norm": 0.14483989970799924,
      "learning_rate": 1.648858990830108e-05,
      "loss": 0.5602,
      "step": 106
    },
    {
      "epoch": 3.6680584551148225,
      "grad_norm": 0.11501467177953302,
      "learning_rate": 1.5713499687251554e-05,
      "loss": 0.5625,
      "step": 107
    },
    {
      "epoch": 3.701461377870564,
      "grad_norm": 0.12471522633663724,
      "learning_rate": 1.4952592098467453e-05,
      "loss": 0.5566,
      "step": 108
    },
    {
      "epoch": 3.7348643006263047,
      "grad_norm": 0.12841415317626956,
      "learning_rate": 1.4206311490553187e-05,
      "loss": 0.5563,
      "step": 109
    },
    {
      "epoch": 3.768267223382046,
      "grad_norm": 0.13024977809323665,
      "learning_rate": 1.3475093670368202e-05,
      "loss": 0.5642,
      "step": 110
    },
    {
      "epoch": 3.801670146137787,
      "grad_norm": 0.12141142140280577,
      "learning_rate": 1.275936564852811e-05,
      "loss": 0.5619,
      "step": 111
    },
    {
      "epoch": 3.835073068893528,
      "grad_norm": 0.1189681977822036,
      "learning_rate": 1.2059545390042526e-05,
      "loss": 0.5627,
      "step": 112
    },
    {
      "epoch": 3.8684759916492695,
      "grad_norm": 0.11637565722872692,
      "learning_rate": 1.1376041570235162e-05,
      "loss": 0.5597,
      "step": 113
    },
    {
      "epoch": 3.9018789144050103,
      "grad_norm": 0.11126444342562675,
      "learning_rate": 1.070925333608907e-05,
      "loss": 0.5646,
      "step": 114
    },
    {
      "epoch": 3.9352818371607516,
      "grad_norm": 0.11144727795080511,
      "learning_rate": 1.0059570073155953e-05,
      "loss": 0.5663,
      "step": 115
    },
    {
      "epoch": 3.968684759916493,
      "grad_norm": 0.11568625785765184,
      "learning_rate": 9.427371178166065e-06,
      "loss": 0.5628,
      "step": 116
    },
    {
      "epoch": 4.002087682672234,
      "grad_norm": 0.1172051146855964,
      "learning_rate": 8.81302583747111e-06,
      "loss": 0.5657,
      "step": 117
    },
    {
      "epoch": 4.035490605427975,
      "grad_norm": 0.1386962246589997,
      "learning_rate": 8.216892811449834e-06,
      "loss": 0.5431,
      "step": 118
    },
    {
      "epoch": 4.068893528183716,
      "grad_norm": 0.12227065820674828,
      "learning_rate": 7.639320225002106e-06,
      "loss": 0.5386,
      "step": 119
    },
    {
      "epoch": 4.102296450939457,
      "grad_norm": 0.11676189189940173,
      "learning_rate": 7.080645364253747e-06,
      "loss": 0.5341,
      "step": 120
    },
    {
      "epoch": 4.135699373695198,
      "grad_norm": 0.1086671391408473,
      "learning_rate": 6.541194479590931e-06,
      "loss": 0.5472,
      "step": 121
    },
    {
      "epoch": 4.16910229645094,
      "grad_norm": 0.11780635228612878,
      "learning_rate": 6.021282595139167e-06,
      "loss": 0.5376,
      "step": 122
    },
    {
      "epoch": 4.202505219206681,
      "grad_norm": 0.11122819389457546,
      "learning_rate": 5.521213324798029e-06,
      "loss": 0.5405,
      "step": 123
    },
    {
      "epoch": 4.235908141962422,
      "grad_norm": 0.11302950909483094,
      "learning_rate": 5.0412786949392845e-06,
      "loss": 0.5389,
      "step": 124
    },
    {
      "epoch": 4.2693110647181625,
      "grad_norm": 0.10897006989347469,
      "learning_rate": 4.581758973871609e-06,
      "loss": 0.5443,
      "step": 125
    },
    {
      "epoch": 4.302713987473904,
      "grad_norm": 0.10317913683812792,
      "learning_rate": 4.142922508171849e-06,
      "loss": 0.5363,
      "step": 126
    },
    {
      "epoch": 4.336116910229645,
      "grad_norm": 0.10003486708202455,
      "learning_rate": 3.7250255659781844e-06,
      "loss": 0.5364,
      "step": 127
    },
    {
      "epoch": 4.369519832985386,
      "grad_norm": 0.1071731871255614,
      "learning_rate": 3.3283121873367043e-06,
      "loss": 0.5432,
      "step": 128
    },
    {
      "epoch": 4.402922755741128,
      "grad_norm": 0.10539727253992291,
      "learning_rate": 2.9530140416889465e-06,
      "loss": 0.5373,
      "step": 129
    },
    {
      "epoch": 4.4363256784968685,
      "grad_norm": 0.09606764766200912,
      "learning_rate": 2.5993502925834115e-06,
      "loss": 0.5333,
      "step": 130
    },
    {
      "epoch": 4.469728601252609,
      "grad_norm": 0.09219554498498256,
      "learning_rate": 2.2675274696902737e-06,
      "loss": 0.5315,
      "step": 131
    },
    {
      "epoch": 4.503131524008351,
      "grad_norm": 0.08960509523269163,
      "learning_rate": 1.957739348193859e-06,
      "loss": 0.5334,
      "step": 132
    },
    {
      "epoch": 4.536534446764092,
      "grad_norm": 0.09160224849384657,
      "learning_rate": 1.670166835633351e-06,
      "loss": 0.5384,
      "step": 133
    },
    {
      "epoch": 4.569937369519833,
      "grad_norm": 0.08852713488345453,
      "learning_rate": 1.4049778662579462e-06,
      "loss": 0.53,
      "step": 134
    },
    {
      "epoch": 4.603340292275574,
      "grad_norm": 0.09132315616256415,
      "learning_rate": 1.1623273029579195e-06,
      "loss": 0.538,
      "step": 135
    },
    {
      "epoch": 4.6367432150313155,
      "grad_norm": 0.09194633813127549,
      "learning_rate": 9.423568468291156e-07,
      "loss": 0.541,
      "step": 136
    },
    {
      "epoch": 4.670146137787056,
      "grad_norm": 0.09237238332398756,
      "learning_rate": 7.451949544234627e-07,
      "loss": 0.5379,
      "step": 137
    },
    {
      "epoch": 4.703549060542797,
      "grad_norm": 0.08974432368849375,
      "learning_rate": 5.709567627339674e-07,
      "loss": 0.5443,
      "step": 138
    },
    {
      "epoch": 4.736951983298539,
      "grad_norm": 0.09013165116820136,
      "learning_rate": 4.1974402195795514e-07,
      "loss": 0.535,
      "step": 139
    },
    {
      "epoch": 4.77035490605428,
      "grad_norm": 0.09022158123863006,
      "learning_rate": 2.916450360778411e-07,
      "loss": 0.5333,
      "step": 140
    },
    {
      "epoch": 4.803757828810021,
      "grad_norm": 0.08981410735542258,
      "learning_rate": 1.867346112940549e-07,
      "loss": 0.5462,
      "step": 141
    },
    {
      "epoch": 4.8371607515657615,
      "grad_norm": 0.09141559902413697,
      "learning_rate": 1.0507401234035819e-07,
      "loss": 0.5377,
      "step": 142
    },
    {
      "epoch": 4.870563674321503,
      "grad_norm": 0.08951832053531815,
      "learning_rate": 4.6710926706934336e-08,
      "loss": 0.5305,
      "step": 143
    },
    {
      "epoch": 4.903966597077244,
      "grad_norm": 0.08926873218925392,
      "learning_rate": 1.1679436792282339e-08,
      "loss": 0.54,
      "step": 144
    },
    {
      "epoch": 4.937369519832985,
      "grad_norm": 0.08723888890212035,
      "learning_rate": 0.0,
      "loss": 0.54,
      "step": 145
    },
    {
      "epoch": 4.937369519832985,
      "step": 145,
      "total_flos": 3.738141667979428e+18,
      "train_loss": 0.2130410626016814,
      "train_runtime": 6079.0514,
      "train_samples_per_second": 12.591,
      "train_steps_per_second": 0.024
    }
  ],
  "logging_steps": 1,
  "max_steps": 145,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.738141667979428e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}