simonycl's picture
Upload folder using huggingface_hub
cf126a6 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990762978015888,
"eval_steps": 400,
"global_step": 507,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001970564689943962,
"grad_norm": 3.539861305637653,
"learning_rate": 9.803921568627451e-09,
"logits/chosen": -0.03196336328983307,
"logits/rejected": -0.15967734158039093,
"logps/chosen": -99.96153259277344,
"logps/rejected": -93.94828033447266,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.00985282344971981,
"grad_norm": 3.74004975366523,
"learning_rate": 4.901960784313725e-08,
"logits/chosen": -0.042198292911052704,
"logits/rejected": -0.34676456451416016,
"logps/chosen": -112.20402526855469,
"logps/rejected": -101.837646484375,
"loss": 0.6932,
"rewards/accuracies": 0.421875,
"rewards/chosen": 0.0011955354129895568,
"rewards/margins": 0.0004524323157966137,
"rewards/rejected": 0.0007431029807776213,
"step": 5
},
{
"epoch": 0.01970564689943962,
"grad_norm": 3.5769285168326417,
"learning_rate": 9.80392156862745e-08,
"logits/chosen": -0.16411139070987701,
"logits/rejected": -0.29173845052719116,
"logps/chosen": -94.1719741821289,
"logps/rejected": -96.02313232421875,
"loss": 0.6933,
"rewards/accuracies": 0.4375,
"rewards/chosen": 4.2173640395049006e-05,
"rewards/margins": -0.001043520518578589,
"rewards/rejected": 0.001085694064386189,
"step": 10
},
{
"epoch": 0.02955847034915943,
"grad_norm": 3.3653465070579673,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": -0.07717452943325043,
"logits/rejected": -0.3865337371826172,
"logps/chosen": -100.953125,
"logps/rejected": -92.61229705810547,
"loss": 0.6931,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.00025451680994592607,
"rewards/margins": -0.0001873960136435926,
"rewards/rejected": -6.712078902637586e-05,
"step": 15
},
{
"epoch": 0.03941129379887924,
"grad_norm": 3.373649973394512,
"learning_rate": 1.96078431372549e-07,
"logits/chosen": -0.0971444696187973,
"logits/rejected": -0.35578858852386475,
"logps/chosen": -106.3487319946289,
"logps/rejected": -102.8326416015625,
"loss": 0.6929,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.0003346280718687922,
"rewards/margins": 0.0001382694172207266,
"rewards/rejected": -0.00047289757640101016,
"step": 20
},
{
"epoch": 0.049264117248599054,
"grad_norm": 3.224755352064536,
"learning_rate": 2.4509803921568627e-07,
"logits/chosen": -0.10794611275196075,
"logits/rejected": -0.29162880778312683,
"logps/chosen": -99.07095336914062,
"logps/rejected": -95.20055389404297,
"loss": 0.6927,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.00042314338497817516,
"rewards/margins": 0.0012360246619209647,
"rewards/rejected": -0.0008128813351504505,
"step": 25
},
{
"epoch": 0.05911694069831886,
"grad_norm": 3.444342352448875,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -0.12300117313861847,
"logits/rejected": -0.27830368280410767,
"logps/chosen": -105.83805847167969,
"logps/rejected": -104.1891860961914,
"loss": 0.6914,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.002045437227934599,
"rewards/margins": 0.004656647797673941,
"rewards/rejected": -0.002611211035400629,
"step": 30
},
{
"epoch": 0.06896976414803867,
"grad_norm": 3.5299163808469496,
"learning_rate": 3.431372549019608e-07,
"logits/chosen": -0.03614411875605583,
"logits/rejected": -0.3109976053237915,
"logps/chosen": -99.59745788574219,
"logps/rejected": -98.72537231445312,
"loss": 0.6903,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.000885780609678477,
"rewards/margins": 0.0061457473784685135,
"rewards/rejected": -0.005259966477751732,
"step": 35
},
{
"epoch": 0.07882258759775848,
"grad_norm": 3.9275316344127242,
"learning_rate": 3.92156862745098e-07,
"logits/chosen": -0.08717192709445953,
"logits/rejected": -0.30741095542907715,
"logps/chosen": -99.77064514160156,
"logps/rejected": -96.07014465332031,
"loss": 0.6882,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.0018755150958895683,
"rewards/margins": 0.007984376512467861,
"rewards/rejected": -0.00985989160835743,
"step": 40
},
{
"epoch": 0.0886754110474783,
"grad_norm": 3.5883285534109493,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": -0.032524555921554565,
"logits/rejected": -0.3046508729457855,
"logps/chosen": -96.41732788085938,
"logps/rejected": -95.6390380859375,
"loss": 0.6842,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.005956724286079407,
"rewards/margins": 0.01741139218211174,
"rewards/rejected": -0.023368116468191147,
"step": 45
},
{
"epoch": 0.09852823449719811,
"grad_norm": 3.3794927211727392,
"learning_rate": 4.901960784313725e-07,
"logits/chosen": -0.12240082025527954,
"logits/rejected": -0.28499796986579895,
"logps/chosen": -106.6746597290039,
"logps/rejected": -104.4252700805664,
"loss": 0.6811,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.017990436404943466,
"rewards/margins": 0.024439355358481407,
"rewards/rejected": -0.042429789900779724,
"step": 50
},
{
"epoch": 0.10838105794691791,
"grad_norm": 3.8844805866458234,
"learning_rate": 4.999050767562379e-07,
"logits/chosen": -0.06140371039509773,
"logits/rejected": -0.36449694633483887,
"logps/chosen": -112.74903869628906,
"logps/rejected": -107.06253814697266,
"loss": 0.6729,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.048891425132751465,
"rewards/margins": 0.04517052322626114,
"rewards/rejected": -0.0940619483590126,
"step": 55
},
{
"epoch": 0.11823388139663772,
"grad_norm": 4.246075688346676,
"learning_rate": 4.99519574616467e-07,
"logits/chosen": -0.08420858532190323,
"logits/rejected": -0.2296031415462494,
"logps/chosen": -106.52708435058594,
"logps/rejected": -114.71268463134766,
"loss": 0.6598,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.0871507003903389,
"rewards/margins": 0.07720854133367538,
"rewards/rejected": -0.16435924172401428,
"step": 60
},
{
"epoch": 0.12808670484635754,
"grad_norm": 4.078056870628684,
"learning_rate": 4.988380179235842e-07,
"logits/chosen": -0.04726668819785118,
"logits/rejected": -0.2177656590938568,
"logps/chosen": -117.36442565917969,
"logps/rejected": -122.31159973144531,
"loss": 0.6488,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.15060417354106903,
"rewards/margins": 0.0838489979505539,
"rewards/rejected": -0.23445317149162292,
"step": 65
},
{
"epoch": 0.13793952829607734,
"grad_norm": 3.820719733352528,
"learning_rate": 4.978612153434526e-07,
"logits/chosen": -0.05979006737470627,
"logits/rejected": -0.19735023379325867,
"logps/chosen": -124.60560607910156,
"logps/rejected": -155.1808624267578,
"loss": 0.6318,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2229953557252884,
"rewards/margins": 0.31397417187690735,
"rewards/rejected": -0.5369695425033569,
"step": 70
},
{
"epoch": 0.14779235174579716,
"grad_norm": 5.031693747430946,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -0.059552647173404694,
"logits/rejected": -0.19415248930454254,
"logps/chosen": -136.63735961914062,
"logps/rejected": -191.6803741455078,
"loss": 0.6086,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.35170239210128784,
"rewards/margins": 0.49619174003601074,
"rewards/rejected": -0.8478941917419434,
"step": 75
},
{
"epoch": 0.15764517519551696,
"grad_norm": 5.160412651014448,
"learning_rate": 4.950268573535011e-07,
"logits/chosen": -0.0038110867608338594,
"logits/rejected": -0.15630409121513367,
"logps/chosen": -147.3404083251953,
"logps/rejected": -173.90728759765625,
"loss": 0.5967,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.4520920217037201,
"rewards/margins": 0.27063217759132385,
"rewards/rejected": -0.7227243185043335,
"step": 80
},
{
"epoch": 0.16749799864523676,
"grad_norm": 6.145536461291785,
"learning_rate": 4.93172664904641e-07,
"logits/chosen": 0.01297803781926632,
"logits/rejected": -0.1908242255449295,
"logps/chosen": -167.2682647705078,
"logps/rejected": -198.43177795410156,
"loss": 0.5701,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.589021623134613,
"rewards/margins": 0.3708820641040802,
"rewards/rejected": -0.9599035978317261,
"step": 85
},
{
"epoch": 0.1773508220949566,
"grad_norm": 60.6627948159201,
"learning_rate": 4.910299485003033e-07,
"logits/chosen": -0.007486692164093256,
"logits/rejected": -0.18025372922420502,
"logps/chosen": -183.25140380859375,
"logps/rejected": -288.912109375,
"loss": 0.5755,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8218961954116821,
"rewards/margins": 1.0762311220169067,
"rewards/rejected": -1.8981273174285889,
"step": 90
},
{
"epoch": 0.1872036455446764,
"grad_norm": 6.619891594866407,
"learning_rate": 4.886012504698769e-07,
"logits/chosen": -0.00334315188229084,
"logits/rejected": -0.3066111207008362,
"logps/chosen": -222.06753540039062,
"logps/rejected": -272.4456481933594,
"loss": 0.5397,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1969377994537354,
"rewards/margins": 0.5773923397064209,
"rewards/rejected": -1.7743301391601562,
"step": 95
},
{
"epoch": 0.19705646899439622,
"grad_norm": 6.740434195327616,
"learning_rate": 4.858894524594652e-07,
"logits/chosen": -0.1177954450249672,
"logits/rejected": -0.3193029761314392,
"logps/chosen": -216.84396362304688,
"logps/rejected": -431.8846740722656,
"loss": 0.5071,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.120563268661499,
"rewards/margins": 2.212986707687378,
"rewards/rejected": -3.333550214767456,
"step": 100
},
{
"epoch": 0.20690929244411602,
"grad_norm": 7.617693374089276,
"learning_rate": 4.828977720128198e-07,
"logits/chosen": -0.1505376100540161,
"logits/rejected": -0.3399549126625061,
"logps/chosen": -235.60617065429688,
"logps/rejected": -358.7390441894531,
"loss": 0.497,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.3088951110839844,
"rewards/margins": 1.3302855491638184,
"rewards/rejected": -2.639180898666382,
"step": 105
},
{
"epoch": 0.21676211589383582,
"grad_norm": 7.105788785064987,
"learning_rate": 4.796297587537285e-07,
"logits/chosen": -0.1506040096282959,
"logits/rejected": -0.3250656723976135,
"logps/chosen": -257.82464599609375,
"logps/rejected": -416.75958251953125,
"loss": 0.4453,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5325710773468018,
"rewards/margins": 1.6099077463150024,
"rewards/rejected": -3.1424789428710938,
"step": 110
},
{
"epoch": 0.22661493934355564,
"grad_norm": 8.60769452133117,
"learning_rate": 4.760892901743944e-07,
"logits/chosen": -0.13159573078155518,
"logits/rejected": -0.3428110182285309,
"logps/chosen": -300.0444030761719,
"logps/rejected": -514.811279296875,
"loss": 0.4607,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.9326972961425781,
"rewards/margins": 2.217697858810425,
"rewards/rejected": -4.150395393371582,
"step": 115
},
{
"epoch": 0.23646776279327544,
"grad_norm": 10.209138794878942,
"learning_rate": 4.7228056703479626e-07,
"logits/chosen": -0.16759036481380463,
"logits/rejected": -0.3990747332572937,
"logps/chosen": -299.72705078125,
"logps/rejected": -433.87823486328125,
"loss": 0.4381,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.9569202661514282,
"rewards/margins": 1.37308669090271,
"rewards/rejected": -3.3300068378448486,
"step": 120
},
{
"epoch": 0.24632058624299527,
"grad_norm": 28.609743111268017,
"learning_rate": 4.6820810837849535e-07,
"logits/chosen": -0.20535437762737274,
"logits/rejected": -0.4542999267578125,
"logps/chosen": -307.52362060546875,
"logps/rejected": -520.55322265625,
"loss": 0.4403,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.0457329750061035,
"rewards/margins": 2.181645393371582,
"rewards/rejected": -4.227377891540527,
"step": 125
},
{
"epoch": 0.25617340969271507,
"grad_norm": 11.650217681846684,
"learning_rate": 4.63876746170797e-07,
"logits/chosen": -0.23113389313220978,
"logits/rejected": -0.47989240288734436,
"logps/chosen": -344.0136413574219,
"logps/rejected": -512.1621704101562,
"loss": 0.474,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.392538547515869,
"rewards/margins": 1.7528173923492432,
"rewards/rejected": -4.145355701446533,
"step": 130
},
{
"epoch": 0.2660262331424349,
"grad_norm": 12.280394789341818,
"learning_rate": 4.592916195656321e-07,
"logits/chosen": -0.2849624454975128,
"logits/rejected": -0.4641537070274353,
"logps/chosen": -360.0565490722656,
"logps/rejected": -593.3041381835938,
"loss": 0.4143,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.587618112564087,
"rewards/margins": 2.348435401916504,
"rewards/rejected": -4.936053276062012,
"step": 135
},
{
"epoch": 0.27587905659215467,
"grad_norm": 11.781977035196428,
"learning_rate": 4.544581688079602e-07,
"logits/chosen": -0.27669793367385864,
"logits/rejected": -0.4249224066734314,
"logps/chosen": -346.13238525390625,
"logps/rejected": -542.8922119140625,
"loss": 0.3764,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.43330454826355,
"rewards/margins": 1.9707410335540771,
"rewards/rejected": -4.404046058654785,
"step": 140
},
{
"epoch": 0.2857318800418745,
"grad_norm": 12.725521218797944,
"learning_rate": 4.493821287789272e-07,
"logits/chosen": -0.29486262798309326,
"logits/rejected": -0.45644649863243103,
"logps/chosen": -350.2372131347656,
"logps/rejected": -533.0991821289062,
"loss": 0.3874,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.510737657546997,
"rewards/margins": 1.8067106008529663,
"rewards/rejected": -4.317447662353516,
"step": 145
},
{
"epoch": 0.2955847034915943,
"grad_norm": 10.47948759187309,
"learning_rate": 4.4406952219143934e-07,
"logits/chosen": -0.20662228763103485,
"logits/rejected": -0.47787055373191833,
"logps/chosen": -331.3938903808594,
"logps/rejected": -496.76922607421875,
"loss": 0.3901,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2830259799957275,
"rewards/margins": 1.7167097330093384,
"rewards/rejected": -3.9997353553771973,
"step": 150
},
{
"epoch": 0.3054375269413141,
"grad_norm": 9.243554014229469,
"learning_rate": 4.38526652444224e-07,
"logits/chosen": -0.23520083725452423,
"logits/rejected": -0.45836538076400757,
"logps/chosen": -338.83935546875,
"logps/rejected": -600.7113037109375,
"loss": 0.3776,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.335677146911621,
"rewards/margins": 2.6365208625793457,
"rewards/rejected": -4.972198009490967,
"step": 155
},
{
"epoch": 0.3152903503910339,
"grad_norm": 16.302237845544973,
"learning_rate": 4.3276009614285824e-07,
"logits/chosen": -0.27397865056991577,
"logits/rejected": -0.4509497582912445,
"logps/chosen": -340.3817443847656,
"logps/rejected": -519.3179321289062,
"loss": 0.373,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.389813184738159,
"rewards/margins": 1.820207953453064,
"rewards/rejected": -4.210021018981934,
"step": 160
},
{
"epoch": 0.32514317384075375,
"grad_norm": 12.790316412360088,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -0.3117635250091553,
"logits/rejected": -0.5069926977157593,
"logps/chosen": -408.1647033691406,
"logps/rejected": -684.1016845703125,
"loss": 0.3554,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.9762721061706543,
"rewards/margins": 2.816100597381592,
"rewards/rejected": -5.792372703552246,
"step": 165
},
{
"epoch": 0.3349959972904735,
"grad_norm": 12.163515409448873,
"learning_rate": 4.2058354920054043e-07,
"logits/chosen": -0.32501062750816345,
"logits/rejected": -0.4444299340248108,
"logps/chosen": -392.1680603027344,
"logps/rejected": -582.3343505859375,
"loss": 0.3309,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.916874885559082,
"rewards/margins": 1.9237887859344482,
"rewards/rejected": -4.840663909912109,
"step": 170
},
{
"epoch": 0.34484882074019335,
"grad_norm": 16.509396763840595,
"learning_rate": 4.141880060119336e-07,
"logits/chosen": -0.30287298560142517,
"logits/rejected": -0.5002428293228149,
"logps/chosen": -392.53900146484375,
"logps/rejected": -621.8858642578125,
"loss": 0.3799,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.818398952484131,
"rewards/margins": 2.3253607749938965,
"rewards/rejected": -5.143759250640869,
"step": 175
},
{
"epoch": 0.3547016441899132,
"grad_norm": 19.861599589919685,
"learning_rate": 4.0759765403198877e-07,
"logits/chosen": -0.2992296814918518,
"logits/rejected": -0.46128687262535095,
"logps/chosen": -369.04058837890625,
"logps/rejected": -594.5716552734375,
"loss": 0.3318,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.6274733543395996,
"rewards/margins": 2.262349843978882,
"rewards/rejected": -4.889822959899902,
"step": 180
},
{
"epoch": 0.364554467639633,
"grad_norm": 20.974452355570566,
"learning_rate": 4.008203127021797e-07,
"logits/chosen": -0.25085026025772095,
"logits/rejected": -0.46279406547546387,
"logps/chosen": -358.7597351074219,
"logps/rejected": -571.667724609375,
"loss": 0.3218,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -2.5963218212127686,
"rewards/margins": 2.143275022506714,
"rewards/rejected": -4.739596843719482,
"step": 185
},
{
"epoch": 0.3744072910893528,
"grad_norm": 14.073373162388602,
"learning_rate": 3.9386402332652754e-07,
"logits/chosen": -0.23608064651489258,
"logits/rejected": -0.5011879205703735,
"logps/chosen": -423.06573486328125,
"logps/rejected": -701.5946044921875,
"loss": 0.3171,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.2071080207824707,
"rewards/margins": 2.7721924781799316,
"rewards/rejected": -5.979300498962402,
"step": 190
},
{
"epoch": 0.3842601145390726,
"grad_norm": 22.462855346029198,
"learning_rate": 3.867370395306068e-07,
"logits/chosen": -0.21024306118488312,
"logits/rejected": -0.48892831802368164,
"logps/chosen": -421.2330627441406,
"logps/rejected": -660.6759643554688,
"loss": 0.3264,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.2450497150421143,
"rewards/margins": 2.429110527038574,
"rewards/rejected": -5.674160003662109,
"step": 195
},
{
"epoch": 0.39411293798879243,
"grad_norm": 12.932519297904385,
"learning_rate": 3.794478174686328e-07,
"logits/chosen": -0.29106825590133667,
"logits/rejected": -0.4926213324069977,
"logps/chosen": -411.04840087890625,
"logps/rejected": -645.6761474609375,
"loss": 0.344,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.0619587898254395,
"rewards/margins": 2.4115443229675293,
"rewards/rejected": -5.473503112792969,
"step": 200
},
{
"epoch": 0.4039657614385122,
"grad_norm": 12.885808088365131,
"learning_rate": 3.720050057902495e-07,
"logits/chosen": -0.28087863326072693,
"logits/rejected": -0.5519760847091675,
"logps/chosen": -393.69659423828125,
"logps/rejected": -613.2288208007812,
"loss": 0.3363,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.869983196258545,
"rewards/margins": 2.265076160430908,
"rewards/rejected": -5.135059356689453,
"step": 205
},
{
"epoch": 0.41381858488823203,
"grad_norm": 16.468484087079414,
"learning_rate": 3.644174353789204e-07,
"logits/chosen": -0.3173820376396179,
"logits/rejected": -0.41476958990097046,
"logps/chosen": -439.092529296875,
"logps/rejected": -734.161865234375,
"loss": 0.2861,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.3481502532958984,
"rewards/margins": 2.985670566558838,
"rewards/rejected": -6.3338212966918945,
"step": 210
},
{
"epoch": 0.42367140833795186,
"grad_norm": 22.743564527030898,
"learning_rate": 3.566941088741009e-07,
"logits/chosen": -0.32451528310775757,
"logits/rejected": -0.4603727459907532,
"logps/chosen": -506.1102600097656,
"logps/rejected": -843.5612182617188,
"loss": 0.3019,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.0731306076049805,
"rewards/margins": 3.3347747325897217,
"rewards/rejected": -7.407905578613281,
"step": 215
},
{
"epoch": 0.43352423178767163,
"grad_norm": 13.898026449525894,
"learning_rate": 3.488441899896217e-07,
"logits/chosen": -0.36527490615844727,
"logits/rejected": -0.5059664845466614,
"logps/chosen": -498.8099670410156,
"logps/rejected": -819.1517333984375,
"loss": 0.2831,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.9226620197296143,
"rewards/margins": 3.1661388874053955,
"rewards/rejected": -7.088801383972168,
"step": 220
},
{
"epoch": 0.44337705523739146,
"grad_norm": 13.52248766735538,
"learning_rate": 3.408769926409574e-07,
"logits/chosen": -0.2923319637775421,
"logits/rejected": -0.5522831082344055,
"logps/chosen": -440.209716796875,
"logps/rejected": -757.4591064453125,
"loss": 0.2934,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.353994369506836,
"rewards/margins": 3.2045319080352783,
"rewards/rejected": -6.558526039123535,
"step": 225
},
{
"epoch": 0.4532298786871113,
"grad_norm": 25.763440445028106,
"learning_rate": 3.3280196989428263e-07,
"logits/chosen": -0.28175559639930725,
"logits/rejected": -0.5395274758338928,
"logps/chosen": -408.2548828125,
"logps/rejected": -689.34326171875,
"loss": 0.3182,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.0639290809631348,
"rewards/margins": 2.8385584354400635,
"rewards/rejected": -5.902487754821777,
"step": 230
},
{
"epoch": 0.46308270213683106,
"grad_norm": 25.454226685247495,
"learning_rate": 3.2462870275042367e-07,
"logits/chosen": -0.3556864261627197,
"logits/rejected": -0.5125774145126343,
"logps/chosen": -393.869140625,
"logps/rejected": -721.3883666992188,
"loss": 0.2842,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.9227001667022705,
"rewards/margins": 3.2367324829101562,
"rewards/rejected": -6.159432888031006,
"step": 235
},
{
"epoch": 0.4729355255865509,
"grad_norm": 17.441738861595127,
"learning_rate": 3.1636688877701806e-07,
"logits/chosen": -0.2837878167629242,
"logits/rejected": -0.4791291654109955,
"logps/chosen": -416.69989013671875,
"logps/rejected": -730.2517700195312,
"loss": 0.2738,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.118884563446045,
"rewards/margins": 3.163923978805542,
"rewards/rejected": -6.282808303833008,
"step": 240
},
{
"epoch": 0.4827883490362707,
"grad_norm": 15.400566409989613,
"learning_rate": 3.080263306023669e-07,
"logits/chosen": -0.30386677384376526,
"logits/rejected": -0.5350344777107239,
"logps/chosen": -433.35760498046875,
"logps/rejected": -742.5108642578125,
"loss": 0.2983,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.27720308303833,
"rewards/margins": 3.1466774940490723,
"rewards/rejected": -6.423880100250244,
"step": 245
},
{
"epoch": 0.49264117248599054,
"grad_norm": 13.243347184925653,
"learning_rate": 2.996169242846328e-07,
"logits/chosen": -0.2507760524749756,
"logits/rejected": -0.5307763814926147,
"logps/chosen": -420.93218994140625,
"logps/rejected": -727.508056640625,
"loss": 0.2864,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.2063565254211426,
"rewards/margins": 3.135401725769043,
"rewards/rejected": -6.3417582511901855,
"step": 250
},
{
"epoch": 0.5024939959357103,
"grad_norm": 15.905893971492043,
"learning_rate": 2.911486475701835e-07,
"logits/chosen": -0.3323180675506592,
"logits/rejected": -0.500321090221405,
"logps/chosen": -417.96533203125,
"logps/rejected": -669.329345703125,
"loss": 0.3038,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.0756640434265137,
"rewards/margins": 2.5683434009552,
"rewards/rejected": -5.644008159637451,
"step": 255
},
{
"epoch": 0.5123468193854301,
"grad_norm": 15.678554296764496,
"learning_rate": 2.826315480550129e-07,
"logits/chosen": -0.2549038529396057,
"logits/rejected": -0.5205506682395935,
"logps/chosen": -397.9513244628906,
"logps/rejected": -706.6505126953125,
"loss": 0.2709,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.9337005615234375,
"rewards/margins": 3.1359124183654785,
"rewards/rejected": -6.069613456726074,
"step": 260
},
{
"epoch": 0.52219964283515,
"grad_norm": 13.183965337617314,
"learning_rate": 2.740757312632854e-07,
"logits/chosen": -0.3502804636955261,
"logits/rejected": -0.47828468680381775,
"logps/chosen": -458.28009033203125,
"logps/rejected": -756.2743530273438,
"loss": 0.3101,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.547351837158203,
"rewards/margins": 3.0040035247802734,
"rewards/rejected": -6.551354885101318,
"step": 265
},
{
"epoch": 0.5320524662848698,
"grad_norm": 14.251066041226494,
"learning_rate": 2.654913486571487e-07,
"logits/chosen": -0.29742032289505005,
"logits/rejected": -0.512765109539032,
"logps/chosen": -433.20281982421875,
"logps/rejected": -742.1527099609375,
"loss": 0.269,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.3142356872558594,
"rewards/margins": 3.1091339588165283,
"rewards/rejected": -6.423369407653809,
"step": 270
},
{
"epoch": 0.5419052897345896,
"grad_norm": 15.970456495692506,
"learning_rate": 2.5688858559204053e-07,
"logits/chosen": -0.29328054189682007,
"logits/rejected": -0.5653128027915955,
"logps/chosen": -457.0283203125,
"logps/rejected": -802.5032958984375,
"loss": 0.2619,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.5228774547576904,
"rewards/margins": 3.504716396331787,
"rewards/rejected": -7.027594566345215,
"step": 275
},
{
"epoch": 0.5517581131843093,
"grad_norm": 30.315717109626856,
"learning_rate": 2.4827764923178246e-07,
"logits/chosen": -0.2728544771671295,
"logits/rejected": -0.5340480208396912,
"logps/chosen": -508.76715087890625,
"logps/rejected": -814.9278564453125,
"loss": 0.3282,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -4.023861408233643,
"rewards/margins": 3.080327272415161,
"rewards/rejected": -7.104189395904541,
"step": 280
},
{
"epoch": 0.5616109366340292,
"grad_norm": 12.181715177068432,
"learning_rate": 2.3966875643779667e-07,
"logits/chosen": -0.39192137122154236,
"logits/rejected": -0.48601895570755005,
"logps/chosen": -509.3650817871094,
"logps/rejected": -919.6329956054688,
"loss": 0.2728,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.043977737426758,
"rewards/margins": 4.052145957946777,
"rewards/rejected": -8.096123695373535,
"step": 285
},
{
"epoch": 0.571463760083749,
"grad_norm": 22.18619533848792,
"learning_rate": 2.3107212164681774e-07,
"logits/chosen": -0.2627003788948059,
"logits/rejected": -0.512112557888031,
"logps/chosen": -514.5509033203125,
"logps/rejected": -777.1829833984375,
"loss": 0.286,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.147472381591797,
"rewards/margins": 2.6902387142181396,
"rewards/rejected": -6.837711334228516,
"step": 290
},
{
"epoch": 0.5813165835334688,
"grad_norm": 20.397352654188396,
"learning_rate": 2.2249794475148019e-07,
"logits/chosen": -0.2886350750923157,
"logits/rejected": -0.533146858215332,
"logps/chosen": -465.264892578125,
"logps/rejected": -807.4385986328125,
"loss": 0.2676,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.6474337577819824,
"rewards/margins": 3.4217605590820312,
"rewards/rejected": -7.0691938400268555,
"step": 295
},
{
"epoch": 0.5911694069831886,
"grad_norm": 12.525676548560202,
"learning_rate": 2.1395639899816332e-07,
"logits/chosen": -0.35506710410118103,
"logits/rejected": -0.4460170865058899,
"logps/chosen": -424.7347106933594,
"logps/rejected": -713.3164672851562,
"loss": 0.2296,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.2564120292663574,
"rewards/margins": 2.842578411102295,
"rewards/rejected": -6.098990440368652,
"step": 300
},
{
"epoch": 0.6010222304329085,
"grad_norm": 16.20564210489265,
"learning_rate": 2.0545761891645177e-07,
"logits/chosen": -0.2978189289569855,
"logits/rejected": -0.5122548937797546,
"logps/chosen": -471.35693359375,
"logps/rejected": -824.7840576171875,
"loss": 0.2482,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.6659018993377686,
"rewards/margins": 3.483975887298584,
"rewards/rejected": -7.14987850189209,
"step": 305
},
{
"epoch": 0.6108750538826282,
"grad_norm": 16.59602026766405,
"learning_rate": 1.9701168829453305e-07,
"logits/chosen": -0.2841472625732422,
"logits/rejected": -0.49612703919410706,
"logps/chosen": -475.79559326171875,
"logps/rejected": -832.6185302734375,
"loss": 0.2832,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.748015880584717,
"rewards/margins": 3.5668914318084717,
"rewards/rejected": -7.314908027648926,
"step": 310
},
{
"epoch": 0.620727877332348,
"grad_norm": 14.834560333592078,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -0.2684577405452728,
"logits/rejected": -0.5325924158096313,
"logps/chosen": -491.4776306152344,
"logps/rejected": -837.3449096679688,
"loss": 0.2599,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.8538050651550293,
"rewards/margins": 3.5408568382263184,
"rewards/rejected": -7.394662380218506,
"step": 315
},
{
"epoch": 0.6305807007820678,
"grad_norm": 11.160506827485138,
"learning_rate": 1.8031838516385422e-07,
"logits/chosen": -0.2939426898956299,
"logits/rejected": -0.4827597141265869,
"logps/chosen": -524.61865234375,
"logps/rejected": -873.7683715820312,
"loss": 0.294,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -4.23654842376709,
"rewards/margins": 3.4801859855651855,
"rewards/rejected": -7.716734409332275,
"step": 320
},
{
"epoch": 0.6404335242317877,
"grad_norm": 25.16794488629472,
"learning_rate": 1.7209081923101472e-07,
"logits/chosen": -0.37951114773750305,
"logits/rejected": -0.5442458391189575,
"logps/chosen": -484.8412170410156,
"logps/rejected": -843.31494140625,
"loss": 0.2438,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.8335394859313965,
"rewards/margins": 3.537074565887451,
"rewards/rejected": -7.370614528656006,
"step": 325
},
{
"epoch": 0.6502863476815075,
"grad_norm": 15.675332902369199,
"learning_rate": 1.639556924093404e-07,
"logits/chosen": -0.31534910202026367,
"logits/rejected": -0.6055206656455994,
"logps/chosen": -469.186767578125,
"logps/rejected": -785.3204345703125,
"loss": 0.2862,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.6316142082214355,
"rewards/margins": 3.245419979095459,
"rewards/rejected": -6.8770341873168945,
"step": 330
},
{
"epoch": 0.6601391711312273,
"grad_norm": 12.550466940425805,
"learning_rate": 1.5592265701304114e-07,
"logits/chosen": -0.309912770986557,
"logits/rejected": -0.580001950263977,
"logps/chosen": -455.7469177246094,
"logps/rejected": -795.7465209960938,
"loss": 0.2543,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.550823211669922,
"rewards/margins": 3.412391185760498,
"rewards/rejected": -6.9632134437561035,
"step": 335
},
{
"epoch": 0.669991994580947,
"grad_norm": 14.379670239758621,
"learning_rate": 1.4800124422502334e-07,
"logits/chosen": -0.3434782028198242,
"logits/rejected": -0.5037192106246948,
"logps/chosen": -427.98486328125,
"logps/rejected": -760.0518798828125,
"loss": 0.2582,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.309481143951416,
"rewards/margins": 3.3087706565856934,
"rewards/rejected": -6.618251800537109,
"step": 340
},
{
"epoch": 0.6798448180306669,
"grad_norm": 16.948885121756778,
"learning_rate": 1.4020085278815743e-07,
"logits/chosen": -0.3305511176586151,
"logits/rejected": -0.510036289691925,
"logps/chosen": -473.0335998535156,
"logps/rejected": -838.19970703125,
"loss": 0.2595,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.7052788734436035,
"rewards/margins": 3.6906940937042236,
"rewards/rejected": -7.39597225189209,
"step": 345
},
{
"epoch": 0.6896976414803867,
"grad_norm": 13.672982554566653,
"learning_rate": 1.3253073785368545e-07,
"logits/chosen": -0.3359353244304657,
"logits/rejected": -0.599565863609314,
"logps/chosen": -450.7706604003906,
"logps/rejected": -766.9132080078125,
"loss": 0.2378,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.4409427642822266,
"rewards/margins": 3.247152328491211,
"rewards/rejected": -6.6880950927734375,
"step": 350
},
{
"epoch": 0.6995504649301065,
"grad_norm": 17.198800285063513,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -0.3707168996334076,
"logits/rejected": -0.5359587669372559,
"logps/chosen": -492.1787109375,
"logps/rejected": -895.4451904296875,
"loss": 0.266,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.8479511737823486,
"rewards/margins": 4.02894926071167,
"rewards/rejected": -7.876900672912598,
"step": 355
},
{
"epoch": 0.7094032883798264,
"grad_norm": 19.550687307980805,
"learning_rate": 1.1761757443482285e-07,
"logits/chosen": -0.2943348288536072,
"logits/rejected": -0.6027869582176208,
"logps/chosen": -481.8168029785156,
"logps/rejected": -860.5421142578125,
"loss": 0.2475,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.7917988300323486,
"rewards/margins": 3.795625686645508,
"rewards/rejected": -7.587424278259277,
"step": 360
},
{
"epoch": 0.7192561118295462,
"grad_norm": 19.931316668144966,
"learning_rate": 1.1039222039359644e-07,
"logits/chosen": -0.32548245787620544,
"logits/rejected": -0.5251120328903198,
"logps/chosen": -466.42352294921875,
"logps/rejected": -837.3107299804688,
"loss": 0.2503,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.647482395172119,
"rewards/margins": 3.7225258350372314,
"rewards/rejected": -7.370007514953613,
"step": 365
},
{
"epoch": 0.729108935279266,
"grad_norm": 27.992034585729865,
"learning_rate": 1.0333251074666608e-07,
"logits/chosen": -0.26019373536109924,
"logits/rejected": -0.5685330629348755,
"logps/chosen": -508.7705078125,
"logps/rejected": -834.8313598632812,
"loss": 0.3009,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.032122611999512,
"rewards/margins": 3.357773542404175,
"rewards/rejected": -7.389896392822266,
"step": 370
},
{
"epoch": 0.7389617587289857,
"grad_norm": 11.391526012410443,
"learning_rate": 9.644682182758304e-08,
"logits/chosen": -0.36888641119003296,
"logits/rejected": -0.544438362121582,
"logps/chosen": -501.13922119140625,
"logps/rejected": -866.8273315429688,
"loss": 0.267,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.018528938293457,
"rewards/margins": 3.660531997680664,
"rewards/rejected": -7.679060459136963,
"step": 375
},
{
"epoch": 0.7488145821787056,
"grad_norm": 15.381753957706216,
"learning_rate": 8.974332349459992e-08,
"logits/chosen": -0.3592904210090637,
"logits/rejected": -0.5230213403701782,
"logps/chosen": -498.76397705078125,
"logps/rejected": -873.2203369140625,
"loss": 0.2913,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.937748432159424,
"rewards/margins": 3.7432987689971924,
"rewards/rejected": -7.681046962738037,
"step": 380
},
{
"epoch": 0.7586674056284254,
"grad_norm": 12.706615805349994,
"learning_rate": 8.322996943714672e-08,
"logits/chosen": -0.3497922718524933,
"logits/rejected": -0.5577541589736938,
"logps/chosen": -512.2010498046875,
"logps/rejected": -885.4027099609375,
"loss": 0.2356,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.10722017288208,
"rewards/margins": 3.7454257011413574,
"rewards/rejected": -7.8526458740234375,
"step": 385
},
{
"epoch": 0.7685202290781452,
"grad_norm": 14.360275587084523,
"learning_rate": 7.691448773879256e-08,
"logits/chosen": -0.3780885934829712,
"logits/rejected": -0.5432588458061218,
"logps/chosen": -449.843505859375,
"logps/rejected": -840.5940551757812,
"loss": 0.2795,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.502520799636841,
"rewards/margins": 3.937730073928833,
"rewards/rejected": -7.440250396728516,
"step": 390
},
{
"epoch": 0.778373052527865,
"grad_norm": 15.407447239164402,
"learning_rate": 7.080437170788722e-08,
"logits/chosen": -0.38400599360466003,
"logits/rejected": -0.5888391733169556,
"logps/chosen": -466.9847717285156,
"logps/rejected": -768.0460205078125,
"loss": 0.2824,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.600980281829834,
"rewards/margins": 3.0612690448760986,
"rewards/rejected": -6.6622490882873535,
"step": 395
},
{
"epoch": 0.7882258759775849,
"grad_norm": 11.960557567104859,
"learning_rate": 6.490687098676332e-08,
"logits/chosen": -0.4111432433128357,
"logits/rejected": -0.6113660335540771,
"logps/chosen": -448.69500732421875,
"logps/rejected": -851.4993896484375,
"loss": 0.2753,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.424553394317627,
"rewards/margins": 4.026993751525879,
"rewards/rejected": -7.451546669006348,
"step": 400
},
{
"epoch": 0.7882258759775849,
"eval_logits/chosen": -1.0704303979873657,
"eval_logits/rejected": -0.814034640789032,
"eval_logps/chosen": -502.4184875488281,
"eval_logps/rejected": -705.5203857421875,
"eval_loss": 0.7013445496559143,
"eval_rewards/accuracies": 0.7059999704360962,
"eval_rewards/chosen": -4.094460487365723,
"eval_rewards/margins": 1.768728494644165,
"eval_rewards/rejected": -5.863188743591309,
"eval_runtime": 197.0588,
"eval_samples_per_second": 10.144,
"eval_steps_per_second": 1.269,
"step": 400
},
{
"epoch": 0.7980786994273046,
"grad_norm": 16.569681590429596,
"learning_rate": 5.9228982950048414e-08,
"logits/chosen": -0.3508976995944977,
"logits/rejected": -0.6730154752731323,
"logps/chosen": -446.61688232421875,
"logps/rejected": -841.1275634765625,
"loss": 0.2751,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.3948426246643066,
"rewards/margins": 4.001449108123779,
"rewards/rejected": -7.396292209625244,
"step": 405
},
{
"epoch": 0.8079315228770244,
"grad_norm": 30.74667877593499,
"learning_rate": 5.3777444402291345e-08,
"logits/chosen": -0.44255560636520386,
"logits/rejected": -0.5289443135261536,
"logps/chosen": -484.78363037109375,
"logps/rejected": -847.515625,
"loss": 0.2925,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.900998592376709,
"rewards/margins": 3.6134960651397705,
"rewards/rejected": -7.5144944190979,
"step": 410
},
{
"epoch": 0.8177843463267442,
"grad_norm": 23.687222476192307,
"learning_rate": 4.855872358475546e-08,
"logits/chosen": -0.3782200217247009,
"logits/rejected": -0.6156097650527954,
"logps/chosen": -456.51220703125,
"logps/rejected": -778.0684814453125,
"loss": 0.266,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.4957103729248047,
"rewards/margins": 3.254422664642334,
"rewards/rejected": -6.750133514404297,
"step": 415
},
{
"epoch": 0.8276371697764641,
"grad_norm": 19.649935116061837,
"learning_rate": 4.357901250086107e-08,
"logits/chosen": -0.42209166288375854,
"logits/rejected": -0.5967798233032227,
"logps/chosen": -440.5074157714844,
"logps/rejected": -706.137451171875,
"loss": 0.2765,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.3461577892303467,
"rewards/margins": 2.7582011222839355,
"rewards/rejected": -6.104359149932861,
"step": 420
},
{
"epoch": 0.8374899932261839,
"grad_norm": 20.031137924167396,
"learning_rate": 3.884421956938377e-08,
"logits/chosen": -0.3795103430747986,
"logits/rejected": -0.5693720579147339,
"logps/chosen": -415.31976318359375,
"logps/rejected": -812.3355712890625,
"loss": 0.2851,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.1916797161102295,
"rewards/margins": 3.95634126663208,
"rewards/rejected": -7.1480207443237305,
"step": 425
},
{
"epoch": 0.8473428166759037,
"grad_norm": 13.471904858040782,
"learning_rate": 3.435996261412591e-08,
"logits/chosen": -0.37054741382598877,
"logits/rejected": -0.5658844113349915,
"logps/chosen": -443.5189514160156,
"logps/rejected": -789.6598510742188,
"loss": 0.2174,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.431079864501953,
"rewards/margins": 3.494525909423828,
"rewards/rejected": -6.925606727600098,
"step": 430
},
{
"epoch": 0.8571956401256235,
"grad_norm": 41.43328285681965,
"learning_rate": 3.013156219837776e-08,
"logits/chosen": -0.42507949471473694,
"logits/rejected": -0.6270584464073181,
"logps/chosen": -437.37841796875,
"logps/rejected": -766.0833129882812,
"loss": 0.2904,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.347297191619873,
"rewards/margins": 3.312955856323242,
"rewards/rejected": -6.660252571105957,
"step": 435
},
{
"epoch": 0.8670484635753433,
"grad_norm": 14.214226148262764,
"learning_rate": 2.6164035312078447e-08,
"logits/chosen": -0.35758644342422485,
"logits/rejected": -0.5952532291412354,
"logps/chosen": -442.4962463378906,
"logps/rejected": -813.2142944335938,
"loss": 0.2391,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.3745460510253906,
"rewards/margins": 3.7038657665252686,
"rewards/rejected": -7.0784125328063965,
"step": 440
},
{
"epoch": 0.8769012870250631,
"grad_norm": 12.722193098483684,
"learning_rate": 2.2462089419165776e-08,
"logits/chosen": -0.3699408173561096,
"logits/rejected": -0.6010391116142273,
"logps/chosen": -450.82745361328125,
"logps/rejected": -841.6207885742188,
"loss": 0.2671,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.4586853981018066,
"rewards/margins": 3.931617259979248,
"rewards/rejected": -7.3903021812438965,
"step": 445
},
{
"epoch": 0.8867541104747829,
"grad_norm": 34.89251514032782,
"learning_rate": 1.9030116872178314e-08,
"logits/chosen": -0.42120829224586487,
"logits/rejected": -0.5930547118186951,
"logps/chosen": -482.32049560546875,
"logps/rejected": -877.3797607421875,
"loss": 0.2528,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.762233018875122,
"rewards/margins": 3.9991488456726074,
"rewards/rejected": -7.761382102966309,
"step": 450
},
{
"epoch": 0.8966069339245027,
"grad_norm": 11.63474736568058,
"learning_rate": 1.5872189700736337e-08,
"logits/chosen": -0.3359231948852539,
"logits/rejected": -0.580034613609314,
"logps/chosen": -476.1517028808594,
"logps/rejected": -840.4141845703125,
"loss": 0.2568,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.7410130500793457,
"rewards/margins": 3.678678512573242,
"rewards/rejected": -7.419691562652588,
"step": 455
},
{
"epoch": 0.9064597573742226,
"grad_norm": 17.200657346282192,
"learning_rate": 1.2992054780085692e-08,
"logits/chosen": -0.42435139417648315,
"logits/rejected": -0.5707032084465027,
"logps/chosen": -480.90673828125,
"logps/rejected": -864.6921997070312,
"loss": 0.2681,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.753577470779419,
"rewards/margins": 3.8412506580352783,
"rewards/rejected": -7.594828128814697,
"step": 460
},
{
"epoch": 0.9163125808239424,
"grad_norm": 15.168319053661703,
"learning_rate": 1.0393129385436823e-08,
"logits/chosen": -0.41962409019470215,
"logits/rejected": -0.5582268238067627,
"logps/chosen": -480.22650146484375,
"logps/rejected": -835.0930786132812,
"loss": 0.2639,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.7359073162078857,
"rewards/margins": 3.5815021991729736,
"rewards/rejected": -7.317408561706543,
"step": 465
},
{
"epoch": 0.9261654042736621,
"grad_norm": 14.145988270109239,
"learning_rate": 8.078497137373242e-09,
"logits/chosen": -0.3046739399433136,
"logits/rejected": -0.6777099967002869,
"logps/chosen": -452.22406005859375,
"logps/rejected": -792.261474609375,
"loss": 0.2466,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.451488494873047,
"rewards/margins": 3.4896602630615234,
"rewards/rejected": -6.941148281097412,
"step": 470
},
{
"epoch": 0.936018227723382,
"grad_norm": 17.26395488122693,
"learning_rate": 6.0509043431410945e-09,
"logits/chosen": -0.47448891401290894,
"logits/rejected": -0.5780371427536011,
"logps/chosen": -473.33148193359375,
"logps/rejected": -845.0810546875,
"loss": 0.2735,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.652672290802002,
"rewards/margins": 3.7301712036132812,
"rewards/rejected": -7.382843971252441,
"step": 475
},
{
"epoch": 0.9458710511731018,
"grad_norm": 13.502447902766075,
"learning_rate": 4.312756738160145e-09,
"logits/chosen": -0.4307557940483093,
"logits/rejected": -0.609841525554657,
"logps/chosen": -455.236572265625,
"logps/rejected": -856.6760864257812,
"loss": 0.2483,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.540134906768799,
"rewards/margins": 3.9815449714660645,
"rewards/rejected": -7.521679878234863,
"step": 480
},
{
"epoch": 0.9557238746228216,
"grad_norm": 14.00663655974014,
"learning_rate": 2.8661166316229223e-09,
"logits/chosen": -0.34750238060951233,
"logits/rejected": -0.5667217969894409,
"logps/chosen": -460.013671875,
"logps/rejected": -754.5597534179688,
"loss": 0.2729,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.5511062145233154,
"rewards/margins": 2.9699506759643555,
"rewards/rejected": -6.52105712890625,
"step": 485
},
{
"epoch": 0.9655766980725414,
"grad_norm": 14.399934375372629,
"learning_rate": 1.7127004595681727e-09,
"logits/chosen": -0.37273770570755005,
"logits/rejected": -0.6115278005599976,
"logps/chosen": -470.6495056152344,
"logps/rejected": -818.9450073242188,
"loss": 0.2724,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.666602611541748,
"rewards/margins": 3.5529751777648926,
"rewards/rejected": -7.219576835632324,
"step": 490
},
{
"epoch": 0.9754295215222613,
"grad_norm": 13.760376876126982,
"learning_rate": 8.538767483325383e-10,
"logits/chosen": -0.3895708918571472,
"logits/rejected": -0.6080259084701538,
"logps/chosen": -463.2498474121094,
"logps/rejected": -805.6424560546875,
"loss": 0.2489,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.570629119873047,
"rewards/margins": 3.4926178455352783,
"rewards/rejected": -7.063246726989746,
"step": 495
},
{
"epoch": 0.9852823449719811,
"grad_norm": 23.653347787632313,
"learning_rate": 2.9066449079634404e-10,
"logits/chosen": -0.37257179617881775,
"logits/rejected": -0.5408270955085754,
"logps/chosen": -501.0938415527344,
"logps/rejected": -795.1988525390625,
"loss": 0.2689,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.9710209369659424,
"rewards/margins": 2.9601473808288574,
"rewards/rejected": -6.931168556213379,
"step": 500
},
{
"epoch": 0.9951351684217008,
"grad_norm": 14.616427588956885,
"learning_rate": 2.3731937350224273e-11,
"logits/chosen": -0.4466114044189453,
"logits/rejected": -0.6212998032569885,
"logps/chosen": -427.31536865234375,
"logps/rejected": -899.2130126953125,
"loss": 0.268,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.2273106575012207,
"rewards/margins": 4.719491481781006,
"rewards/rejected": -7.946801662445068,
"step": 505
},
{
"epoch": 0.9990762978015888,
"step": 507,
"total_flos": 0.0,
"train_loss": 0.3691282767280789,
"train_runtime": 28319.5283,
"train_samples_per_second": 2.294,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 507,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}