diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9982631930527722, + "epoch": 0.9999333733093477, "eval_steps": 400, - "global_step": 467, + "global_step": 469, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0021376085504342017, - "grad_norm": 3.826100473605733, + "epoch": 0.0021320541008728097, + "grad_norm": 4.17070478980581, "learning_rate": 1.0638297872340425e-08, - "logits/chosen": -0.8263033032417297, - "logits/rejected": -0.9354065656661987, - "logps/chosen": -160.1599884033203, - "logps/rejected": -148.3292999267578, + "logits/chosen": -0.4388880133628845, + "logits/rejected": -0.6813962459564209, + "logps/chosen": -137.1171112060547, + "logps/rejected": -114.13969421386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,1428 +24,1428 @@ "step": 1 }, { - "epoch": 0.01068804275217101, - "grad_norm": 3.875305608004963, + "epoch": 0.010660270504364048, + "grad_norm": 3.7299717491618436, "learning_rate": 5.3191489361702123e-08, - "logits/chosen": -0.8571268320083618, - "logits/rejected": -0.9861307144165039, - "logps/chosen": -152.2813720703125, - "logps/rejected": -132.81561279296875, - "loss": 0.6933, - "rewards/accuracies": 0.296875, - "rewards/chosen": -0.0004895668826065958, - "rewards/margins": -0.0007417945889756083, - "rewards/rejected": 0.00025222758995369077, + "logits/chosen": -0.4889238774776459, + "logits/rejected": -0.6665000319480896, + "logps/chosen": -169.8695068359375, + "logps/rejected": -153.95947265625, + "loss": 0.6932, + "rewards/accuracies": 0.3671875, + "rewards/chosen": 0.00029664667090401053, + "rewards/margins": -0.00023018479987513274, + "rewards/rejected": 0.0005268314271233976, "step": 5 }, { - "epoch": 0.02137608550434202, - "grad_norm": 3.687586969188683, + "epoch": 0.021320541008728097, + "grad_norm": 3.95978205732512, "learning_rate": 1.0638297872340425e-07, - "logits/chosen": -0.8715337514877319, - "logits/rejected": -1.0048949718475342, - "logps/chosen": -164.49298095703125, - "logps/rejected": -148.5006103515625, - "loss": 0.6932, - "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -0.00123710953630507, - "rewards/margins": -0.0010025978554040194, - "rewards/rejected": -0.00023451172455679625, + "logits/chosen": -0.46806925535202026, + "logits/rejected": -0.6404483318328857, + "logps/chosen": -160.8107147216797, + "logps/rejected": -149.25921630859375, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0006372839561663568, + "rewards/margins": 0.0015358469681814313, + "rewards/rejected": -0.0008985629538074136, "step": 10 }, { - "epoch": 0.03206412825651302, - "grad_norm": 3.630084633673227, + "epoch": 0.03198081151309214, + "grad_norm": 4.070738919050114, "learning_rate": 1.5957446808510638e-07, - "logits/chosen": -0.854651927947998, - "logits/rejected": -0.9930330514907837, - "logps/chosen": -158.09552001953125, - "logps/rejected": -141.2473602294922, + "logits/chosen": -0.5198644399642944, + "logits/rejected": -0.7026724219322205, + "logps/chosen": -148.3934783935547, + "logps/rejected": -137.8568878173828, "loss": 0.6932, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.00042637778096832335, - "rewards/margins": 7.524692773586139e-05, - "rewards/rejected": -0.0005016247159801424, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.00037692085606977344, + "rewards/margins": 9.87994353636168e-05, + "rewards/rejected": 0.00027812132611870766, "step": 15 }, { - "epoch": 0.04275217100868404, - "grad_norm": 3.672762542020385, + "epoch": 0.04264108201745619, + "grad_norm": 4.076698141198564, "learning_rate": 2.127659574468085e-07, - "logits/chosen": -0.9381202459335327, - "logits/rejected": -1.0677807331085205, - "logps/chosen": -152.1192169189453, - "logps/rejected": -135.05801391601562, - "loss": 0.6931, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": 0.0005861429963260889, - "rewards/margins": 0.0002767331898212433, - "rewards/rejected": 0.0003094098065048456, + "logits/chosen": -0.5080031156539917, + "logits/rejected": -0.6844709515571594, + "logps/chosen": -163.26565551757812, + "logps/rejected": -144.93130493164062, + "loss": 0.6929, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0008511164924129844, + "rewards/margins": 0.0010705896420404315, + "rewards/rejected": -0.00021947314962744713, "step": 20 }, { - "epoch": 0.053440213760855046, - "grad_norm": 3.632497460320964, + "epoch": 0.05330135252182024, + "grad_norm": 4.091883356232605, "learning_rate": 2.659574468085106e-07, - "logits/chosen": -0.8955503702163696, - "logits/rejected": -1.0264657735824585, - "logps/chosen": -135.39430236816406, - "logps/rejected": -122.80595397949219, - "loss": 0.6928, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.0014580696588382125, - "rewards/margins": 0.000883122906088829, - "rewards/rejected": 0.0005749467527493834, + "logits/chosen": -0.45363473892211914, + "logits/rejected": -0.6415150761604309, + "logps/chosen": -160.65203857421875, + "logps/rejected": -139.57582092285156, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0009880407014861703, + "rewards/margins": 0.0012083369074389338, + "rewards/rejected": -0.00022029613319318742, "step": 25 }, { - "epoch": 0.06412825651302605, - "grad_norm": 4.8211615510882515, + "epoch": 0.06396162302618429, + "grad_norm": 4.4267622202574675, "learning_rate": 3.1914893617021275e-07, - "logits/chosen": -0.9184719920158386, - "logits/rejected": -1.0590474605560303, - "logps/chosen": -142.16336059570312, - "logps/rejected": -123.8579330444336, - "loss": 0.6922, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.003047009464353323, - "rewards/margins": 0.0016460716724395752, - "rewards/rejected": 0.001400937675498426, + "logits/chosen": -0.5177901983261108, + "logits/rejected": -0.6321993470191956, + "logps/chosen": -165.01699829101562, + "logps/rejected": -151.71261596679688, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0023814309388399124, + "rewards/margins": 0.002116392133757472, + "rewards/rejected": 0.0002650389797054231, "step": 30 }, { - "epoch": 0.07481629926519706, - "grad_norm": 3.6575310645813537, + "epoch": 0.07462189353054834, + "grad_norm": 4.269424985466007, "learning_rate": 3.7234042553191484e-07, - "logits/chosen": -0.9128482937812805, - "logits/rejected": -1.0003552436828613, - "logps/chosen": -158.94891357421875, - "logps/rejected": -145.61253356933594, - "loss": 0.6915, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.006963816471397877, - "rewards/margins": 0.0035113028716295958, - "rewards/rejected": 0.0034525140654295683, + "logits/chosen": -0.4782675802707672, + "logits/rejected": -0.7104529738426208, + "logps/chosen": -163.6421356201172, + "logps/rejected": -143.2295379638672, + "loss": 0.6913, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004739758092910051, + "rewards/margins": 0.0038230004720389843, + "rewards/rejected": 0.000916757620871067, "step": 35 }, { - "epoch": 0.08550434201736808, - "grad_norm": 3.6225727516042956, + "epoch": 0.08528216403491239, + "grad_norm": 4.2880363073067365, "learning_rate": 4.25531914893617e-07, - "logits/chosen": -0.907701313495636, - "logits/rejected": -1.0581016540527344, - "logps/chosen": -148.80404663085938, - "logps/rejected": -133.12960815429688, - "loss": 0.6906, + "logits/chosen": -0.5303796529769897, + "logits/rejected": -0.7106837630271912, + "logps/chosen": -174.71463012695312, + "logps/rejected": -153.29507446289062, + "loss": 0.6903, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.009939969517290592, - "rewards/margins": 0.006524121854454279, - "rewards/rejected": 0.003415847197175026, + "rewards/chosen": 0.008925501257181168, + "rewards/margins": 0.006593695841729641, + "rewards/rejected": 0.0023318054154515266, "step": 40 }, { - "epoch": 0.09619238476953908, - "grad_norm": 3.796328740259195, + "epoch": 0.09594243453927644, + "grad_norm": 4.016438849908063, "learning_rate": 4.787234042553192e-07, - "logits/chosen": -0.8742364645004272, - "logits/rejected": -1.0144312381744385, - "logps/chosen": -148.868408203125, - "logps/rejected": -135.29483032226562, - "loss": 0.6893, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.015893535688519478, - "rewards/margins": 0.009248056448996067, - "rewards/rejected": 0.006645479239523411, + "logits/chosen": -0.522494375705719, + "logits/rejected": -0.7226734757423401, + "logps/chosen": -165.866455078125, + "logps/rejected": -144.34194946289062, + "loss": 0.6886, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.010274471715092659, + "rewards/margins": 0.011223495937883854, + "rewards/rejected": -0.0009490237571299076, "step": 45 }, { - "epoch": 0.10688042752171009, - "grad_norm": 3.8854341075784395, - "learning_rate": 4.999370587356267e-07, - "logits/chosen": -0.9241711497306824, - "logits/rejected": -1.0241447687149048, - "logps/chosen": -130.87457275390625, - "logps/rejected": -120.37091064453125, - "loss": 0.6875, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.009933440946042538, - "rewards/margins": 0.010043011978268623, - "rewards/rejected": -0.00010956949699902907, + "epoch": 0.10660270504364049, + "grad_norm": 4.3216596095930235, + "learning_rate": 4.999376538968061e-07, + "logits/chosen": -0.5761003494262695, + "logits/rejected": -0.7390087842941284, + "logps/chosen": -161.60655212402344, + "logps/rejected": -144.6966552734375, + "loss": 0.6868, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.009824760258197784, + "rewards/margins": 0.014007952995598316, + "rewards/rejected": -0.004183194134384394, "step": 50 }, { - "epoch": 0.11756847027388109, - "grad_norm": 4.0170807373088335, - "learning_rate": 4.995525324419337e-07, - "logits/chosen": -0.948462963104248, - "logits/rejected": -1.0943880081176758, - "logps/chosen": -136.25076293945312, - "logps/rejected": -123.91798400878906, - "loss": 0.6848, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.008445949293673038, - "rewards/margins": 0.015471531078219414, - "rewards/rejected": -0.0070255836471915245, + "epoch": 0.11726297554800454, + "grad_norm": 4.305829979355763, + "learning_rate": 4.99556762539107e-07, + "logits/chosen": -0.5275800824165344, + "logits/rejected": -0.7155976891517639, + "logps/chosen": -172.5618133544922, + "logps/rejected": -159.7906494140625, + "loss": 0.6842, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.007245404180139303, + "rewards/margins": 0.016996894031763077, + "rewards/rejected": -0.009751489385962486, "step": 55 }, { - "epoch": 0.1282565130260521, - "grad_norm": 3.8156751238476487, - "learning_rate": 4.988189843662815e-07, - "logits/chosen": -0.9715820550918579, - "logits/rejected": -1.0640943050384521, - "logps/chosen": -155.99337768554688, - "logps/rejected": -142.90029907226562, - "loss": 0.6822, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.000325032597174868, - "rewards/margins": 0.017921900376677513, - "rewards/rejected": -0.018246930092573166, + "epoch": 0.12792324605236857, + "grad_norm": 3.919812332975093, + "learning_rate": 4.988301435819852e-07, + "logits/chosen": -0.528161883354187, + "logits/rejected": -0.7242938280105591, + "logps/chosen": -163.2517547607422, + "logps/rejected": -152.65904235839844, + "loss": 0.6833, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.5745400711894035e-05, + "rewards/margins": 0.017660435289144516, + "rewards/rejected": -0.01770617999136448, "step": 60 }, { - "epoch": 0.13894455577822312, - "grad_norm": 4.018774365744506, - "learning_rate": 4.977374404419837e-07, - "logits/chosen": -0.9845069646835327, - "logits/rejected": -1.104178786277771, - "logps/chosen": -161.20130920410156, - "logps/rejected": -142.3312530517578, - "loss": 0.6777, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.004852614365518093, - "rewards/margins": 0.03587757423520088, - "rewards/rejected": -0.04073018953204155, + "epoch": 0.13858351655673262, + "grad_norm": 4.26787115297138, + "learning_rate": 4.977588036590624e-07, + "logits/chosen": -0.6125078797340393, + "logits/rejected": -0.7909122109413147, + "logps/chosen": -157.07858276367188, + "logps/rejected": -142.1239776611328, + "loss": 0.6787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011157763190567493, + "rewards/margins": 0.029583096504211426, + "rewards/rejected": -0.04074086248874664, "step": 65 }, { - "epoch": 0.14963259853039412, - "grad_norm": 4.573283330095081, - "learning_rate": 4.963094133060148e-07, - "logits/chosen": -1.0565409660339355, - "logits/rejected": -1.1621615886688232, - "logps/chosen": -154.70042419433594, - "logps/rejected": -143.05984497070312, - "loss": 0.6793, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.040705859661102295, - "rewards/margins": 0.027652058750391006, - "rewards/rejected": -0.068357914686203, + "epoch": 0.14924378706109667, + "grad_norm": 4.32141025222622, + "learning_rate": 4.96344226968867e-07, + "logits/chosen": -0.6417307257652283, + "logits/rejected": -0.8415061235427856, + "logps/chosen": -177.39974975585938, + "logps/rejected": -156.98171997070312, + "loss": 0.6761, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.018069323152303696, + "rewards/margins": 0.04366481304168701, + "rewards/rejected": -0.061734139919281006, "step": 70 }, { - "epoch": 0.16032064128256512, - "grad_norm": 4.439152598906208, - "learning_rate": 4.945369001834514e-07, - "logits/chosen": -1.0577439069747925, - "logits/rejected": -1.1740387678146362, - "logps/chosen": -175.05032348632812, - "logps/rejected": -164.45143127441406, - "loss": 0.6754, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.052828311920166016, - "rewards/margins": 0.039229657500982285, - "rewards/rejected": -0.0920579731464386, + "epoch": 0.15990405756546072, + "grad_norm": 4.745633736375277, + "learning_rate": 4.945883732186751e-07, + "logits/chosen": -0.6420779824256897, + "logits/rejected": -0.8456922769546509, + "logps/chosen": -175.96359252929688, + "logps/rejected": -160.39553833007812, + "loss": 0.6753, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.049303699284791946, + "rewards/margins": 0.04190283641219139, + "rewards/rejected": -0.09120653569698334, "step": 75 }, { - "epoch": 0.17100868403473615, - "grad_norm": 4.317792006832961, - "learning_rate": 4.924223800941717e-07, - "logits/chosen": -0.9827049374580383, - "logits/rejected": -1.0909287929534912, - "logps/chosen": -163.2997283935547, - "logps/rejected": -151.0543975830078, - "loss": 0.6686, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.07374860346317291, - "rewards/margins": 0.049334120005369186, - "rewards/rejected": -0.1230827197432518, + "epoch": 0.17056432806982477, + "grad_norm": 4.4046157142215705, + "learning_rate": 4.924936749095969e-07, + "logits/chosen": -0.6506496071815491, + "logits/rejected": -0.8331305384635925, + "logps/chosen": -170.9277801513672, + "logps/rejected": -157.8987579345703, + "loss": 0.6764, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07082077115774155, + "rewards/margins": 0.044193871319293976, + "rewards/rejected": -0.11501463502645493, "step": 80 }, { - "epoch": 0.18169672678690715, - "grad_norm": 4.228783145981308, - "learning_rate": 4.899688103857222e-07, - "logits/chosen": -1.0476535558700562, - "logits/rejected": -1.1739842891693115, - "logps/chosen": -160.54885864257812, - "logps/rejected": -150.99081420898438, - "loss": 0.6679, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.11546945571899414, - "rewards/margins": 0.05317140370607376, - "rewards/rejected": -0.1686408668756485, + "epoch": 0.18122459857418882, + "grad_norm": 5.024858873122934, + "learning_rate": 4.900630339666717e-07, + "logits/chosen": -0.6046501994132996, + "logits/rejected": -0.879498302936554, + "logps/chosen": -172.4420928955078, + "logps/rejected": -155.1177215576172, + "loss": 0.6708, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.08710388094186783, + "rewards/margins": 0.05091012641787529, + "rewards/rejected": -0.13801398873329163, "step": 85 }, { - "epoch": 0.19238476953907815, - "grad_norm": 4.653085185459979, - "learning_rate": 4.871796225971999e-07, - "logits/chosen": -1.037262201309204, - "logits/rejected": -1.1481332778930664, - "logps/chosen": -164.3929443359375, - "logps/rejected": -152.60769653320312, - "loss": 0.6634, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.12224151939153671, - "rewards/margins": 0.06964044272899628, - "rewards/rejected": -0.1918819695711136, + "epoch": 0.19188486907855287, + "grad_norm": 4.906760943250142, + "learning_rate": 4.872998177186375e-07, + "logits/chosen": -0.6804112195968628, + "logits/rejected": -0.9185736775398254, + "logps/chosen": -173.2130126953125, + "logps/rejected": -157.01849365234375, + "loss": 0.6656, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.09927495568990707, + "rewards/margins": 0.056527040898799896, + "rewards/rejected": -0.15580201148986816, "step": 90 }, { - "epoch": 0.20307281229124916, - "grad_norm": 4.9559822610401625, - "learning_rate": 4.840587176599343e-07, - "logits/chosen": -0.9877308011054993, - "logits/rejected": -1.11491060256958, - "logps/chosen": -161.76840209960938, - "logps/rejected": -152.807861328125, - "loss": 0.6598, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.15045811235904694, - "rewards/margins": 0.07553653419017792, - "rewards/rejected": -0.22599467635154724, + "epoch": 0.20254513958291692, + "grad_norm": 4.854322224106784, + "learning_rate": 4.842078542329463e-07, + "logits/chosen": -0.6420129537582397, + "logits/rejected": -0.8440741300582886, + "logps/chosen": -172.54263305664062, + "logps/rejected": -160.012939453125, + "loss": 0.6636, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11956344544887543, + "rewards/margins": 0.0651877298951149, + "rewards/rejected": -0.18475116789340973, "step": 95 }, { - "epoch": 0.21376085504342018, - "grad_norm": 4.987917714349524, - "learning_rate": 4.806104604416823e-07, - "logits/chosen": -1.0010203123092651, - "logits/rejected": -1.1507465839385986, - "logps/chosen": -168.78060913085938, - "logps/rejected": -158.51351928710938, - "loss": 0.6522, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.18534889817237854, - "rewards/margins": 0.08616362512111664, - "rewards/rejected": -0.27151253819465637, + "epoch": 0.21320541008728097, + "grad_norm": 5.020847639274401, + "learning_rate": 4.807914270124876e-07, + "logits/chosen": -0.6584053635597229, + "logits/rejected": -0.8369486927986145, + "logps/chosen": -158.8271484375, + "logps/rejected": -151.04791259765625, + "loss": 0.6622, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13495273888111115, + "rewards/margins": 0.06916390359401703, + "rewards/rejected": -0.20411665737628937, "step": 100 }, { - "epoch": 0.22444889779559118, - "grad_norm": 5.410340181633215, - "learning_rate": 4.768396736419662e-07, - "logits/chosen": -1.1048784255981445, - "logits/rejected": -1.2256269454956055, - "logps/chosen": -171.53135681152344, - "logps/rejected": -164.0275421142578, - "loss": 0.6486, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.2029183804988861, - "rewards/margins": 0.10998847335577011, - "rewards/rejected": -0.3129068613052368, + "epoch": 0.22386568059164502, + "grad_norm": 5.1518931973507875, + "learning_rate": 4.770552690613665e-07, + "logits/chosen": -0.7008846998214722, + "logits/rejected": -0.9158443212509155, + "logps/chosen": -181.6995391845703, + "logps/rejected": -168.43638610839844, + "loss": 0.6531, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.14559721946716309, + "rewards/margins": 0.08520212024450302, + "rewards/rejected": -0.2307993471622467, "step": 105 }, { - "epoch": 0.23513694054776219, - "grad_norm": 5.4430280963955315, - "learning_rate": 4.7275163104709194e-07, - "logits/chosen": -1.0222933292388916, - "logits/rejected": -1.1374541521072388, - "logps/chosen": -178.66830444335938, - "logps/rejected": -177.40155029296875, - "loss": 0.6397, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.22815780341625214, - "rewards/margins": 0.12410934269428253, - "rewards/rejected": -0.35226717591285706, + "epoch": 0.23452595109600907, + "grad_norm": 4.93222468686984, + "learning_rate": 4.730045563279577e-07, + "logits/chosen": -0.7327751517295837, + "logits/rejected": -0.9426084756851196, + "logps/chosen": -184.8527069091797, + "logps/rejected": -169.2633056640625, + "loss": 0.6536, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18423308432102203, + "rewards/margins": 0.08043086528778076, + "rewards/rejected": -0.2646639347076416, "step": 110 }, { - "epoch": 0.2458249832999332, - "grad_norm": 5.494377786652467, - "learning_rate": 4.683520501542824e-07, - "logits/chosen": -1.1115857362747192, - "logits/rejected": -1.2163774967193604, - "logps/chosen": -181.41661071777344, - "logps/rejected": -171.02581787109375, - "loss": 0.6425, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.2546471953392029, - "rewards/margins": 0.12287018448114395, - "rewards/rejected": -0.3775174021720886, + "epoch": 0.24518622160037312, + "grad_norm": 5.321285521863998, + "learning_rate": 4.6864490053432e-07, + "logits/chosen": -0.7645201683044434, + "logits/rejected": -0.9136350750923157, + "logps/chosen": -184.50399780273438, + "logps/rejected": -182.33792114257812, + "loss": 0.6467, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1797805279493332, + "rewards/margins": 0.10915856063365936, + "rewards/rejected": -0.28893908858299255, "step": 115 }, { - "epoch": 0.2565130260521042, - "grad_norm": 5.760997535262556, - "learning_rate": 4.636470841752404e-07, - "logits/chosen": -1.0969531536102295, - "logits/rejected": -1.206673264503479, - "logps/chosen": -180.3526153564453, - "logps/rejected": -174.21571350097656, - "loss": 0.6267, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2945270240306854, - "rewards/margins": 0.11502014100551605, - "rewards/rejected": -0.40954717993736267, + "epoch": 0.25584649210473714, + "grad_norm": 5.62424898876036, + "learning_rate": 4.6398234140190413e-07, + "logits/chosen": -0.7312062978744507, + "logits/rejected": -0.9342387318611145, + "logps/chosen": -189.24227905273438, + "logps/rejected": -181.2150115966797, + "loss": 0.6404, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.22928175330162048, + "rewards/margins": 0.1005432978272438, + "rewards/rejected": -0.3298250436782837, "step": 120 }, { - "epoch": 0.26720106880427524, - "grad_norm": 5.859248874617503, - "learning_rate": 4.5864331343032565e-07, - "logits/chosen": -1.1240909099578857, - "logits/rejected": -1.1898443698883057, - "logps/chosen": -182.5801544189453, - "logps/rejected": -175.3216094970703, - "loss": 0.6345, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.32949692010879517, - "rewards/margins": 0.12121868133544922, - "rewards/rejected": -0.4507156014442444, + "epoch": 0.2665067626091012, + "grad_norm": 5.848008736661893, + "learning_rate": 4.5902333828432416e-07, + "logits/chosen": -0.7402585744857788, + "logits/rejected": -0.9469724893569946, + "logps/chosen": -188.2518768310547, + "logps/rejected": -183.68360900878906, + "loss": 0.6314, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2475469410419464, + "rewards/margins": 0.15488557517528534, + "rewards/rejected": -0.40243250131607056, "step": 125 }, { - "epoch": 0.27788911155644624, - "grad_norm": 6.214747382571571, - "learning_rate": 4.533477361453819e-07, - "logits/chosen": -1.1929118633270264, - "logits/rejected": -1.2678076028823853, - "logps/chosen": -181.35247802734375, - "logps/rejected": -181.50747680664062, - "loss": 0.6264, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.34508901834487915, - "rewards/margins": 0.15236955881118774, - "rewards/rejected": -0.4974585473537445, + "epoch": 0.27716703311346524, + "grad_norm": 5.62435510068984, + "learning_rate": 4.537747612187848e-07, + "logits/chosen": -0.6827915906906128, + "logits/rejected": -0.9053131341934204, + "logps/chosen": -176.27835083007812, + "logps/rejected": -177.09768676757812, + "loss": 0.6331, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2656404376029968, + "rewards/margins": 0.14400802552700043, + "rewards/rejected": -0.40964850783348083, "step": 130 }, { - "epoch": 0.28857715430861725, - "grad_norm": 6.108529751482958, - "learning_rate": 4.4776775866408533e-07, - "logits/chosen": -1.0369148254394531, - "logits/rejected": -1.185436725616455, - "logps/chosen": -190.52249145507812, - "logps/rejected": -190.87527465820312, - "loss": 0.6161, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.4117358326911926, - "rewards/margins": 0.18063664436340332, - "rewards/rejected": -0.5923724174499512, + "epoch": 0.2878273036178293, + "grad_norm": 5.883733263408107, + "learning_rate": 4.4824388140856194e-07, + "logits/chosen": -0.813726544380188, + "logits/rejected": -0.9863494634628296, + "logps/chosen": -193.75765991210938, + "logps/rejected": -192.6829833984375, + "loss": 0.6258, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.32872524857521057, + "rewards/margins": 0.16848836839199066, + "rewards/rejected": -0.49721360206604004, "step": 135 }, { - "epoch": 0.29926519706078825, - "grad_norm": 7.289200064343844, - "learning_rate": 4.4191118508950277e-07, - "logits/chosen": -1.1010531187057495, - "logits/rejected": -1.1958372592926025, - "logps/chosen": -213.7646026611328, - "logps/rejected": -212.4058074951172, - "loss": 0.6139, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5470473766326904, - "rewards/margins": 0.19253471493721008, - "rewards/rejected": -0.7395820617675781, + "epoch": 0.29848757412219334, + "grad_norm": 6.222829798884928, + "learning_rate": 4.4243836114972003e-07, + "logits/chosen": -0.7957421541213989, + "logits/rejected": -0.9675641059875488, + "logps/chosen": -185.958251953125, + "logps/rejected": -190.2810516357422, + "loss": 0.6259, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.36352983117103577, + "rewards/margins": 0.1679573506116867, + "rewards/rejected": -0.5314871072769165, "step": 140 }, { - "epoch": 0.30995323981295925, - "grad_norm": 7.335252146924061, - "learning_rate": 4.357862063693485e-07, - "logits/chosen": -1.0665308237075806, - "logits/rejected": -1.2325011491775513, - "logps/chosen": -211.33517456054688, - "logps/rejected": -214.68771362304688, - "loss": 0.6012, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5864853858947754, - "rewards/margins": 0.211782306432724, - "rewards/rejected": -0.7982677221298218, + "epoch": 0.3091478446265574, + "grad_norm": 6.026406045285321, + "learning_rate": 4.3636624321602354e-07, + "logits/chosen": -0.7669280171394348, + "logits/rejected": -1.0013420581817627, + "logps/chosen": -199.62496948242188, + "logps/rejected": -198.5312957763672, + "loss": 0.6139, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.41982731223106384, + "rewards/margins": 0.1919022500514984, + "rewards/rejected": -0.611729621887207, "step": 145 }, { - "epoch": 0.32064128256513025, - "grad_norm": 6.469286105373321, - "learning_rate": 4.294013888402029e-07, - "logits/chosen": -1.146533727645874, - "logits/rejected": -1.2366269826889038, - "logps/chosen": -207.74771118164062, - "logps/rejected": -216.02493286132812, - "loss": 0.6065, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.6006289720535278, - "rewards/margins": 0.21870818734169006, - "rewards/rejected": -0.8193371891975403, + "epoch": 0.31980811513092144, + "grad_norm": 6.938366915650047, + "learning_rate": 4.300359397167469e-07, + "logits/chosen": -0.78579181432724, + "logits/rejected": -1.0266155004501343, + "logps/chosen": -190.5222625732422, + "logps/rejected": -191.94302368164062, + "loss": 0.6191, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4288663864135742, + "rewards/margins": 0.1750030219554901, + "rewards/rejected": -0.6038694381713867, "step": 150 }, { - "epoch": 0.33132932531730125, - "grad_norm": 7.690788060788228, - "learning_rate": 4.227656622467162e-07, - "logits/chosen": -1.085301399230957, - "logits/rejected": -1.1931884288787842, - "logps/chosen": -204.1922607421875, - "logps/rejected": -215.85549926757812, - "loss": 0.5975, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.6167720556259155, - "rewards/margins": 0.26745596528053284, - "rewards/rejected": -0.8842279314994812, + "epoch": 0.3304683856352855, + "grad_norm": 6.503433628260907, + "learning_rate": 4.2345622044281914e-07, + "logits/chosen": -0.7738896608352661, + "logits/rejected": -0.9923878908157349, + "logps/chosen": -201.4437255859375, + "logps/rejected": -201.36099243164062, + "loss": 0.6073, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.46533137559890747, + "rewards/margins": 0.18831129372119904, + "rewards/rejected": -0.6536425948143005, "step": 155 }, { - "epoch": 0.3420173680694723, - "grad_norm": 7.264253831974079, - "learning_rate": 4.158883072525528e-07, - "logits/chosen": -1.153618335723877, - "logits/rejected": -1.248618721961975, - "logps/chosen": -207.35897827148438, - "logps/rejected": -222.1340789794922, - "loss": 0.6052, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.642236590385437, - "rewards/margins": 0.299454003572464, - "rewards/rejected": -0.9416905641555786, + "epoch": 0.34112865613964954, + "grad_norm": 6.951278659773283, + "learning_rate": 4.1663620071744896e-07, + "logits/chosen": -0.8082219958305359, + "logits/rejected": -1.0701286792755127, + "logps/chosen": -221.80789184570312, + "logps/rejected": -220.5237274169922, + "loss": 0.6108, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5697073340415955, + "rewards/margins": 0.196958988904953, + "rewards/rejected": -0.7666663527488708, "step": 160 }, { - "epoch": 0.3527054108216433, - "grad_norm": 7.3185223124368814, - "learning_rate": 4.087789424605447e-07, - "logits/chosen": -1.1305859088897705, - "logits/rejected": -1.230102300643921, - "logps/chosen": -218.1278076171875, - "logps/rejected": -230.4670867919922, - "loss": 0.5805, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7496069073677063, - "rewards/margins": 0.2791447937488556, - "rewards/rejected": -1.0287516117095947, + "epoch": 0.35178892664401357, + "grad_norm": 7.107245594085975, + "learning_rate": 4.0958532876806036e-07, + "logits/chosen": -0.9068414568901062, + "logits/rejected": -1.0665959119796753, + "logps/chosen": -223.1608428955078, + "logps/rejected": -228.6382598876953, + "loss": 0.6007, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.6051439046859741, + "rewards/margins": 0.22736486792564392, + "rewards/rejected": -0.8325088620185852, "step": 165 }, { - "epoch": 0.3633934535738143, - "grad_norm": 7.847762501568954, - "learning_rate": 4.0144751096020497e-07, - "logits/chosen": -1.1950616836547852, - "logits/rejected": -1.2901179790496826, - "logps/chosen": -221.913818359375, - "logps/rejected": -233.92459106445312, - "loss": 0.6019, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.8033916354179382, - "rewards/margins": 0.2500465512275696, - "rewards/rejected": -1.0534381866455078, + "epoch": 0.36244919714837764, + "grad_norm": 7.5558158008023355, + "learning_rate": 4.023133726370341e-07, + "logits/chosen": -0.7768110036849976, + "logits/rejected": -1.023694634437561, + "logps/chosen": -230.20028686523438, + "logps/rejected": -237.296630859375, + "loss": 0.6005, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6818786859512329, + "rewards/margins": 0.2647910714149475, + "rewards/rejected": -0.9466696977615356, "step": 170 }, { - "epoch": 0.3740814963259853, - "grad_norm": 8.324123786977305, - "learning_rate": 3.939042664214184e-07, - "logits/chosen": -1.0172696113586426, - "logits/rejected": -1.1419814825057983, - "logps/chosen": -221.85205078125, - "logps/rejected": -239.06369018554688, - "loss": 0.5934, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.812637984752655, - "rewards/margins": 0.32894450426101685, - "rewards/rejected": -1.141582727432251, + "epoch": 0.37310946765274167, + "grad_norm": 7.748401207711855, + "learning_rate": 3.9483040664938844e-07, + "logits/chosen": -0.8651229739189148, + "logits/rejected": -1.1080349683761597, + "logps/chosen": -239.4313201904297, + "logps/rejected": -245.35641479492188, + "loss": 0.5827, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7178173065185547, + "rewards/margins": 0.29743796586990356, + "rewards/rejected": -1.015255331993103, "step": 175 }, { - "epoch": 0.3847695390781563, - "grad_norm": 8.44823582968457, - "learning_rate": 3.8615975875375676e-07, - "logits/chosen": -1.1517035961151123, - "logits/rejected": -1.2457711696624756, - "logps/chosen": -246.2954864501953, - "logps/rejected": -262.7611389160156, - "loss": 0.5721, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.0003279447555542, - "rewards/margins": 0.3369919955730438, - "rewards/rejected": -1.3373199701309204, + "epoch": 0.38376973815710574, + "grad_norm": 7.833168702083219, + "learning_rate": 3.8714679745614556e-07, + "logits/chosen": -0.9112879633903503, + "logits/rejected": -1.1001932621002197, + "logps/chosen": -251.1482391357422, + "logps/rejected": -257.7167053222656, + "loss": 0.5869, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8083968162536621, + "rewards/margins": 0.26524096727371216, + "rewards/rejected": -1.073637843132019, "step": 180 }, { - "epoch": 0.3954575818303273, - "grad_norm": 9.421575829314097, - "learning_rate": 3.7822481935147655e-07, - "logits/chosen": -1.1958317756652832, - "logits/rejected": -1.3007477521896362, - "logps/chosen": -253.4735565185547, - "logps/rejected": -269.9012451171875, - "loss": 0.5865, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.0968010425567627, - "rewards/margins": 0.28682538866996765, - "rewards/rejected": -1.3836265802383423, + "epoch": 0.39443000866146977, + "grad_norm": 7.402036456357543, + "learning_rate": 3.792731896727196e-07, + "logits/chosen": -0.8897370100021362, + "logits/rejected": -1.091963768005371, + "logps/chosen": -246.6190948486328, + "logps/rejected": -268.6842041015625, + "loss": 0.5851, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8738805651664734, + "rewards/margins": 0.3643074929714203, + "rewards/rejected": -1.2381881475448608, "step": 185 }, { - "epoch": 0.4061456245824983, - "grad_norm": 7.738248266083752, - "learning_rate": 3.7011054594483443e-07, - "logits/chosen": -1.1629494428634644, - "logits/rejected": -1.315276026725769, - "logps/chosen": -233.98428344726562, - "logps/rejected": -253.08676147460938, - "loss": 0.5737, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.9369367361068726, - "rewards/margins": 0.39417704939842224, - "rewards/rejected": -1.3311136960983276, + "epoch": 0.40509027916583384, + "grad_norm": 7.32634230041485, + "learning_rate": 3.712204911322228e-07, + "logits/chosen": -0.8557780981063843, + "logits/rejected": -1.057023286819458, + "logps/chosen": -217.1138916015625, + "logps/rejected": -232.2842254638672, + "loss": 0.5838, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7771707773208618, + "rewards/margins": 0.2797245681285858, + "rewards/rejected": -1.05689537525177, "step": 190 }, { - "epoch": 0.4168336673346693, - "grad_norm": 8.826416400181678, - "learning_rate": 3.618282870789081e-07, - "logits/chosen": -1.217081069946289, - "logits/rejected": -1.3092479705810547, - "logps/chosen": -262.5732727050781, - "logps/rejected": -288.4781799316406, - "loss": 0.5638, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1105743646621704, - "rewards/margins": 0.40279650688171387, - "rewards/rejected": -1.5133707523345947, + "epoch": 0.41575054967019787, + "grad_norm": 9.45088347010784, + "learning_rate": 3.629998577741174e-07, + "logits/chosen": -0.8742257952690125, + "logits/rejected": -1.0490225553512573, + "logps/chosen": -240.11489868164062, + "logps/rejected": -265.6509094238281, + "loss": 0.5864, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8606696128845215, + "rewards/margins": 0.3593491315841675, + "rewards/rejected": -1.2200186252593994, "step": 195 }, { - "epoch": 0.42752171008684037, - "grad_norm": 8.970228997155468, - "learning_rate": 3.5338962624163016e-07, - "logits/chosen": -1.224380612373352, - "logits/rejected": -1.2850580215454102, - "logps/chosen": -247.10861206054688, - "logps/rejected": -262.37396240234375, - "loss": 0.5955, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.104490041732788, - "rewards/margins": 0.23792652785778046, - "rewards/rejected": -1.3424166440963745, + "epoch": 0.42641082017456194, + "grad_norm": 8.652861206718594, + "learning_rate": 3.546226781891501e-07, + "logits/chosen": -0.8858518600463867, + "logits/rejected": -1.0868691205978394, + "logps/chosen": -266.2615051269531, + "logps/rejected": -285.27703857421875, + "loss": 0.5821, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.975814938545227, + "rewards/margins": 0.4038930833339691, + "rewards/rejected": -1.3797080516815186, "step": 200 }, { - "epoch": 0.43820975283901137, - "grad_norm": 8.930840567803743, - "learning_rate": 3.448063656632321e-07, - "logits/chosen": -1.2234936952590942, - "logits/rejected": -1.3128870725631714, - "logps/chosen": -263.89874267578125, - "logps/rejected": -288.7943115234375, - "loss": 0.5636, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0869605541229248, - "rewards/margins": 0.3674429655075073, - "rewards/rejected": -1.4544036388397217, + "epoch": 0.43707109067892597, + "grad_norm": 9.648919264403354, + "learning_rate": 3.461005578419791e-07, + "logits/chosen": -0.8321302533149719, + "logits/rejected": -1.0552650690078735, + "logps/chosen": -253.7904815673828, + "logps/rejected": -272.8400573730469, + "loss": 0.588, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9785162210464478, + "rewards/margins": 0.3188565969467163, + "rewards/rejected": -1.297372817993164, "step": 205 }, { - "epoch": 0.44889779559118237, - "grad_norm": 8.497422391182381, - "learning_rate": 3.360905098097587e-07, - "logits/chosen": -1.1780011653900146, - "logits/rejected": -1.2811458110809326, - "logps/chosen": -272.6873779296875, - "logps/rejected": -302.4361572265625, - "loss": 0.5603, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.1672416925430298, - "rewards/margins": 0.44876694679260254, - "rewards/rejected": -1.6160085201263428, + "epoch": 0.44773136118329004, + "grad_norm": 8.305774901520081, + "learning_rate": 3.374453029933509e-07, + "logits/chosen": -0.9058141708374023, + "logits/rejected": -1.0458682775497437, + "logps/chosen": -258.77069091796875, + "logps/rejected": -279.82977294921875, + "loss": 0.5823, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9745637774467468, + "rewards/margins": 0.3414529263973236, + "rewards/rejected": -1.3160169124603271, "step": 210 }, { - "epoch": 0.45958583834335337, - "grad_norm": 8.864991663622558, - "learning_rate": 3.272542485937368e-07, - "logits/chosen": -1.2407658100128174, - "logits/rejected": -1.350703477859497, - "logps/chosen": -273.6424560546875, - "logps/rejected": -298.4881896972656, - "loss": 0.5632, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2118488550186157, - "rewards/margins": 0.43778902292251587, - "rewards/rejected": -1.6496378183364868, + "epoch": 0.45839163168765407, + "grad_norm": 8.730250055075079, + "learning_rate": 3.286689043441015e-07, + "logits/chosen": -0.8889232873916626, + "logits/rejected": -1.12659752368927, + "logps/chosen": -264.6424255371094, + "logps/rejected": -273.76092529296875, + "loss": 0.5905, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9881819486618042, + "rewards/margins": 0.31245288252830505, + "rewards/rejected": -1.3006350994110107, "step": 215 }, { - "epoch": 0.47027388109552437, - "grad_norm": 9.459474065870525, - "learning_rate": 3.1830994032548e-07, - "logits/chosen": -1.276635766029358, - "logits/rejected": -1.3768898248672485, - "logps/chosen": -290.4046325683594, - "logps/rejected": -321.23968505859375, - "loss": 0.5536, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3036954402923584, - "rewards/margins": 0.4927287697792053, - "rewards/rejected": -1.796424150466919, + "epoch": 0.46905190219201814, + "grad_norm": 9.464259902697126, + "learning_rate": 3.197835204236402e-07, + "logits/chosen": -0.9472643136978149, + "logits/rejected": -1.142138123512268, + "logps/chosen": -279.47662353515625, + "logps/rejected": -311.5118103027344, + "loss": 0.5629, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.133866548538208, + "rewards/margins": 0.4763459265232086, + "rewards/rejected": -1.6102125644683838, "step": 220 }, { - "epoch": 0.48096192384769537, - "grad_norm": 10.873069350565173, - "learning_rate": 3.0927009442887437e-07, - "logits/chosen": -1.2547285556793213, - "logits/rejected": -1.3368825912475586, - "logps/chosen": -244.4249725341797, - "logps/rejected": -270.0481872558594, - "loss": 0.5792, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.0108007192611694, - "rewards/margins": 0.40567222237586975, - "rewards/rejected": -1.4164730310440063, + "epoch": 0.47971217269638217, + "grad_norm": 9.53110205637003, + "learning_rate": 3.1080146074592877e-07, + "logits/chosen": -0.8609586954116821, + "logits/rejected": -1.1460800170898438, + "logps/chosen": -280.66595458984375, + "logps/rejected": -307.8553771972656, + "loss": 0.5514, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1233617067337036, + "rewards/margins": 0.49458152055740356, + "rewards/rejected": -1.6179431676864624, "step": 225 }, { - "epoch": 0.4916499665998664, - "grad_norm": 9.682548364333739, - "learning_rate": 3.001473539458182e-07, - "logits/chosen": -1.2189174890518188, - "logits/rejected": -1.3100054264068604, - "logps/chosen": -294.0689697265625, - "logps/rejected": -322.6885681152344, - "loss": 0.54, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.387765645980835, - "rewards/margins": 0.46354198455810547, - "rewards/rejected": -1.8513076305389404, + "epoch": 0.49037244320074624, + "grad_norm": 10.766670968073823, + "learning_rate": 3.017351687562928e-07, + "logits/chosen": -0.869361400604248, + "logits/rejected": -1.071195125579834, + "logps/chosen": -287.5640869140625, + "logps/rejected": -315.25347900390625, + "loss": 0.5665, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2507811784744263, + "rewards/margins": 0.4507381319999695, + "rewards/rejected": -1.7015190124511719, "step": 230 }, { - "epoch": 0.5023380093520374, - "grad_norm": 9.095073698825349, - "learning_rate": 2.909544778537844e-07, - "logits/chosen": -1.1778374910354614, - "logits/rejected": -1.3377156257629395, - "logps/chosen": -290.4908142089844, - "logps/rejected": -318.8431396484375, - "loss": 0.5224, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.327233910560608, - "rewards/margins": 0.5179687142372131, - "rewards/rejected": -1.8452026844024658, + "epoch": 0.5010327137051103, + "grad_norm": 8.57346401837084, + "learning_rate": 2.925972045926878e-07, + "logits/chosen": -0.9069381952285767, + "logits/rejected": -1.0885123014450073, + "logps/chosen": -276.06878662109375, + "logps/rejected": -302.81072998046875, + "loss": 0.5677, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1936795711517334, + "rewards/margins": 0.44402870535850525, + "rewards/rejected": -1.6377084255218506, "step": 235 }, { - "epoch": 0.5130260521042084, - "grad_norm": 10.375237038099224, - "learning_rate": 2.817043232212371e-07, - "logits/chosen": -1.2266755104064941, - "logits/rejected": -1.3169020414352417, - "logps/chosen": -278.44268798828125, - "logps/rejected": -317.2452392578125, - "loss": 0.5548, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3376266956329346, - "rewards/margins": 0.501586377620697, - "rewards/rejected": -1.8392131328582764, + "epoch": 0.5116929842094743, + "grad_norm": 8.335769499664682, + "learning_rate": 2.83400227685304e-07, + "logits/chosen": -0.926740288734436, + "logits/rejected": -1.188207983970642, + "logps/chosen": -272.0440979003906, + "logps/rejected": -291.0050964355469, + "loss": 0.5609, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1271604299545288, + "rewards/margins": 0.37117230892181396, + "rewards/rejected": -1.4983327388763428, "step": 240 }, { - "epoch": 0.5237140948563794, - "grad_norm": 9.454012213234876, - "learning_rate": 2.7240982722585837e-07, - "logits/chosen": -1.2225902080535889, - "logits/rejected": -1.282707691192627, - "logps/chosen": -286.9110412597656, - "logps/rejected": -312.2615661621094, - "loss": 0.5691, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3561267852783203, - "rewards/margins": 0.39254850149154663, - "rewards/rejected": -1.7486751079559326, + "epoch": 0.5223532547138383, + "grad_norm": 8.95305553011223, + "learning_rate": 2.7415697921861525e-07, + "logits/chosen": -0.8435291051864624, + "logits/rejected": -1.072458028793335, + "logps/chosen": -263.8363952636719, + "logps/rejected": -289.58270263671875, + "loss": 0.552, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0684736967086792, + "rewards/margins": 0.43612685799598694, + "rewards/rejected": -1.5046006441116333, "step": 245 }, { - "epoch": 0.5344021376085505, - "grad_norm": 10.520633380538367, - "learning_rate": 2.63083989060736e-07, - "logits/chosen": -1.2612429857254028, - "logits/rejected": -1.3992116451263428, - "logps/chosen": -268.7119140625, - "logps/rejected": -306.89727783203125, - "loss": 0.553, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2181971073150635, - "rewards/margins": 0.5442426800727844, - "rewards/rejected": -1.7624397277832031, + "epoch": 0.5330135252182024, + "grad_norm": 10.305199478555215, + "learning_rate": 2.6488026448016686e-07, + "logits/chosen": -0.9254539608955383, + "logits/rejected": -1.1660327911376953, + "logps/chosen": -287.7872009277344, + "logps/rejected": -306.3985290527344, + "loss": 0.5594, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1574687957763672, + "rewards/margins": 0.37755414843559265, + "rewards/rejected": -1.5350229740142822, "step": 250 }, { - "epoch": 0.5450901803607214, - "grad_norm": 10.057261387342246, - "learning_rate": 2.537398517538159e-07, - "logits/chosen": -1.2189009189605713, - "logits/rejected": -1.3459957838058472, - "logps/chosen": -323.403564453125, - "logps/rejected": -382.17181396484375, - "loss": 0.5313, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.506287932395935, - "rewards/margins": 0.7995556592941284, - "rewards/rejected": -2.3058438301086426, + "epoch": 0.5436737957225665, + "grad_norm": 9.11035884736237, + "learning_rate": 2.5558293512055923e-07, + "logits/chosen": -0.8859409093856812, + "logits/rejected": -1.1229826211929321, + "logps/chosen": -278.84051513671875, + "logps/rejected": -311.79669189453125, + "loss": 0.5571, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2464487552642822, + "rewards/margins": 0.48425453901290894, + "rewards/rejected": -1.730703353881836, "step": 255 }, { - "epoch": 0.5557782231128925, - "grad_norm": 11.1073129639955, - "learning_rate": 2.4439048392604877e-07, - "logits/chosen": -1.2884687185287476, - "logits/rejected": -1.4388905763626099, - "logps/chosen": -299.78521728515625, - "logps/rejected": -349.76953125, - "loss": 0.5592, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4663364887237549, - "rewards/margins": 0.6724990010261536, - "rewards/rejected": -2.1388354301452637, + "epoch": 0.5543340662269305, + "grad_norm": 9.443455019352353, + "learning_rate": 2.4627787134919946e-07, + "logits/chosen": -0.8607537150382996, + "logits/rejected": -1.067083716392517, + "logps/chosen": -306.5609130859375, + "logps/rejected": -340.9252014160156, + "loss": 0.559, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4955613613128662, + "rewards/margins": 0.5148967504501343, + "rewards/rejected": -2.01045823097229, "step": 260 }, { - "epoch": 0.5664662658650634, - "grad_norm": 9.483991903963172, - "learning_rate": 2.3504896151374144e-07, - "logits/chosen": -1.190502405166626, - "logits/rejected": -1.267677903175354, - "logps/chosen": -296.08197021484375, - "logps/rejected": -338.9811096191406, - "loss": 0.5338, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.4707852602005005, - "rewards/margins": 0.5913419723510742, - "rewards/rejected": -2.0621273517608643, + "epoch": 0.5649943367312945, + "grad_norm": 10.020105882711649, + "learning_rate": 2.369779640904909e-07, + "logits/chosen": -0.9872435331344604, + "logits/rejected": -1.1790921688079834, + "logps/chosen": -301.1463928222656, + "logps/rejected": -326.53509521484375, + "loss": 0.5522, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.46715247631073, + "rewards/margins": 0.45322275161743164, + "rewards/rejected": -1.9203754663467407, "step": 265 }, { - "epoch": 0.5771543086172345, - "grad_norm": 11.616927120737293, - "learning_rate": 2.2572834948067795e-07, - "logits/chosen": -1.2237799167633057, - "logits/rejected": -1.3627163171768188, - "logps/chosen": -257.4330139160156, - "logps/rejected": -289.18780517578125, - "loss": 0.5814, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2827959060668945, - "rewards/margins": 0.43468183279037476, - "rewards/rejected": -1.717477798461914, + "epoch": 0.5756546072356586, + "grad_norm": 9.230369920285517, + "learning_rate": 2.2769609712517602e-07, + "logits/chosen": -0.9972273707389832, + "logits/rejected": -1.139904499053955, + "logps/chosen": -310.1788635253906, + "logps/rejected": -328.85455322265625, + "loss": 0.5693, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3879780769348145, + "rewards/margins": 0.4023415446281433, + "rewards/rejected": -1.7903196811676025, "step": 270 }, { - "epoch": 0.5878423513694054, - "grad_norm": 11.880720453051408, - "learning_rate": 2.164416835455862e-07, - "logits/chosen": -1.239235520362854, - "logits/rejected": -1.3811712265014648, - "logps/chosen": -296.7735290527344, - "logps/rejected": -323.6806945800781, - "loss": 0.5693, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4895226955413818, - "rewards/margins": 0.4056008756160736, - "rewards/rejected": -1.8951237201690674, + "epoch": 0.5863148777400227, + "grad_norm": 9.773551123939216, + "learning_rate": 2.184451292415778e-07, + "logits/chosen": -0.9245126843452454, + "logits/rejected": -1.0917091369628906, + "logps/chosen": -265.5910949707031, + "logps/rejected": -292.25726318359375, + "loss": 0.5625, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.123450517654419, + "rewards/margins": 0.4249204099178314, + "rewards/rejected": -1.5483709573745728, "step": 275 }, { - "epoch": 0.5985303941215765, - "grad_norm": 10.888069633277993, - "learning_rate": 2.072019519505062e-07, - "logits/chosen": -1.2852171659469604, - "logits/rejected": -1.3627903461456299, - "logps/chosen": -255.99462890625, - "logps/rejected": -296.81109619140625, - "loss": 0.5303, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1813851594924927, - "rewards/margins": 0.50818932056427, - "rewards/rejected": -1.6895744800567627, + "epoch": 0.5969751482443867, + "grad_norm": 9.944866138311095, + "learning_rate": 2.0923787642146434e-07, + "logits/chosen": -0.8810575604438782, + "logits/rejected": -1.0941672325134277, + "logps/chosen": -280.61279296875, + "logps/rejected": -312.9557800292969, + "loss": 0.552, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2670402526855469, + "rewards/margins": 0.519837498664856, + "rewards/rejected": -1.7868778705596924, "step": 280 }, { - "epoch": 0.6092184368737475, - "grad_norm": 10.859722903934385, - "learning_rate": 1.980220772955602e-07, - "logits/chosen": -1.2499147653579712, - "logits/rejected": -1.3383333683013916, - "logps/chosen": -270.6602783203125, - "logps/rejected": -304.13397216796875, - "loss": 0.5501, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1872732639312744, - "rewards/margins": 0.47345179319381714, - "rewards/rejected": -1.6607249975204468, + "epoch": 0.6076354187487507, + "grad_norm": 9.880910925618455, + "learning_rate": 2.0008709408521507e-07, + "logits/chosen": -0.9383381009101868, + "logits/rejected": -1.1827994585037231, + "logps/chosen": -295.6000671386719, + "logps/rejected": -324.3331604003906, + "loss": 0.5407, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2447686195373535, + "rewards/margins": 0.5489395260810852, + "rewards/rejected": -1.793708086013794, "step": 285 }, { - "epoch": 0.6199064796259185, - "grad_norm": 10.928462775359565, - "learning_rate": 1.8891489846552644e-07, - "logits/chosen": -1.1926110982894897, - "logits/rejected": -1.3137929439544678, - "logps/chosen": -295.9590148925781, - "logps/rejected": -323.6693420410156, - "loss": 0.5648, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4859222173690796, - "rewards/margins": 0.4386006295681, - "rewards/rejected": -1.9245229959487915, + "epoch": 0.6182956892531148, + "grad_norm": 10.071491320024812, + "learning_rate": 1.9100545942088848e-07, + "logits/chosen": -0.9224274754524231, + "logits/rejected": -1.1538960933685303, + "logps/chosen": -289.017578125, + "logps/rejected": -325.94952392578125, + "loss": 0.5457, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2537972927093506, + "rewards/margins": 0.5672923922538757, + "rewards/rejected": -1.821089744567871, "step": 290 }, { - "epoch": 0.6305945223780896, - "grad_norm": 10.692632816658566, - "learning_rate": 1.7989315267349933e-07, - "logits/chosen": -1.293264627456665, - "logits/rejected": -1.3715952634811401, - "logps/chosen": -307.13677978515625, - "logps/rejected": -353.787109375, - "loss": 0.5611, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.5717823505401611, - "rewards/margins": 0.5958788394927979, - "rewards/rejected": -2.167661428451538, + "epoch": 0.6289559597574789, + "grad_norm": 11.845857689113707, + "learning_rate": 1.8200555382166898e-07, + "logits/chosen": -0.9387105107307434, + "logits/rejected": -1.1250282526016235, + "logps/chosen": -318.4964294433594, + "logps/rejected": -338.69696044921875, + "loss": 0.5696, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5140787363052368, + "rewards/margins": 0.4427851140499115, + "rewards/rejected": -1.9568637609481812, "step": 295 }, { - "epoch": 0.6412825651302605, - "grad_norm": 10.433782095898632, - "learning_rate": 1.7096945764674398e-07, - "logits/chosen": -1.2665973901748657, - "logits/rejected": -1.3694995641708374, - "logps/chosen": -298.32489013671875, - "logps/rejected": -335.8681945800781, - "loss": 0.5456, + "epoch": 0.6396162302618429, + "grad_norm": 10.971903527074975, + "learning_rate": 1.7309984545602528e-07, + "logits/chosen": -0.9286500215530396, + "logits/rejected": -1.1137937307357788, + "logps/chosen": -279.747802734375, + "logps/rejected": -307.8285217285156, + "loss": 0.5376, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.5017318725585938, - "rewards/margins": 0.5392115116119385, - "rewards/rejected": -2.0409433841705322, + "rewards/chosen": -1.323687195777893, + "rewards/margins": 0.48056259751319885, + "rewards/rejected": -1.8042497634887695, "step": 300 }, { - "epoch": 0.6519706078824316, - "grad_norm": 10.37592606965636, - "learning_rate": 1.621562939796643e-07, - "logits/chosen": -1.2665095329284668, - "logits/rejected": -1.3639295101165771, - "logps/chosen": -279.20367431640625, - "logps/rejected": -319.61822509765625, - "loss": 0.5459, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.336974024772644, - "rewards/margins": 0.5596498250961304, - "rewards/rejected": -1.8966238498687744, + "epoch": 0.6502765007662069, + "grad_norm": 10.964118734413244, + "learning_rate": 1.6430067199472657e-07, + "logits/chosen": -0.9661188125610352, + "logits/rejected": -1.1719661951065063, + "logps/chosen": -294.7871398925781, + "logps/rejected": -329.8990783691406, + "loss": 0.5342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3090574741363525, + "rewards/margins": 0.5292733907699585, + "rewards/rejected": -1.838330864906311, "step": 305 }, { - "epoch": 0.6626586506346025, - "grad_norm": 11.017179077327079, - "learning_rate": 1.5346598767856345e-07, - "logits/chosen": -1.2790768146514893, - "logits/rejected": -1.404673457145691, - "logps/chosen": -288.7110290527344, - "logps/rejected": -331.27593994140625, - "loss": 0.5368, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.434003233909607, - "rewards/margins": 0.5939543843269348, - "rewards/rejected": -2.0279576778411865, + "epoch": 0.660936771270571, + "grad_norm": 11.086382549521785, + "learning_rate": 1.5562022351864534e-07, + "logits/chosen": -0.9217275381088257, + "logits/rejected": -1.1163594722747803, + "logps/chosen": -266.56402587890625, + "logps/rejected": -306.4192810058594, + "loss": 0.5437, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1430429220199585, + "rewards/margins": 0.5940698981285095, + "rewards/rejected": -1.7371127605438232, "step": 310 }, { - "epoch": 0.6733466933867736, - "grad_norm": 13.064624393912725, - "learning_rate": 1.4491069292260866e-07, - "logits/chosen": -1.225053310394287, - "logits/rejected": -1.3088257312774658, - "logps/chosen": -312.3059387207031, - "logps/rejected": -343.82244873046875, - "loss": 0.5613, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.544638991355896, - "rewards/margins": 0.46578413248062134, - "rewards/rejected": -2.010423183441162, + "epoch": 0.6715970417749351, + "grad_norm": 10.957109584007643, + "learning_rate": 1.4707052563102748e-07, + "logits/chosen": -0.8743804097175598, + "logits/rejected": -1.0983814001083374, + "logps/chosen": -285.22607421875, + "logps/rejected": -317.2628173828125, + "loss": 0.5298, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3059532642364502, + "rewards/margins": 0.5242554545402527, + "rewards/rejected": -1.8302087783813477, "step": 315 }, { - "epoch": 0.6840347361389446, - "grad_norm": 10.92054454701986, - "learning_rate": 1.365023750651133e-07, - "logits/chosen": -1.278796911239624, - "logits/rejected": -1.3581492900848389, - "logps/chosen": -296.82745361328125, - "logps/rejected": -337.5614318847656, - "loss": 0.5732, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.4520820379257202, - "rewards/margins": 0.5728704333305359, - "rewards/rejected": -2.0249524116516113, + "epoch": 0.6822573122792991, + "grad_norm": 10.507330109558843, + "learning_rate": 1.386634227976224e-07, + "logits/chosen": -0.9597967863082886, + "logits/rejected": -1.124963402748108, + "logps/chosen": -286.6432189941406, + "logps/rejected": -315.79937744140625, + "loss": 0.5378, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3352241516113281, + "rewards/margins": 0.4382667541503906, + "rewards/rejected": -1.7734909057617188, "step": 320 }, { - "epoch": 0.6947227788911156, - "grad_norm": 11.413693224424645, - "learning_rate": 1.2825279389890818e-07, - "logits/chosen": -1.2449887990951538, - "logits/rejected": -1.3573734760284424, - "logps/chosen": -282.05194091796875, - "logps/rejected": -324.5093078613281, - "loss": 0.5624, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3621684312820435, - "rewards/margins": 0.5701974630355835, - "rewards/rejected": -1.9323660135269165, + "epoch": 0.6929175827836631, + "grad_norm": 9.804790546339078, + "learning_rate": 1.3041056193775665e-07, + "logits/chosen": -0.888710618019104, + "logits/rejected": -1.0851693153381348, + "logps/chosen": -311.01544189453125, + "logps/rejected": -332.7283020019531, + "loss": 0.5475, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5570933818817139, + "rewards/margins": 0.4053064286708832, + "rewards/rejected": -1.9623997211456299, "step": 325 }, { - "epoch": 0.7054108216432866, - "grad_norm": 10.81725417806128, - "learning_rate": 1.201734872092077e-07, - "logits/chosen": -1.3177934885025024, - "logits/rejected": -1.4026581048965454, - "logps/chosen": -299.1542663574219, - "logps/rejected": -344.1829528808594, - "loss": 0.5285, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3522742986679077, - "rewards/margins": 0.6428908109664917, - "rewards/rejected": -1.995165228843689, + "epoch": 0.7035778532880271, + "grad_norm": 9.630550808372668, + "learning_rate": 1.2232337628908103e-07, + "logits/chosen": -0.9582077264785767, + "logits/rejected": -1.1537044048309326, + "logps/chosen": -326.71221923828125, + "logps/rejected": -377.6993713378906, + "loss": 0.5435, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4935967922210693, + "rewards/margins": 0.7231054902076721, + "rewards/rejected": -2.2167022228240967, "step": 330 }, { - "epoch": 0.7160988643954576, - "grad_norm": 9.27448000826257, - "learning_rate": 1.1227575463697439e-07, - "logits/chosen": -1.279221773147583, - "logits/rejected": -1.346551537513733, - "logps/chosen": -292.53143310546875, - "logps/rejected": -325.1310729980469, - "loss": 0.5382, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4391541481018066, - "rewards/margins": 0.4418033957481384, - "rewards/rejected": -1.8809573650360107, + "epoch": 0.7142381237923913, + "grad_norm": 9.172032682717258, + "learning_rate": 1.1441306956834504e-07, + "logits/chosen": -0.9413734674453735, + "logits/rejected": -1.1069329977035522, + "logps/chosen": -306.80218505859375, + "logps/rejected": -357.0929870605469, + "loss": 0.5238, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4035927057266235, + "rewards/margins": 0.6626663208007812, + "rewards/rejected": -2.0662589073181152, "step": 335 }, { - "epoch": 0.7267869071476286, - "grad_norm": 12.596942793407296, - "learning_rate": 1.0457064187534861e-07, - "logits/chosen": -1.2788316011428833, - "logits/rejected": -1.3930736780166626, - "logps/chosen": -263.04644775390625, - "logps/rejected": -297.544677734375, - "loss": 0.5243, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2438901662826538, - "rewards/margins": 0.4604038596153259, - "rewards/rejected": -1.704293966293335, + "epoch": 0.7248983942967553, + "grad_norm": 10.907598822157487, + "learning_rate": 1.0669060045014214e-07, + "logits/chosen": -1.0222991704940796, + "logits/rejected": -1.228389024734497, + "logps/chosen": -316.627197265625, + "logps/rejected": -357.66229248046875, + "loss": 0.5388, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4493268728256226, + "rewards/margins": 0.5827343463897705, + "rewards/rejected": -2.0320611000061035, "step": 340 }, { - "epoch": 0.7374749498997996, - "grad_norm": 10.772758676646806, - "learning_rate": 9.706892522124838e-08, - "logits/chosen": -1.3062481880187988, - "logits/rejected": -1.405081033706665, - "logps/chosen": -266.0574951171875, - "logps/rejected": -303.40576171875, - "loss": 0.5403, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2067289352416992, - "rewards/margins": 0.5310922265052795, - "rewards/rejected": -1.7378212213516235, + "epoch": 0.7355586648011193, + "grad_norm": 10.97300975462713, + "learning_rate": 9.9166667385128e-08, + "logits/chosen": -0.963638186454773, + "logits/rejected": -1.1757190227508545, + "logps/chosen": -304.3102722167969, + "logps/rejected": -354.2998962402344, + "loss": 0.5432, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4618219137191772, + "rewards/margins": 0.7080960273742676, + "rewards/rejected": -2.1699178218841553, "step": 345 }, { - "epoch": 0.7481629926519706, - "grad_norm": 12.704539304943145, - "learning_rate": 8.978109650374396e-08, - "logits/chosen": -1.289038896560669, - "logits/rejected": -1.3789876699447632, - "logps/chosen": -288.42108154296875, - "logps/rejected": -334.9360656738281, - "loss": 0.54, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3827602863311768, - "rewards/margins": 0.6269603967666626, - "rewards/rejected": -2.0097203254699707, + "epoch": 0.7462189353054833, + "grad_norm": 9.89897013382996, + "learning_rate": 9.185169377874488e-08, + "logits/chosen": -0.9903243780136108, + "logits/rejected": -1.1469306945800781, + "logps/chosen": -312.1212158203125, + "logps/rejected": -346.9307861328125, + "loss": 0.5252, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5106861591339111, + "rewards/margins": 0.49892768263816833, + "rewards/rejected": -2.0096137523651123, "step": 350 }, { - "epoch": 0.7588510354041417, - "grad_norm": 12.372363424610926, - "learning_rate": 8.271734841028552e-08, - "logits/chosen": -1.2998688220977783, - "logits/rejected": -1.3741917610168457, - "logps/chosen": -277.0091247558594, - "logps/rejected": -311.0502014160156, - "loss": 0.5408, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3495705127716064, - "rewards/margins": 0.47056931257247925, - "rewards/rejected": -1.8201396465301514, + "epoch": 0.7568792058098475, + "grad_norm": 10.018680833325265, + "learning_rate": 8.475581355098379e-08, + "logits/chosen": -0.9698395729064941, + "logits/rejected": -1.1572554111480713, + "logps/chosen": -304.4853820800781, + "logps/rejected": -342.16827392578125, + "loss": 0.5462, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4320096969604492, + "rewards/margins": 0.5366055965423584, + "rewards/rejected": -1.968615174293518, "step": 355 }, { - "epoch": 0.7695390781563126, - "grad_norm": 12.925333584054316, - "learning_rate": 7.588756023130833e-08, - "logits/chosen": -1.271308183670044, - "logits/rejected": -1.3560264110565186, - "logps/chosen": -298.91485595703125, - "logps/rejected": -354.1470642089844, - "loss": 0.5435, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3966560363769531, - "rewards/margins": 0.7308663129806519, - "rewards/rejected": -2.1275222301483154, + "epoch": 0.7675394763142115, + "grad_norm": 11.03385142626086, + "learning_rate": 7.788885709719033e-08, + "logits/chosen": -0.9215399622917175, + "logits/rejected": -1.1144723892211914, + "logps/chosen": -316.9365234375, + "logps/rejected": -359.6341857910156, + "loss": 0.5392, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.580185890197754, + "rewards/margins": 0.564557671546936, + "rewards/rejected": -2.1447434425354004, "step": 360 }, { - "epoch": 0.7802271209084837, - "grad_norm": 9.654555776347287, - "learning_rate": 6.930128404315214e-08, - "logits/chosen": -1.2685134410858154, - "logits/rejected": -1.3667380809783936, - "logps/chosen": -258.31146240234375, - "logps/rejected": -297.5175476074219, - "loss": 0.5416, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2191996574401855, - "rewards/margins": 0.540935218334198, - "rewards/rejected": -1.7601349353790283, + "epoch": 0.7781997468185755, + "grad_norm": 9.523737016870674, + "learning_rate": 7.126033766936365e-08, + "logits/chosen": -0.9409270286560059, + "logits/rejected": -1.124208688735962, + "logps/chosen": -311.7746276855469, + "logps/rejected": -355.46343994140625, + "loss": 0.536, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5002214908599854, + "rewards/margins": 0.5499864816665649, + "rewards/rejected": -2.05020809173584, "step": 365 }, { - "epoch": 0.7909151636606546, - "grad_norm": 8.882525663665772, - "learning_rate": 6.296773134861824e-08, - "logits/chosen": -1.2775005102157593, - "logits/rejected": -1.3826978206634521, - "logps/chosen": -314.6211853027344, - "logps/rejected": -362.9993591308594, - "loss": 0.5287, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.481632947921753, - "rewards/margins": 0.6845245957374573, - "rewards/rejected": -2.1661574840545654, + "epoch": 0.7888600173229395, + "grad_norm": 11.210638577879926, + "learning_rate": 6.487943819681488e-08, + "logits/chosen": -0.9616110920906067, + "logits/rejected": -1.0974061489105225, + "logps/chosen": -315.260009765625, + "logps/rejected": -357.67059326171875, + "loss": 0.5533, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.509570837020874, + "rewards/margins": 0.537238597869873, + "rewards/rejected": -2.046809434890747, "step": 370 }, { - "epoch": 0.8016032064128257, - "grad_norm": 12.3266129654073, - "learning_rate": 5.6895760193850145e-08, - "logits/chosen": -1.2599807977676392, - "logits/rejected": -1.362378478050232, - "logps/chosen": -330.29217529296875, - "logps/rejected": -376.5113220214844, - "loss": 0.5473, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.621299386024475, - "rewards/margins": 0.6200514435768127, - "rewards/rejected": -2.2413508892059326, + "epoch": 0.7995202878273037, + "grad_norm": 9.781063018210089, + "learning_rate": 5.875499856444358e-08, + "logits/chosen": -0.9564340710639954, + "logits/rejected": -1.1133265495300293, + "logps/chosen": -314.17535400390625, + "logps/rejected": -351.45001220703125, + "loss": 0.5458, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.493622064590454, + "rewards/margins": 0.5427702069282532, + "rewards/rejected": -2.0363922119140625, "step": 375 }, { - "epoch": 0.8122912491649966, - "grad_norm": 9.312057645660815, - "learning_rate": 5.109386277955477e-08, - "logits/chosen": -1.296836018562317, - "logits/rejected": -1.372804880142212, - "logps/chosen": -279.9493103027344, - "logps/rejected": -315.6463928222656, - "loss": 0.5322, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3503572940826416, - "rewards/margins": 0.4901738166809082, - "rewards/rejected": -1.8405311107635498, + "epoch": 0.8101805583316677, + "grad_norm": 11.983119955061767, + "learning_rate": 5.289550336625731e-08, + "logits/chosen": -1.0206782817840576, + "logits/rejected": -1.2104320526123047, + "logps/chosen": -327.4963684082031, + "logps/rejected": -353.74603271484375, + "loss": 0.5474, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.506259560585022, + "rewards/margins": 0.49152374267578125, + "rewards/rejected": -1.9977830648422241, "step": 380 }, { - "epoch": 0.8229792919171677, - "grad_norm": 10.515704364878783, - "learning_rate": 4.557015358389216e-08, - "logits/chosen": -1.2392408847808838, - "logits/rejected": -1.3424805402755737, - "logps/chosen": -304.5181579589844, - "logps/rejected": -352.06317138671875, - "loss": 0.541, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.4691457748413086, - "rewards/margins": 0.6450926065444946, - "rewards/rejected": -2.1142382621765137, + "epoch": 0.8208408288360317, + "grad_norm": 10.83148544527409, + "learning_rate": 4.730907015109759e-08, + "logits/chosen": -0.9245961308479309, + "logits/rejected": -1.1795787811279297, + "logps/chosen": -309.1303405761719, + "logps/rejected": -346.46051025390625, + "loss": 0.5403, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5297610759735107, + "rewards/margins": 0.5533354878425598, + "rewards/rejected": -2.083096742630005, "step": 385 }, { - "epoch": 0.8336673346693386, - "grad_norm": 10.683415661086439, - "learning_rate": 4.0332358013644015e-08, - "logits/chosen": -1.2670717239379883, - "logits/rejected": -1.3680970668792725, - "logps/chosen": -252.0902862548828, - "logps/rejected": -290.1014709472656, - "loss": 0.5534, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.155608057975769, - "rewards/margins": 0.5242873430252075, - "rewards/rejected": -1.6798954010009766, + "epoch": 0.8315010993403957, + "grad_norm": 9.500539654945461, + "learning_rate": 4.200343817685981e-08, + "logits/chosen": -0.9566155672073364, + "logits/rejected": -1.0963544845581055, + "logps/chosen": -313.0601501464844, + "logps/rejected": -343.36773681640625, + "loss": 0.547, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5300524234771729, + "rewards/margins": 0.4933779835700989, + "rewards/rejected": -2.023430347442627, "step": 390 }, { - "epoch": 0.8443553774215097, - "grad_norm": 11.153626380505282, - "learning_rate": 3.538780159953347e-08, - "logits/chosen": -1.3420182466506958, - "logits/rejected": -1.4481894969940186, - "logps/chosen": -300.3475036621094, - "logps/rejected": -344.5700378417969, - "loss": 0.5455, + "epoch": 0.8421613698447599, + "grad_norm": 9.955855605589283, + "learning_rate": 3.698595768878363e-08, + "logits/chosen": -0.9913743734359741, + "logits/rejected": -1.180884599685669, + "logps/chosen": -311.83636474609375, + "logps/rejected": -356.932373046875, + "loss": 0.5178, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3933840990066528, - "rewards/margins": 0.6461249589920044, - "rewards/rejected": -2.039508819580078, + "rewards/chosen": -1.429694414138794, + "rewards/margins": 0.6187530755996704, + "rewards/rejected": -2.048447370529175, "step": 395 }, { - "epoch": 0.8550434201736807, - "grad_norm": 10.825658500946938, - "learning_rate": 3.074339975080836e-08, - "logits/chosen": -1.3054192066192627, - "logits/rejected": -1.3680384159088135, - "logps/chosen": -260.28802490234375, - "logps/rejected": -290.3493347167969, - "loss": 0.5551, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1752262115478516, - "rewards/margins": 0.4429014325141907, - "rewards/rejected": -1.6181275844573975, + "epoch": 0.8528216403491239, + "grad_norm": 11.149747005186983, + "learning_rate": 3.226357973666888e-08, + "logits/chosen": -1.0238213539123535, + "logits/rejected": -1.1811949014663696, + "logps/chosen": -332.1514587402344, + "logps/rejected": -359.03167724609375, + "loss": 0.5505, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6280012130737305, + "rewards/margins": 0.43937546014785767, + "rewards/rejected": -2.0673766136169434, "step": 400 }, { - "epoch": 0.8550434201736807, - "eval_logits/chosen": -1.5384172201156616, - "eval_logits/rejected": -1.541242003440857, - "eval_logps/chosen": -294.61590576171875, - "eval_logps/rejected": -332.76953125, - "eval_loss": 0.6313633918762207, - "eval_rewards/accuracies": 0.6402438879013062, - "eval_rewards/chosen": -1.478508710861206, - "eval_rewards/margins": 0.4022486209869385, - "eval_rewards/rejected": -1.880757212638855, - "eval_runtime": 268.6851, - "eval_samples_per_second": 7.299, - "eval_steps_per_second": 0.458, + "epoch": 0.8528216403491239, + "eval_logits/chosen": -0.9705477356910706, + "eval_logits/rejected": -1.165926456451416, + "eval_logps/chosen": -307.21051025390625, + "eval_logps/rejected": -356.52508544921875, + "eval_loss": 0.5049245953559875, + "eval_rewards/accuracies": 0.7932573556900024, + "eval_rewards/chosen": -1.4455755949020386, + "eval_rewards/margins": 0.6763937473297119, + "eval_rewards/rejected": -2.12196946144104, + "eval_runtime": 11441.6179, + "eval_samples_per_second": 5.247, + "eval_steps_per_second": 1.312, "step": 400 }, { - "epoch": 0.8657314629258517, - "grad_norm": 10.396212061172688, - "learning_rate": 2.6405648083415833e-08, - "logits/chosen": -1.2402374744415283, - "logits/rejected": -1.3408787250518799, - "logps/chosen": -312.81597900390625, - "logps/rejected": -357.02557373046875, - "loss": 0.553, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.5488849878311157, - "rewards/margins": 0.5720169544219971, - "rewards/rejected": -2.1209018230438232, + "epoch": 0.8634819108534879, + "grad_norm": 9.468787134199466, + "learning_rate": 2.7842846545123505e-08, + "logits/chosen": -0.9555789232254028, + "logits/rejected": -1.1705703735351562, + "logps/chosen": -289.6531677246094, + "logps/rejected": -345.7925720214844, + "loss": 0.5233, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3922350406646729, + "rewards/margins": 0.6980171203613281, + "rewards/rejected": -2.090252161026001, "step": 405 }, { - "epoch": 0.8764195056780227, - "grad_norm": 11.668815065523301, - "learning_rate": 2.2380613335296033e-08, - "logits/chosen": -1.2881349325180054, - "logits/rejected": -1.35610830783844, - "logps/chosen": -257.713623046875, - "logps/rejected": -296.5043029785156, - "loss": 0.5289, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2816210985183716, - "rewards/margins": 0.5033342242240906, - "rewards/rejected": -1.7849552631378174, + "epoch": 0.8741421813578519, + "grad_norm": 10.178761020491258, + "learning_rate": 2.372988245018401e-08, + "logits/chosen": -0.9851318597793579, + "logits/rejected": -1.1668522357940674, + "logps/chosen": -316.6786193847656, + "logps/rejected": -362.8905944824219, + "loss": 0.5423, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.598661184310913, + "rewards/margins": 0.608306884765625, + "rewards/rejected": -2.206967830657959, "step": 410 }, { - "epoch": 0.8871075484301937, - "grad_norm": 10.430503404637301, - "learning_rate": 1.8673924881500823e-08, - "logits/chosen": -1.2311804294586182, - "logits/rejected": -1.3315976858139038, - "logps/chosen": -287.8216247558594, - "logps/rejected": -335.9296875, - "loss": 0.5387, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3706209659576416, - "rewards/margins": 0.6368829011917114, - "rewards/rejected": -2.0075037479400635, + "epoch": 0.884802451862216, + "grad_norm": 9.329485481095736, + "learning_rate": 1.9930385414865386e-08, + "logits/chosen": -1.0145405530929565, + "logits/rejected": -1.2289698123931885, + "logps/chosen": -336.15087890625, + "logps/rejected": -373.11309814453125, + "loss": 0.5293, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.558721899986267, + "rewards/margins": 0.6198412775993347, + "rewards/rejected": -2.178563356399536, "step": 415 }, { - "epoch": 0.8977955911823647, - "grad_norm": 11.633168606978016, - "learning_rate": 1.5290766861003475e-08, - "logits/chosen": -1.3063844442367554, - "logits/rejected": -1.3918168544769287, - "logps/chosen": -304.36871337890625, - "logps/rejected": -336.753173828125, - "loss": 0.5691, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4071794748306274, - "rewards/margins": 0.50440514087677, - "rewards/rejected": -1.9115846157073975, + "epoch": 0.8954627223665801, + "grad_norm": 9.690686562397088, + "learning_rate": 1.6449619135393084e-08, + "logits/chosen": -0.9239746928215027, + "logits/rejected": -1.1881077289581299, + "logps/chosen": -296.87200927734375, + "logps/rejected": -329.9718017578125, + "loss": 0.5513, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.472847580909729, + "rewards/margins": 0.5113754868507385, + "rewards/rejected": -1.9842230081558228, "step": 420 }, { - "epoch": 0.9084836339345357, - "grad_norm": 10.024740499585103, - "learning_rate": 1.2235870926211616e-08, - "logits/chosen": -1.2585606575012207, - "logits/rejected": -1.3874483108520508, - "logps/chosen": -278.6107482910156, - "logps/rejected": -315.7309265136719, - "loss": 0.5177, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2951407432556152, - "rewards/margins": 0.5562527179718018, - "rewards/rejected": -1.851393461227417, + "epoch": 0.9061229928709441, + "grad_norm": 10.862769817255897, + "learning_rate": 1.329240574905452e-08, + "logits/chosen": -0.9023639559745789, + "logits/rejected": -1.0890004634857178, + "logps/chosen": -324.7179260253906, + "logps/rejected": -374.7180480957031, + "loss": 0.5149, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5423232316970825, + "rewards/margins": 0.6671528816223145, + "rewards/rejected": -2.2094759941101074, "step": 425 }, { - "epoch": 0.9191716766867067, - "grad_norm": 10.820746919937319, - "learning_rate": 9.513509625323518e-09, - "logits/chosen": -1.2896642684936523, - "logits/rejected": -1.4110147953033447, - "logps/chosen": -270.7927551269531, - "logps/rejected": -316.62298583984375, - "loss": 0.5154, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3175842761993408, - "rewards/margins": 0.5881962776184082, - "rewards/rejected": -1.9057804346084595, + "epoch": 0.9167832633753081, + "grad_norm": 11.35977235393007, + "learning_rate": 1.0463119153770989e-08, + "logits/chosen": -0.9444347620010376, + "logits/rejected": -1.1702197790145874, + "logps/chosen": -298.4215393066406, + "logps/rejected": -328.64215087890625, + "loss": 0.5404, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4311974048614502, + "rewards/margins": 0.5026859045028687, + "rewards/rejected": -1.9338833093643188, "step": 430 }, { - "epoch": 0.9298597194388778, - "grad_norm": 13.093979570115176, - "learning_rate": 7.127490426783123e-09, - "logits/chosen": -1.2911689281463623, - "logits/rejected": -1.3863338232040405, - "logps/chosen": -293.3140563964844, - "logps/rejected": -344.6944885253906, - "loss": 0.5498, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3975478410720825, - "rewards/margins": 0.6820291876792908, - "rewards/rejected": -2.0795769691467285, + "epoch": 0.9274435338796722, + "grad_norm": 10.068213055827782, + "learning_rate": 7.965678948645832e-09, + "logits/chosen": -0.9912747144699097, + "logits/rejected": -1.2084077596664429, + "logps/chosen": -336.46929931640625, + "logps/rejected": -379.56640625, + "loss": 0.538, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6182082891464233, + "rewards/margins": 0.6836891174316406, + "rewards/rejected": -2.3018975257873535, "step": 435 }, { - "epoch": 0.9405477621910487, - "grad_norm": 10.22410965930427, - "learning_rate": 5.08115039419113e-09, - "logits/chosen": -1.3063465356826782, - "logits/rejected": -1.411901831626892, - "logps/chosen": -269.8985290527344, - "logps/rejected": -323.1341857910156, - "loss": 0.5179, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2329270839691162, - "rewards/margins": 0.6758817434310913, - "rewards/rejected": -1.908808946609497, + "epoch": 0.9381038043840363, + "grad_norm": 12.790282190393167, + "learning_rate": 5.803545003882554e-09, + "logits/chosen": -0.9938758015632629, + "logits/rejected": -1.17817223072052, + "logps/chosen": -326.2915954589844, + "logps/rejected": -371.28631591796875, + "loss": 0.5377, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5600776672363281, + "rewards/margins": 0.5917671918869019, + "rewards/rejected": -2.1518447399139404, "step": 440 }, { - "epoch": 0.9512358049432198, - "grad_norm": 13.236713645006159, - "learning_rate": 3.3773515191196646e-09, - "logits/chosen": -1.2777467966079712, - "logits/rejected": -1.376539707183838, - "logps/chosen": -271.12701416015625, - "logps/rejected": -302.2646789550781, - "loss": 0.5469, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3269426822662354, - "rewards/margins": 0.426018625497818, - "rewards/rejected": -1.7529613971710205, + "epoch": 0.9487640748884003, + "grad_norm": 9.050016131957404, + "learning_rate": 3.979712667596669e-09, + "logits/chosen": -0.9720270037651062, + "logits/rejected": -1.1488044261932373, + "logps/chosen": -304.312255859375, + "logps/rejected": -351.5962219238281, + "loss": 0.5199, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4655094146728516, + "rewards/margins": 0.6790416240692139, + "rewards/rejected": -2.1445512771606445, "step": 445 }, { - "epoch": 0.9619238476953907, - "grad_norm": 10.347294825234256, - "learning_rate": 2.0184767183584474e-09, - "logits/chosen": -1.2645479440689087, - "logits/rejected": -1.336827278137207, - "logps/chosen": -295.91156005859375, - "logps/rejected": -335.7817687988281, - "loss": 0.5367, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4391281604766846, - "rewards/margins": 0.5296716690063477, - "rewards/rejected": -1.9687998294830322, + "epoch": 0.9594243453927643, + "grad_norm": 13.159010993827899, + "learning_rate": 2.4967086161600814e-09, + "logits/chosen": -0.994873046875, + "logits/rejected": -1.1672512292861938, + "logps/chosen": -314.894287109375, + "logps/rejected": -354.23223876953125, + "loss": 0.5276, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5018284320831299, + "rewards/margins": 0.5567340850830078, + "rewards/rejected": -2.0585622787475586, "step": 450 }, { - "epoch": 0.9726118904475618, - "grad_norm": 10.13734221117426, - "learning_rate": 1.0064265011902328e-09, - "logits/chosen": -1.283314824104309, - "logits/rejected": -1.433180332183838, - "logps/chosen": -289.7782897949219, - "logps/rejected": -332.22894287109375, - "loss": 0.5384, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.4063074588775635, - "rewards/margins": 0.5916343927383423, - "rewards/rejected": -1.9979419708251953, + "epoch": 0.9700846158971284, + "grad_norm": 9.906738715572994, + "learning_rate": 1.3565873538283757e-09, + "logits/chosen": -0.9630732536315918, + "logits/rejected": -1.1276707649230957, + "logps/chosen": -306.04345703125, + "logps/rejected": -351.21099853515625, + "loss": 0.5208, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.395446538925171, + "rewards/margins": 0.6138492822647095, + "rewards/rejected": -2.009295701980591, "step": 455 }, { - "epoch": 0.9832999331997327, - "grad_norm": 9.623765957605292, - "learning_rate": 3.4261631135654167e-10, - "logits/chosen": -1.2937498092651367, - "logits/rejected": -1.3670485019683838, - "logps/chosen": -297.85888671875, - "logps/rejected": -353.9171142578125, - "loss": 0.5159, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.399839997291565, - "rewards/margins": 0.7437825202941895, - "rewards/rejected": -2.143622398376465, + "epoch": 0.9807448864014925, + "grad_norm": 10.687835024200046, + "learning_rate": 5.609283664990693e-10, + "logits/chosen": -0.9506285786628723, + "logits/rejected": -1.20163094997406, + "logps/chosen": -323.80657958984375, + "logps/rejected": -370.2672424316406, + "loss": 0.5199, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5296146869659424, + "rewards/margins": 0.6610507369041443, + "rewards/rejected": -2.1906654834747314, "step": 460 }, { - "epoch": 0.9939879759519038, - "grad_norm": 11.389757630118782, - "learning_rate": 2.797454743164174e-11, - "logits/chosen": -1.2656190395355225, - "logits/rejected": -1.3394776582717896, - "logps/chosen": -293.093505859375, - "logps/rejected": -331.55291748046875, - "loss": 0.5495, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.443403959274292, - "rewards/margins": 0.527863621711731, - "rewards/rejected": -1.9712673425674438, + "epoch": 0.9914051569058565, + "grad_norm": 11.797447945184583, + "learning_rate": 1.1083393354488491e-10, + "logits/chosen": -0.9356955289840698, + "logits/rejected": -1.1217402219772339, + "logps/chosen": -326.0872497558594, + "logps/rejected": -382.658203125, + "loss": 0.5263, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.588428020477295, + "rewards/margins": 0.7401828169822693, + "rewards/rejected": -2.328610897064209, "step": 465 }, { - "epoch": 0.9982631930527722, - "step": 467, + "epoch": 0.9999333733093477, + "step": 469, "total_flos": 0.0, - "train_loss": 0.5881322287900544, - "train_runtime": 16866.5046, - "train_samples_per_second": 3.55, - "train_steps_per_second": 0.028 + "train_loss": 0.5891387982409138, + "train_runtime": 37343.5856, + "train_samples_per_second": 1.608, + "train_steps_per_second": 0.013 } ], "logging_steps": 5, - "max_steps": 467, + "max_steps": 469, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, @@ -1462,7 +1462,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 2, + "train_batch_size": 1, "trial_name": null, "trial_params": null }