diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9954 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999711843242724, + "eval_steps": 500, + "global_step": 5204, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 220.77500915527344, + "epoch": 0.00019210450485063874, + "grad_norm": 2.5577025413513184, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.038, + "reward": 0.37062498927116394, + "reward_std": 0.34713491797447205, + "rewards/code_format_reward": 0.26875001192092896, + "rewards/code_reward": 0.11812499910593033, + "step": 1, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.01640424354829722, + "clip_ratio/high_mean": 0.003707133045989192, + "clip_ratio/low_mean": 0.0004983297904901621, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004205462749167863, + "completion_length": 164.34375381469727, + "epoch": 0.0019210450485063874, + "grad_norm": 2.2875964641571045, + "kl": 0.13929970601263145, + "learning_rate": 9.999947520846931e-07, + "loss": 0.0575, + "reward": 0.655464380979538, + "reward_std": 0.6216425597667694, + "rewards/code_format_reward": 0.5078125074505806, + "rewards/code_reward": 0.20077905245125294, + "step": 10, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio/high_max": 0.04116484243422747, + "clip_ratio/high_mean": 0.007335515914019197, + "clip_ratio/low_mean": 0.00010183055419474841, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007437346538063138, + "completion_length": 100.84750213623047, + "epoch": 0.003842090097012775, + "grad_norm": 2.409867286682129, + "kl": 1.1695969879627228, + "learning_rate": 9.999734326385416e-07, + "loss": -0.0111, + "reward": 0.9829235672950745, + "reward_std": 0.5127422153949738, + "rewards/code_format_reward": 0.84375, + "rewards/code_reward": 0.2805242508649826, + "step": 20, + "zero_std_ratio": 0.075 + }, + { + "clip_ratio/high_max": 0.039504543878138065, + "clip_ratio/high_mean": 0.0051267803879454735, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0051267803879454735, + "completion_length": 97.57750091552734, + "epoch": 0.005763135145519163, + "grad_norm": 4.608696460723877, + "kl": 2.0701700329780577, + "learning_rate": 9.99935714443203e-07, + "loss": -0.019, + "reward": 1.1568554759025573, + "reward_std": 0.6407819569110871, + "rewards/code_format_reward": 0.8674999952316285, + "rewards/code_reward": 0.3615527212619781, + "step": 30, + "zero_std_ratio": 0.025 + }, + { + "clip_ratio/high_max": 0.005034898268058896, + "clip_ratio/high_mean": 0.0006811309984186664, + "clip_ratio/low_mean": 0.00013664715661434456, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008177781579433941, + "completion_length": 83.10500030517578, + "epoch": 0.00768418019402555, + "grad_norm": 4.833131313323975, + "kl": 2.2019619703292848, + "learning_rate": 9.99881598873272e-07, + "loss": -0.02, + "reward": 1.1795239448547363, + "reward_std": 0.7194581270217896, + "rewards/code_format_reward": 0.8987499952316285, + "rewards/code_reward": 0.36507447361946105, + "step": 40, + "zero_std_ratio": 0.05 + }, + { + "clip_ratio/high_max": 0.00872214906848967, + "clip_ratio/high_mean": 0.0010902686335612088, + "clip_ratio/low_mean": 0.00040422612219117584, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014944947557523846, + "completion_length": 88.22500152587891, + "epoch": 0.009605225242531937, + "grad_norm": 2.778585433959961, + "kl": 2.4498987793922424, + "learning_rate": 9.998110879009265e-07, + "loss": -0.0035, + "reward": 1.2663686752319336, + "reward_std": 0.6244019389152526, + "rewards/code_format_reward": 0.918750011920929, + "rewards/code_reward": 0.40349680185317993, + "step": 50, + "zero_std_ratio": 0.075 + }, + { + "clip_ratio/high_max": 0.016367838624864815, + "clip_ratio/high_mean": 0.002741052128840238, + "clip_ratio/low_mean": 0.0008432979579083621, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003584350028540939, + "completion_length": 91.3375, + "epoch": 0.011526270291038325, + "grad_norm": 2.5201120376586914, + "kl": 2.7947509050369264, + "learning_rate": 9.997241840958557e-07, + "loss": 0.005, + "reward": 1.0697558522224426, + "reward_std": 0.49940577149391174, + "rewards/code_format_reward": 0.9200000047683716, + "rewards/code_reward": 0.30487790107727053, + "step": 60, + "zero_std_ratio": 0.025 + }, + { + "clip_ratio/high_max": 0.031629907339811324, + "clip_ratio/high_mean": 0.005140213097911328, + "clip_ratio/low_mean": 0.003656612744089216, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008796826144680381, + "completion_length": 84.79750213623046, + "epoch": 0.013447315339544713, + "grad_norm": 7.281564712524414, + "kl": 1.7218781247735024, + "learning_rate": 9.99620890625166e-07, + "loss": -0.0261, + "reward": 1.1421246886253358, + "reward_std": 0.5977877795696258, + "rewards/code_format_reward": 0.9275000095367432, + "rewards/code_reward": 0.33918734490871427, + "step": 70, + "zero_std_ratio": 0.05 + }, + { + "clip_ratio/high_max": 0.10261552361771464, + "clip_ratio/high_mean": 0.014289343578275293, + "clip_ratio/low_mean": 0.0031720689148642123, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017461412807460875, + "completion_length": 75.67250061035156, + "epoch": 0.0153683603880511, + "grad_norm": 3.359511137008667, + "kl": 0.3687619216740131, + "learning_rate": 9.995012112532654e-07, + "loss": -0.0037, + "reward": 1.2640612244606018, + "reward_std": 0.5189764618873596, + "rewards/code_format_reward": 0.9087499976158142, + "rewards/code_reward": 0.40484309792518614, + "step": 80, + "zero_std_ratio": 0.075 + }, + { + "clip_ratio/high_max": 0.053048994287382814, + "clip_ratio/high_mean": 0.0092369354548282, + "clip_ratio/low_mean": 0.00010360952001065016, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009340544970473274, + "completion_length": 84.56500091552735, + "epoch": 0.01728940543655749, + "grad_norm": 2.177191734313965, + "kl": 0.5693678379058837, + "learning_rate": 9.993651503417269e-07, + "loss": -0.008, + "reward": 1.1986377000808717, + "reward_std": 0.49277395009994507, + "rewards/code_format_reward": 0.9112500071525573, + "rewards/code_reward": 0.3715063512325287, + "step": 90, + "zero_std_ratio": 0.1 + }, + { + "clip_ratio/high_max": 0.04136249013245106, + "clip_ratio/high_mean": 0.00551328391302377, + "clip_ratio/low_mean": 0.001408070686738938, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006921354681253433, + "completion_length": 80.8550033569336, + "epoch": 0.019210450485063875, + "grad_norm": 2.0033416748046875, + "kl": 0.8493028253316879, + "learning_rate": 9.992127128491296e-07, + "loss": 0.0027, + "reward": 1.1780336141586303, + "reward_std": 0.4479735493659973, + "rewards/code_format_reward": 0.9275000095367432, + "rewards/code_reward": 0.3571417987346649, + "step": 100, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio/high_max": 0.0585523322224617, + "clip_ratio/high_mean": 0.008032404945697635, + "clip_ratio/low_mean": 0.006125411042012275, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014157815964426845, + "completion_length": 74.19000091552735, + "epoch": 0.02113149553357026, + "grad_norm": 2.267624855041504, + "kl": 1.1039492040872574, + "learning_rate": 9.990439043308776e-07, + "loss": -0.0238, + "reward": 1.2784739494323731, + "reward_std": 0.49057124853134154, + "rewards/code_format_reward": 0.9475000023841857, + "rewards/code_reward": 0.40236196517944334, + "step": 110, + "zero_std_ratio": 0.175 + }, + { + "clip_ratio/high_max": 0.07124514738097787, + "clip_ratio/high_mean": 0.01569047374650836, + "clip_ratio/low_mean": 0.0004420768018462695, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01613254987169057, + "completion_length": 68.35750045776368, + "epoch": 0.02305254058207665, + "grad_norm": 4.1564249992370605, + "kl": 1.4338344126939773, + "learning_rate": 9.988587309389975e-07, + "loss": -0.0026, + "reward": 1.1606964468955994, + "reward_std": 0.46601226925849915, + "rewards/code_format_reward": 0.9475000023841857, + "rewards/code_reward": 0.34347322285175325, + "step": 120, + "zero_std_ratio": 0.175 + }, + { + "clip_ratio/high_max": 0.07592196827754379, + "clip_ratio/high_mean": 0.014454811741597951, + "clip_ratio/low_mean": 0.0013599038298707455, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015814715722808615, + "completion_length": 72.15750122070312, + "epoch": 0.024973585630583037, + "grad_norm": 3.9662575721740723, + "kl": 1.5312897458672523, + "learning_rate": 9.98657199421914e-07, + "loss": -0.0024, + "reward": 1.1610160946846009, + "reward_std": 0.3773229032754898, + "rewards/code_format_reward": 0.9587499976158143, + "rewards/code_reward": 0.3408205330371857, + "step": 130, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.07943324451334774, + "clip_ratio/high_mean": 0.014111382194096222, + "clip_ratio/low_mean": 0.0036704083904623985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017781790602020918, + "completion_length": 83.8675033569336, + "epoch": 0.026894630679089426, + "grad_norm": 9.37182331085205, + "kl": 0.5293755233287811, + "learning_rate": 9.984393171242054e-07, + "loss": -0.0045, + "reward": 1.3634901762008667, + "reward_std": 0.5678210258483887, + "rewards/code_format_reward": 0.9512500047683716, + "rewards/code_reward": 0.4439325869083405, + "step": 140, + "zero_std_ratio": 0.175 + }, + { + "clip_ratio/high_max": 0.13982175141572953, + "clip_ratio/high_mean": 0.018845621962100267, + "clip_ratio/low_mean": 0.0009358229042845778, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019781444873660802, + "completion_length": 79.41999969482421, + "epoch": 0.028815675727595812, + "grad_norm": 3.3437957763671875, + "kl": 1.0034890450537204, + "learning_rate": 9.982050919863332e-07, + "loss": -0.0003, + "reward": 1.332119607925415, + "reward_std": 0.4401752531528473, + "rewards/code_format_reward": 0.9674999952316284, + "rewards/code_reward": 0.4241847813129425, + "step": 150, + "zero_std_ratio": 0.2 + }, + { + "clip_ratio/high_max": 0.08667803611606359, + "clip_ratio/high_mean": 0.01204416286200285, + "clip_ratio/low_mean": 0.0012475995084969328, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013291762379230932, + "completion_length": 80.43250122070313, + "epoch": 0.0307367207761022, + "grad_norm": 3.763737678527832, + "kl": 0.9024959966540337, + "learning_rate": 9.979545325443564e-07, + "loss": -0.0043, + "reward": 1.3518987059593202, + "reward_std": 0.46767728328704833, + "rewards/code_format_reward": 0.9450000047683715, + "rewards/code_reward": 0.4396993488073349, + "step": 160, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.08449154160916805, + "clip_ratio/high_mean": 0.012011481402441859, + "clip_ratio/low_mean": 0.00172541297506541, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013736894307658076, + "completion_length": 78.61250152587891, + "epoch": 0.03265776582460859, + "grad_norm": 7.203779220581055, + "kl": 0.9198675453662872, + "learning_rate": 9.976876479296167e-07, + "loss": -0.0013, + "reward": 1.3803849458694457, + "reward_std": 0.4038102596998215, + "rewards/code_format_reward": 0.9587499976158143, + "rewards/code_reward": 0.4505049705505371, + "step": 170, + "zero_std_ratio": 0.2 + }, + { + "clip_ratio/high_max": 0.07188423536717892, + "clip_ratio/high_mean": 0.013125935778953135, + "clip_ratio/low_mean": 0.0036661239922977985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016792060085572304, + "completion_length": 77.1875, + "epoch": 0.03457881087311498, + "grad_norm": 4.487454414367676, + "kl": 1.7507148087024689, + "learning_rate": 9.974044478684084e-07, + "loss": 0.0129, + "reward": 1.3845421075820923, + "reward_std": 0.5211645245552063, + "rewards/code_format_reward": 0.9325000047683716, + "rewards/code_reward": 0.4591460168361664, + "step": 180, + "zero_std_ratio": 0.175 + }, + { + "clip_ratio/high_max": 0.03638382372446358, + "clip_ratio/high_mean": 0.005234482995001599, + "clip_ratio/low_mean": 0.0020637288223952057, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007298211794113741, + "completion_length": 72.73499984741211, + "epoch": 0.03649985592162136, + "grad_norm": 1.9644516706466675, + "kl": 1.5947209149599075, + "learning_rate": 9.97104942681622e-07, + "loss": -0.0015, + "reward": 1.5394827842712402, + "reward_std": 0.42243914008140565, + "rewards/code_format_reward": 0.9625, + "rewards/code_reward": 0.5291163563728333, + "step": 190, + "zero_std_ratio": 0.225 + }, + { + "clip_ratio/high_max": 0.2028519107028842, + "clip_ratio/high_mean": 0.031164265819825232, + "clip_ratio/low_mean": 0.00270410452503711, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03386837020516396, + "completion_length": 69.81000061035157, + "epoch": 0.03842090097012775, + "grad_norm": 3.170088052749634, + "kl": 1.0117133632302284, + "learning_rate": 9.9678914328437e-07, + "loss": 0.0113, + "reward": 1.4108091354370118, + "reward_std": 0.43394198417663576, + "rewards/code_format_reward": 0.9675000071525574, + "rewards/code_reward": 0.46352959871292115, + "step": 200, + "zero_std_ratio": 0.225 + }, + { + "clip_ratio/high_max": 0.053923821565695106, + "clip_ratio/high_mean": 0.009883182743215002, + "clip_ratio/low_mean": 0.0038779765891376883, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013761159335263073, + "completion_length": 69.04750213623046, + "epoch": 0.04034194601863414, + "grad_norm": 2.5729293823242188, + "kl": 1.1861489608883857, + "learning_rate": 9.964570611855874e-07, + "loss": -0.007, + "reward": 1.4398113250732423, + "reward_std": 0.39351261258125303, + "rewards/code_format_reward": 0.9650000095367431, + "rewards/code_reward": 0.47865564227104185, + "step": 210, + "zero_std_ratio": 0.3 + }, + { + "clip_ratio/high_max": 0.1575187448877841, + "clip_ratio/high_mean": 0.020484526228392495, + "clip_ratio/low_mean": 0.011988240911159664, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03247276756446808, + "completion_length": 61.67000122070313, + "epoch": 0.04226299106714052, + "grad_norm": 9.919574737548828, + "kl": 3.983895111083984, + "learning_rate": 9.961087084876135e-07, + "loss": 0.0076, + "reward": 1.2202381372451783, + "reward_std": 0.26475468575954436, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.36793155074119566, + "step": 220, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.141914052516222, + "clip_ratio/high_mean": 0.023408634401857854, + "clip_ratio/low_mean": 0.004240041392040439, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027648675863747484, + "completion_length": 67.37250213623047, + "epoch": 0.04418403611564691, + "grad_norm": 106.46134185791016, + "kl": 2.121386268734932, + "learning_rate": 9.957440978857498e-07, + "loss": -0.0021, + "reward": 1.3681801557540894, + "reward_std": 0.37111111879348757, + "rewards/code_format_reward": 0.9675000071525574, + "rewards/code_reward": 0.4422150731086731, + "step": 230, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.07315623210743069, + "clip_ratio/high_mean": 0.01155225959373638, + "clip_ratio/low_mean": 0.005734288269013632, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01728654802427627, + "completion_length": 72.65750198364258, + "epoch": 0.0461050811641533, + "grad_norm": 3.1017534732818604, + "kl": 0.882834991812706, + "learning_rate": 9.953632426677983e-07, + "loss": -0.0093, + "reward": 1.484795618057251, + "reward_std": 0.4526777356863022, + "rewards/code_format_reward": 0.9662500023841858, + "rewards/code_reward": 0.5008352994918823, + "step": 240, + "zero_std_ratio": 0.2 + }, + { + "clip_ratio/high_max": 0.06419091664720326, + "clip_ratio/high_mean": 0.008793376400717534, + "clip_ratio/low_mean": 0.0021902987034991385, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010983675080933609, + "completion_length": 88.21500091552734, + "epoch": 0.048026126212659684, + "grad_norm": 5.3243279457092285, + "kl": 2.7226425796747207, + "learning_rate": 9.94966156713577e-07, + "loss": -0.0127, + "reward": 1.455380654335022, + "reward_std": 0.4675000965595245, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.4842528164386749, + "step": 250, + "zero_std_ratio": 0.2 + }, + { + "clip_ratio/high_max": 0.0741606397787109, + "clip_ratio/high_mean": 0.012071207936969586, + "clip_ratio/low_mean": 0.003062122967094183, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015133331064134836, + "completion_length": 88.98250122070313, + "epoch": 0.04994717126116607, + "grad_norm": 2.7804369926452637, + "kl": 0.6169008180499077, + "learning_rate": 9.94552854494413e-07, + "loss": 0.0033, + "reward": 1.427869987487793, + "reward_std": 0.4851543098688126, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.4717474699020386, + "step": 260, + "zero_std_ratio": 0.1 + }, + { + "clip_ratio/high_max": 0.03937563952058554, + "clip_ratio/high_mean": 0.0065028761862777175, + "clip_ratio/low_mean": 0.004409579199273139, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01091245551360771, + "completion_length": 87.42750091552735, + "epoch": 0.05186821630967246, + "grad_norm": 6.559643745422363, + "kl": 0.4433484449982643, + "learning_rate": 9.941233510726168e-07, + "loss": -0.0018, + "reward": 1.4182387351989747, + "reward_std": 0.4612067699432373, + "rewards/code_format_reward": 0.9412499904632569, + "rewards/code_reward": 0.4738068819046021, + "step": 270, + "zero_std_ratio": 0.175 + }, + { + "clip_ratio/high_max": 0.057588514033705, + "clip_ratio/high_mean": 0.008462971181143076, + "clip_ratio/low_mean": 0.007865038787713274, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016328010114375503, + "completion_length": 79.69500122070312, + "epoch": 0.05378926135817885, + "grad_norm": 6.077131271362305, + "kl": 0.6961165189743042, + "learning_rate": 9.936776621009322e-07, + "loss": 0.0038, + "reward": 1.5715951919555664, + "reward_std": 0.4179812580347061, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5420475661754608, + "step": 280, + "zero_std_ratio": 0.2 + }, + { + "clip_ratio/high_max": 0.025062982086092235, + "clip_ratio/high_mean": 0.004761367203900591, + "clip_ratio/low_mean": 0.0028623046877328307, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007623671973124147, + "completion_length": 83.21750183105469, + "epoch": 0.055710306406685235, + "grad_norm": 6.066061019897461, + "kl": 0.7484225794672966, + "learning_rate": 9.932158038219662e-07, + "loss": -0.0052, + "reward": 1.1587857127189636, + "reward_std": 0.39943512678146365, + "rewards/code_format_reward": 0.9637500047683716, + "rewards/code_reward": 0.3384553253650665, + "step": 290, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.10117955654859542, + "clip_ratio/high_mean": 0.013649052195250987, + "clip_ratio/low_mean": 0.0008885912131518126, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014537643361836671, + "completion_length": 84.00750122070312, + "epoch": 0.057631351455191625, + "grad_norm": 3.23887038230896, + "kl": 0.8064253896474838, + "learning_rate": 9.92737793067597e-07, + "loss": -0.0034, + "reward": 1.3393104553222657, + "reward_std": 0.4101540923118591, + "rewards/code_format_reward": 0.9549999952316284, + "rewards/code_reward": 0.43090522289276123, + "step": 300, + "zero_std_ratio": 0.15 + }, + { + "clip_ratio/high_max": 0.04703736044466496, + "clip_ratio/high_mean": 0.007716302154585719, + "clip_ratio/low_mean": 0.0006432932626921683, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008359595513320528, + "completion_length": 77.70500030517579, + "epoch": 0.059552396503698014, + "grad_norm": 3.357680320739746, + "kl": 0.6727996915578842, + "learning_rate": 9.922436472583614e-07, + "loss": 0.0013, + "reward": 1.6670202493667603, + "reward_std": 0.4320096135139465, + "rewards/code_format_reward": 0.9712500095367431, + "rewards/code_reward": 0.5906976163387299, + "step": 310, + "zero_std_ratio": 0.3 + }, + { + "clip_ratio/high_max": 0.16380154211074113, + "clip_ratio/high_mean": 0.03262772373855114, + "clip_ratio/low_mean": 0.0011754593724617735, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03380318162962794, + "completion_length": 72.78750152587891, + "epoch": 0.0614734415522044, + "grad_norm": 3.652451992034912, + "kl": 1.8953835844993592, + "learning_rate": 9.91733384402818e-07, + "loss": -0.005, + "reward": 1.4837595462799071, + "reward_std": 0.45500350296497344, + "rewards/code_format_reward": 0.9662500023841858, + "rewards/code_reward": 0.5003172576427459, + "step": 320, + "zero_std_ratio": 0.225 + }, + { + "clip_ratio/high_max": 0.034284231485798955, + "clip_ratio/high_mean": 0.005935872689587995, + "clip_ratio/low_mean": 0.000911827472737059, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0068477002554573115, + "completion_length": 74.17000274658203, + "epoch": 0.06339448660071079, + "grad_norm": 1.5065704584121704, + "kl": 0.40838020071387293, + "learning_rate": 9.912070230968928e-07, + "loss": -0.0054, + "reward": 1.3848075151443482, + "reward_std": 0.3038723856210709, + "rewards/code_format_reward": 0.9612499952316285, + "rewards/code_reward": 0.45209125280380247, + "step": 330, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.05724322898313403, + "clip_ratio/high_mean": 0.009350239217747002, + "clip_ratio/low_mean": 0.0077414238592609765, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0170916625414975, + "completion_length": 80.06500091552735, + "epoch": 0.06531553164921718, + "grad_norm": 3.77842116355896, + "kl": 0.8782595857977867, + "learning_rate": 9.906645825232008e-07, + "loss": -0.0023, + "reward": 1.294193172454834, + "reward_std": 0.3676457226276398, + "rewards/code_format_reward": 0.9549999952316284, + "rewards/code_reward": 0.4083465874195099, + "step": 340, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.10199148450046777, + "clip_ratio/high_mean": 0.018657304299995302, + "clip_ratio/low_mean": 0.004165191331412643, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022822496155276893, + "completion_length": 86.5000015258789, + "epoch": 0.06723657669772357, + "grad_norm": 3.2845616340637207, + "kl": 0.9463568836450577, + "learning_rate": 9.901060824503463e-07, + "loss": -0.0115, + "reward": 1.485135293006897, + "reward_std": 0.48840407729148866, + "rewards/code_format_reward": 0.9487499833106995, + "rewards/code_reward": 0.5053801357746124, + "step": 350, + "zero_std_ratio": 0.225 + }, + { + "clip_ratio/high_max": 0.07233364712446928, + "clip_ratio/high_mean": 0.009769158461131156, + "clip_ratio/low_mean": 0.019356250233249737, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029125408595427872, + "completion_length": 80.54000091552734, + "epoch": 0.06915762174622996, + "grad_norm": 19.32016944885254, + "kl": 1.1565445899963378, + "learning_rate": 9.89531543232204e-07, + "loss": 0.0045, + "reward": 1.3412477493286132, + "reward_std": 0.49785757064819336, + "rewards/code_format_reward": 0.9599999904632568, + "rewards/code_reward": 0.43062385320663454, + "step": 360, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.11471173651516438, + "clip_ratio/high_mean": 0.02246011425741017, + "clip_ratio/low_mean": 0.00892345790634863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031383572798222306, + "completion_length": 74.02000274658204, + "epoch": 0.07107866679473633, + "grad_norm": 2.2520923614501953, + "kl": 1.074078917503357, + "learning_rate": 9.889409858071753e-07, + "loss": -0.0059, + "reward": 1.5273491621017456, + "reward_std": 0.414175683259964, + "rewards/code_format_reward": 0.9775000095367432, + "rewards/code_reward": 0.519299578666687, + "step": 370, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.06336253914050757, + "clip_ratio/high_mean": 0.01199121386744082, + "clip_ratio/low_mean": 0.009130357182584703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021121570840477943, + "completion_length": 86.4000015258789, + "epoch": 0.07299971184324272, + "grad_norm": 4.1052961349487305, + "kl": 1.3110491752624511, + "learning_rate": 9.883344316974266e-07, + "loss": -0.0079, + "reward": 1.5908024072647096, + "reward_std": 0.47413656711578367, + "rewards/code_format_reward": 0.9600000023841858, + "rewards/code_reward": 0.555401211977005, + "step": 380, + "zero_std_ratio": 0.2 + }, + { + "clip_ratio/high_max": 0.04433182019274682, + "clip_ratio/high_mean": 0.008989717412623577, + "clip_ratio/low_mean": 0.006074265367351473, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015063982826541178, + "completion_length": 86.175, + "epoch": 0.07492075689174911, + "grad_norm": 4.5202155113220215, + "kl": 0.830048742890358, + "learning_rate": 9.877119030081048e-07, + "loss": -0.0051, + "reward": 1.492829155921936, + "reward_std": 0.3874175697565079, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.5007895469665528, + "step": 390, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.1522485612425953, + "clip_ratio/high_mean": 0.0220908185117878, + "clip_ratio/low_mean": 0.012701757764443756, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03479257607832551, + "completion_length": 78.91000137329101, + "epoch": 0.0768418019402555, + "grad_norm": 2.6146676540374756, + "kl": 0.8627120085060597, + "learning_rate": 9.870734224265308e-07, + "loss": -0.0059, + "reward": 1.5748756647109985, + "reward_std": 0.3048340857028961, + "rewards/code_format_reward": 0.987500011920929, + "rewards/code_reward": 0.5405627965927124, + "step": 400, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.16312104668468236, + "clip_ratio/high_mean": 0.025311203207820654, + "clip_ratio/low_mean": 0.008227485651150345, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03353868862614036, + "completion_length": 77.27750091552734, + "epoch": 0.07876284698876189, + "grad_norm": 1.7234841585159302, + "kl": 0.8750749856233597, + "learning_rate": 9.864190132213742e-07, + "loss": -0.0062, + "reward": 1.6338460445404053, + "reward_std": 0.3537067860364914, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.570673018693924, + "step": 410, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.0936438184697181, + "clip_ratio/high_mean": 0.014423616812564433, + "clip_ratio/low_mean": 0.010347768076462672, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024771385360509157, + "completion_length": 75.69749908447265, + "epoch": 0.08068389203726828, + "grad_norm": 2.0902154445648193, + "kl": 1.264050543308258, + "learning_rate": 9.857486992418036e-07, + "loss": 0.0048, + "reward": 1.644848608970642, + "reward_std": 0.277804034948349, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.5774242997169494, + "step": 420, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.05532362968660891, + "clip_ratio/high_mean": 0.00992250678827986, + "clip_ratio/low_mean": 0.004125738283619285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014048245223239064, + "completion_length": 69.60749969482421, + "epoch": 0.08260493708577465, + "grad_norm": 3.702075481414795, + "kl": 1.7400359451770782, + "learning_rate": 9.850625049166189e-07, + "loss": -0.0008, + "reward": 1.5316168069839478, + "reward_std": 0.275749945640564, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.5223708748817444, + "step": 430, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.15841160174459218, + "clip_ratio/high_mean": 0.02351265251636505, + "clip_ratio/low_mean": 0.010281538363778963, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.033794190967455506, + "completion_length": 74.51000061035157, + "epoch": 0.08452598213428104, + "grad_norm": 3.4808361530303955, + "kl": 1.2856003642082214, + "learning_rate": 9.8436045525336e-07, + "loss": -0.0035, + "reward": 1.5067368984222411, + "reward_std": 0.28293364942073823, + "rewards/code_format_reward": 0.9737499833106995, + "rewards/code_reward": 0.5099309325218201, + "step": 440, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.06043836465105414, + "clip_ratio/high_mean": 0.009103650611359626, + "clip_ratio/low_mean": 0.002932069695089012, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012035720515996218, + "completion_length": 76.08250122070312, + "epoch": 0.08644702718278743, + "grad_norm": 3.665134906768799, + "kl": 1.0338351279497147, + "learning_rate": 9.836425758373958e-07, + "loss": 0.0011, + "reward": 1.4822889804840087, + "reward_std": 0.18996141627430915, + "rewards/code_format_reward": 0.9674999952316284, + "rewards/code_reward": 0.49926944375038146, + "step": 450, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.21641009524464608, + "clip_ratio/high_mean": 0.03260216782800853, + "clip_ratio/low_mean": 0.007402116784942336, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.040004284400492904, + "completion_length": 73.13500213623047, + "epoch": 0.08836807223129382, + "grad_norm": 3.1982343196868896, + "kl": 0.6477661892771721, + "learning_rate": 9.829088928309923e-07, + "loss": -0.0043, + "reward": 1.7202057361602783, + "reward_std": 0.25773381292819975, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.6163528442382813, + "step": 460, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.09453173456713557, + "clip_ratio/high_mean": 0.015337946941144764, + "clip_ratio/low_mean": 0.005975433619460091, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02131338034523651, + "completion_length": 81.9000015258789, + "epoch": 0.09028911727980021, + "grad_norm": 1.441091775894165, + "kl": 0.6155861958861351, + "learning_rate": 9.82159432972358e-07, + "loss": -0.0063, + "reward": 1.4617766380310058, + "reward_std": 0.24772228300571442, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.48651331663131714, + "step": 470, + "zero_std_ratio": 0.3 + }, + { + "clip_ratio/high_max": 0.16705528497695923, + "clip_ratio/high_mean": 0.026639112271368504, + "clip_ratio/low_mean": 0.0035399875399889425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03017909936606884, + "completion_length": 77.79500274658203, + "epoch": 0.0922101623283066, + "grad_norm": 47.74139404296875, + "kl": 1.360982394218445, + "learning_rate": 9.813942235746705e-07, + "loss": 0.0034, + "reward": 1.5168325901031494, + "reward_std": 0.3997103154659271, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5149787843227387, + "step": 480, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.26956315375864504, + "clip_ratio/high_mean": 0.04211876043118536, + "clip_ratio/low_mean": 0.002336682367604226, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04445544336922467, + "completion_length": 86.21500091552734, + "epoch": 0.09413120737681299, + "grad_norm": 3.7244272232055664, + "kl": 2.59437358379364, + "learning_rate": 9.80613292525081e-07, + "loss": 0.0038, + "reward": 1.6131777048110962, + "reward_std": 0.32231712639331817, + "rewards/code_format_reward": 0.9799999833106995, + "rewards/code_reward": 0.5615888297557831, + "step": 490, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.22240130547434092, + "clip_ratio/high_mean": 0.044074146053753795, + "clip_ratio/low_mean": 0.012573283386882395, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0566474299877882, + "completion_length": 72.23500061035156, + "epoch": 0.09605225242531937, + "grad_norm": 2.852999687194824, + "kl": 1.615745335817337, + "learning_rate": 9.79816668283697e-07, + "loss": 0.0017, + "reward": 1.5203128576278686, + "reward_std": 0.3012717217206955, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.517031443119049, + "step": 500, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.15330625362694264, + "clip_ratio/high_mean": 0.02403738833963871, + "clip_ratio/low_mean": 0.004583830677438528, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028621218353509902, + "completion_length": 74.30000076293945, + "epoch": 0.09797329747382576, + "grad_norm": 2.484840154647827, + "kl": 2.1540999174118043, + "learning_rate": 9.790043798825458e-07, + "loss": 0.0073, + "reward": 1.5013367414474488, + "reward_std": 0.24206546545028687, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.508168363571167, + "step": 510, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.15383050357922912, + "clip_ratio/high_mean": 0.027125787048134953, + "clip_ratio/low_mean": 0.002593657124089077, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029719442850910126, + "completion_length": 65.0400016784668, + "epoch": 0.09989434252233215, + "grad_norm": 7.2150959968566895, + "kl": 1.1968895211815833, + "learning_rate": 9.781764569245178e-07, + "loss": -0.006, + "reward": 1.510750651359558, + "reward_std": 0.41533524394035337, + "rewards/code_format_reward": 0.9712500095367431, + "rewards/code_reward": 0.5125628054141999, + "step": 520, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.109321213606745, + "clip_ratio/high_mean": 0.018354640086181463, + "clip_ratio/low_mean": 0.011131488461978733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029486127989366652, + "completion_length": 74.52250213623047, + "epoch": 0.10181538757083854, + "grad_norm": 1.8456060886383057, + "kl": 0.7155197218060494, + "learning_rate": 9.773329295822844e-07, + "loss": 0.0073, + "reward": 1.5899319171905517, + "reward_std": 0.3179755389690399, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5512159705162049, + "step": 530, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.04905872759409249, + "clip_ratio/high_mean": 0.008021075790748, + "clip_ratio/low_mean": 0.004390958754811436, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012412034533917904, + "completion_length": 67.07500076293945, + "epoch": 0.10373643261934493, + "grad_norm": 4.641266345977783, + "kl": 0.7290919035673141, + "learning_rate": 9.764738285972015e-07, + "loss": 0.0008, + "reward": 1.300760817527771, + "reward_std": 0.3361863404512405, + "rewards/code_format_reward": 0.9537500143051147, + "rewards/code_reward": 0.4119428813457489, + "step": 540, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.1825170351192355, + "clip_ratio/high_mean": 0.027253909036517143, + "clip_ratio/low_mean": 0.0015975978298229166, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028851506439968942, + "completion_length": 73.99250030517578, + "epoch": 0.10565747766785132, + "grad_norm": 1.1770566701889038, + "kl": 1.328820213675499, + "learning_rate": 9.755991852781876e-07, + "loss": -0.0023, + "reward": 1.5671115159988402, + "reward_std": 0.34309983551502227, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5401182293891906, + "step": 550, + "zero_std_ratio": 0.3 + }, + { + "clip_ratio/high_max": 0.12550847120583059, + "clip_ratio/high_mean": 0.025771993771195413, + "clip_ratio/low_mean": 0.0035689805867150427, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029340974800288678, + "completion_length": 71.76750030517579, + "epoch": 0.1075785227163577, + "grad_norm": 0.3435879647731781, + "kl": 2.12383970618248, + "learning_rate": 9.747090315005836e-07, + "loss": 0.0024, + "reward": 1.5273173809051515, + "reward_std": 0.2889336168766022, + "rewards/code_format_reward": 0.9649999976158142, + "rewards/code_reward": 0.5224087119102478, + "step": 560, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.0834595168940723, + "clip_ratio/high_mean": 0.015262311231344939, + "clip_ratio/low_mean": 0.021645180485211312, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03690749178640544, + "completion_length": 79.53250122070312, + "epoch": 0.10949956776486408, + "grad_norm": 1.7026695013046265, + "kl": 1.6705755025148392, + "learning_rate": 9.738033997049902e-07, + "loss": 0.1708, + "reward": 1.5908133745193482, + "reward_std": 0.3691225051879883, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.5475941836833954, + "step": 570, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.18124623028561473, + "clip_ratio/high_mean": 0.02496154889231548, + "clip_ratio/low_mean": 0.020611650816863402, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04557319916784763, + "completion_length": 85.51750183105469, + "epoch": 0.11142061281337047, + "grad_norm": 18.138025283813477, + "kl": 4.237766814231873, + "learning_rate": 9.728823228960862e-07, + "loss": -0.0051, + "reward": 1.5469601631164551, + "reward_std": 0.37420718297362326, + "rewards/code_format_reward": 0.975000011920929, + "rewards/code_reward": 0.5297300696372986, + "step": 580, + "zero_std_ratio": 0.3 + }, + { + "clip_ratio/high_max": 0.014268473512493074, + "clip_ratio/high_mean": 0.0028658110386459157, + "clip_ratio/low_mean": 0.0056301898322999476, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008496000757440924, + "completion_length": 80.10750274658203, + "epoch": 0.11334165786187686, + "grad_norm": 5.16138219833374, + "kl": 0.6609396353363991, + "learning_rate": 9.71945834641426e-07, + "loss": -0.004, + "reward": 1.4476024627685546, + "reward_std": 0.3472218900918961, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.4813012361526489, + "step": 590, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.17693078136071563, + "clip_ratio/high_mean": 0.02441923434380442, + "clip_ratio/low_mean": 0.012987980741309002, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.037407214660197495, + "completion_length": 83.96500091552734, + "epoch": 0.11526270291038325, + "grad_norm": 1.7465465068817139, + "kl": 1.0383819937705994, + "learning_rate": 9.709939690702158e-07, + "loss": -0.0078, + "reward": 1.4550770282745362, + "reward_std": 0.3056318134069443, + "rewards/code_format_reward": 0.9587500095367432, + "rewards/code_reward": 0.48785099387168884, + "step": 600, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.1888352295383811, + "clip_ratio/high_mean": 0.026437551854178308, + "clip_ratio/low_mean": 0.0054486555512994524, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.031886206939816475, + "completion_length": 79.63500213623047, + "epoch": 0.11718374795888964, + "grad_norm": 5.674210548400879, + "kl": 1.2073093384504319, + "learning_rate": 9.700267608720692e-07, + "loss": -0.0021, + "reward": 1.4424492359161376, + "reward_std": 0.3397494524717331, + "rewards/code_format_reward": 0.9725000143051148, + "rewards/code_reward": 0.4780996203422546, + "step": 610, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.09671425293199717, + "clip_ratio/high_mean": 0.020163473271531986, + "clip_ratio/low_mean": 0.006395513273309917, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02655898590455763, + "completion_length": 75.22750091552734, + "epoch": 0.11910479300739603, + "grad_norm": 5.531320571899414, + "kl": 2.2407817423343657, + "learning_rate": 9.690442452957448e-07, + "loss": -0.0021, + "reward": 1.5595922470092773, + "reward_std": 0.28165863305330274, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.5351086378097534, + "step": 620, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.11810005996376276, + "clip_ratio/high_mean": 0.02500568316318095, + "clip_ratio/low_mean": 0.00357620443101041, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028581888042390348, + "completion_length": 80.09000091552734, + "epoch": 0.1210258380559024, + "grad_norm": 2.165558338165283, + "kl": 1.546025463938713, + "learning_rate": 9.680464581478594e-07, + "loss": -0.0037, + "reward": 1.51439368724823, + "reward_std": 0.3320598304271698, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.5140718221664429, + "step": 630, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.10313799739815295, + "clip_ratio/high_mean": 0.017414161982014776, + "clip_ratio/low_mean": 0.009596780824358575, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027010941854678096, + "completion_length": 76.15749969482422, + "epoch": 0.1229468831044088, + "grad_norm": 5.05511999130249, + "kl": 1.6615911841392517, + "learning_rate": 9.670334357915852e-07, + "loss": 0.0033, + "reward": 1.5930729150772094, + "reward_std": 0.3864523351192474, + "rewards/code_format_reward": 0.9662500023841858, + "rewards/code_reward": 0.554973942041397, + "step": 640, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.1653188370168209, + "clip_ratio/high_mean": 0.027094300370663404, + "clip_ratio/low_mean": 0.0033949258620850744, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03048922661691904, + "completion_length": 74.23250274658203, + "epoch": 0.12486792815291518, + "grad_norm": 1.1590094566345215, + "kl": 0.39487394616007804, + "learning_rate": 9.660052151453228e-07, + "loss": -0.006, + "reward": 1.7198987245559691, + "reward_std": 0.3215783953666687, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.613699346780777, + "step": 650, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.2655821519903839, + "clip_ratio/high_mean": 0.03813204998150468, + "clip_ratio/low_mean": 0.017123100493336096, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05525515023618936, + "completion_length": 79.31999969482422, + "epoch": 0.12678897320142157, + "grad_norm": 2.8189809322357178, + "kl": 0.9924295842647552, + "learning_rate": 9.649618336813565e-07, + "loss": -0.0022, + "reward": 1.710445189476013, + "reward_std": 0.2906018912792206, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.6111600875854493, + "step": 660, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.10447313897311687, + "clip_ratio/high_mean": 0.017084641277324408, + "clip_ratio/low_mean": 0.018559307692339645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03564394909190014, + "completion_length": 73.31750183105468, + "epoch": 0.12871001824992795, + "grad_norm": 7.561813831329346, + "kl": 1.0190230280160903, + "learning_rate": 9.639033294244894e-07, + "loss": -0.0059, + "reward": 1.4508479833602905, + "reward_std": 0.2639226779341698, + "rewards/code_format_reward": 0.9724999904632569, + "rewards/code_reward": 0.4822989523410797, + "step": 670, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.17416613902896644, + "clip_ratio/high_mean": 0.02931727101095021, + "clip_ratio/low_mean": 0.013709834642941131, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04302710462361574, + "completion_length": 75.30500183105468, + "epoch": 0.13063106329843435, + "grad_norm": 4.0138373374938965, + "kl": 1.8731355726718903, + "learning_rate": 9.628297409506558e-07, + "loss": 0.0038, + "reward": 1.5990655183792115, + "reward_std": 0.38845544308423996, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.5554702281951904, + "step": 680, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.14133978222962468, + "clip_ratio/high_mean": 0.025468734742025843, + "clip_ratio/low_mean": 0.0034107466402929277, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028879481457988732, + "completion_length": 71.69250183105468, + "epoch": 0.13255210834694073, + "grad_norm": 2.7108314037323, + "kl": 1.0770379617810248, + "learning_rate": 9.61741107385517e-07, + "loss": 0.0015, + "reward": 1.357295000553131, + "reward_std": 0.16353759765625, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.43333501517772677, + "step": 690, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.2215075224637985, + "clip_ratio/high_mean": 0.03973329542204738, + "clip_ratio/low_mean": 0.021483630378497764, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06121692657470703, + "completion_length": 77.00250244140625, + "epoch": 0.13447315339544713, + "grad_norm": 3.874828338623047, + "kl": 1.798163938522339, + "learning_rate": 9.606374684030354e-07, + "loss": -0.0002, + "reward": 1.4897700071334838, + "reward_std": 0.3036611869931221, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.5023849844932556, + "step": 700, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.26057110670953987, + "clip_ratio/high_mean": 0.04422192363999784, + "clip_ratio/low_mean": 0.012507367390207946, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05672929054126143, + "completion_length": 68.01749954223632, + "epoch": 0.1363941984439535, + "grad_norm": 1.9008493423461914, + "kl": 1.1601522982120513, + "learning_rate": 9.595188642240268e-07, + "loss": -0.006, + "reward": 1.5408167839050293, + "reward_std": 0.23992418646812438, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5244708836078644, + "step": 710, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.11190514008048921, + "clip_ratio/high_mean": 0.022988432584679686, + "clip_ratio/low_mean": 0.003842631517909467, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02683106428885367, + "completion_length": 70.91749954223633, + "epoch": 0.1383152434924599, + "grad_norm": 2.230220317840576, + "kl": 0.6176944851875306, + "learning_rate": 9.58385335614697e-07, + "loss": -0.0038, + "reward": 1.474353313446045, + "reward_std": 0.22789922058582307, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.49092662930488584, + "step": 720, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.22790296860039233, + "clip_ratio/high_mean": 0.043722260277718306, + "clip_ratio/low_mean": 0.005503303511068225, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0492255637422204, + "completion_length": 70.33000183105469, + "epoch": 0.1402362885409663, + "grad_norm": 3.880234956741333, + "kl": 1.7978762328624724, + "learning_rate": 9.572369238851546e-07, + "loss": -0.01, + "reward": 1.7555195808410644, + "reward_std": 0.30654080510139464, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6311972856521606, + "step": 730, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.13005290240980685, + "clip_ratio/high_mean": 0.02253831790876575, + "clip_ratio/low_mean": 0.0076317260100040585, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030170044326223434, + "completion_length": 67.4625015258789, + "epoch": 0.14215733358947266, + "grad_norm": 31014.41015625, + "kl": 2.5802926242351534, + "learning_rate": 9.560736708879055e-07, + "loss": 4.1316, + "reward": 1.391554856300354, + "reward_std": 0.3107602626085281, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.4501524269580841, + "step": 740, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.21672796942293643, + "clip_ratio/high_mean": 0.03920850001741201, + "clip_ratio/low_mean": 0.0084746521897614, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.047683153115212915, + "completion_length": 71.03750076293946, + "epoch": 0.14407837863797907, + "grad_norm": 1.3094109296798706, + "kl": 4.56303431391716, + "learning_rate": 9.54895619016329e-07, + "loss": 0.0111, + "reward": 1.5939582109451294, + "reward_std": 0.2379148319363594, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.5547916054725647, + "step": 750, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.08126737037673593, + "clip_ratio/high_mean": 0.01269659586250782, + "clip_ratio/low_mean": 0.006480468995869159, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019177064718678593, + "completion_length": 74.09750213623047, + "epoch": 0.14599942368648544, + "grad_norm": 3.0267083644866943, + "kl": 1.5844107165932655, + "learning_rate": 9.53702811203131e-07, + "loss": 0.0048, + "reward": 1.4744285106658936, + "reward_std": 0.2754403457045555, + "rewards/code_format_reward": 0.9900000095367432, + "rewards/code_reward": 0.489714241027832, + "step": 760, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.22151243952102959, + "clip_ratio/high_mean": 0.038386100489879026, + "clip_ratio/low_mean": 0.001766498590586707, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04015259912703186, + "completion_length": 73.72750244140624, + "epoch": 0.14792046873499184, + "grad_norm": 3596482.75, + "kl": 0.6901701986789703, + "learning_rate": 9.524952909187801e-07, + "loss": 83.9443, + "reward": 1.4019340753555298, + "reward_std": 0.24908357337117196, + "rewards/code_format_reward": 0.9749999880790711, + "rewards/code_reward": 0.45721703171730044, + "step": 770, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.07684345319867134, + "clip_ratio/high_mean": 0.014277776470407844, + "clip_ratio/low_mean": 0.016169815976172685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0304475924000144, + "completion_length": 79.24250183105468, + "epoch": 0.14984151378349822, + "grad_norm": 3.468223810195923, + "kl": 0.45489892959594724, + "learning_rate": 9.512731021699245e-07, + "loss": -0.0056, + "reward": 1.580666732788086, + "reward_std": 0.41472728848457335, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5459583520889282, + "step": 780, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.10067678079940379, + "clip_ratio/high_mean": 0.013439147116150707, + "clip_ratio/low_mean": 0.023053765966324136, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03649291144683957, + "completion_length": 72.04750137329101, + "epoch": 0.15176255883200462, + "grad_norm": 13.193933486938477, + "kl": 1.6161374658346177, + "learning_rate": 9.500362894977864e-07, + "loss": 0.0007, + "reward": 1.6252036333084106, + "reward_std": 0.3433967262506485, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5666643261909485, + "step": 790, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.11107501722872257, + "clip_ratio/high_mean": 0.01587685807608068, + "clip_ratio/low_mean": 0.001843169682251755, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017720027733594178, + "completion_length": 77.34250183105469, + "epoch": 0.153683603880511, + "grad_norm": 3.4086289405822754, + "kl": 0.735039034485817, + "learning_rate": 9.487848979765399e-07, + "loss": -0.0033, + "reward": 1.7214166164398192, + "reward_std": 0.3059865742921829, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6125832796096802, + "step": 800, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.06381021924316883, + "clip_ratio/high_mean": 0.012221441417932511, + "clip_ratio/low_mean": 0.002595777277019806, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014817218482494354, + "completion_length": 78.94500045776367, + "epoch": 0.15560464892901738, + "grad_norm": 2.894174098968506, + "kl": 0.9337424471974373, + "learning_rate": 9.475189732116677e-07, + "loss": -0.0074, + "reward": 1.5309076070785523, + "reward_std": 0.36832110285758973, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.5201413094997406, + "step": 810, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.0614451477304101, + "clip_ratio/high_mean": 0.011137601570226252, + "clip_ratio/low_mean": 0.015545779425883666, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02668338119983673, + "completion_length": 80.46750030517578, + "epoch": 0.15752569397752378, + "grad_norm": 1.5945316553115845, + "kl": 1.666656306385994, + "learning_rate": 9.462385613382997e-07, + "loss": -0.0138, + "reward": 1.4196115970611571, + "reward_std": 0.3273743912577629, + "rewards/code_format_reward": 0.9625, + "rewards/code_reward": 0.4691807866096497, + "step": 820, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.0729204102884978, + "clip_ratio/high_mean": 0.011435226618777961, + "clip_ratio/low_mean": 0.0035716916667297483, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015006918273866177, + "completion_length": 83.92250061035156, + "epoch": 0.15944673902603015, + "grad_norm": 3.7898244857788086, + "kl": 3.157607713341713, + "learning_rate": 9.449437090195312e-07, + "loss": 0.6488, + "reward": 1.5506922006607056, + "reward_std": 0.3165741294622421, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.532533586025238, + "step": 830, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.22014709915965797, + "clip_ratio/high_mean": 0.030960237560793757, + "clip_ratio/low_mean": 0.008386016800068318, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03934625396504998, + "completion_length": 79.59750213623047, + "epoch": 0.16136778407453656, + "grad_norm": 3.164461851119995, + "kl": 0.48004563301801684, + "learning_rate": 9.436344634447226e-07, + "loss": 0.0002, + "reward": 1.4315959692001343, + "reward_std": 0.2676436066627502, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.4714229583740234, + "step": 840, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.20247683776542544, + "clip_ratio/high_mean": 0.040387283614836636, + "clip_ratio/low_mean": 0.0031327656004577877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04352005030959845, + "completion_length": 81.20750274658204, + "epoch": 0.16328882912304293, + "grad_norm": 3.2722160816192627, + "kl": 0.8405016213655472, + "learning_rate": 9.42310872327779e-07, + "loss": -0.0002, + "reward": 1.550826621055603, + "reward_std": 0.4091781198978424, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.5322882652282714, + "step": 850, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.061589781753718854, + "clip_ratio/high_mean": 0.011824411456473172, + "clip_ratio/low_mean": 0.011703617853345349, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023528029827866705, + "completion_length": 62.61500244140625, + "epoch": 0.1652098741715493, + "grad_norm": 0.2732953727245331, + "kl": 1.4307941138744353, + "learning_rate": 9.409729839054123e-07, + "loss": 0.0075, + "reward": 1.5864750623703003, + "reward_std": 0.2073097825050354, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5473000288009644, + "step": 860, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.1379000276327133, + "clip_ratio/high_mean": 0.02470994950272143, + "clip_ratio/low_mean": 0.004926441749557853, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029636391997337343, + "completion_length": 77.2400032043457, + "epoch": 0.1671309192200557, + "grad_norm": 3.488050699234009, + "kl": 0.9351878672838211, + "learning_rate": 9.396208469353826e-07, + "loss": -0.0059, + "reward": 1.5735363721847535, + "reward_std": 0.3392061233520508, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.5436432063579559, + "step": 870, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.07835716316476464, + "clip_ratio/high_mean": 0.014919109572656453, + "clip_ratio/low_mean": 0.006504692946327851, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021423802757635713, + "completion_length": 74.78000183105469, + "epoch": 0.1690519642685621, + "grad_norm": 5.493437767028809, + "kl": 1.060418888926506, + "learning_rate": 9.382545106947214e-07, + "loss": -0.0036, + "reward": 1.745260238647461, + "reward_std": 0.297343048453331, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.6254426181316376, + "step": 880, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.12418304020538926, + "clip_ratio/high_mean": 0.022332211420871318, + "clip_ratio/low_mean": 0.022319327194418294, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04465153906494379, + "completion_length": 84.53250122070312, + "epoch": 0.1709730093170685, + "grad_norm": 5.462327480316162, + "kl": 1.5445073664188385, + "learning_rate": 9.368740249779358e-07, + "loss": 0.0049, + "reward": 1.473905611038208, + "reward_std": 0.33463606536388396, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.49351527690887453, + "step": 890, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio/high_max": 0.08285986992996186, + "clip_ratio/high_mean": 0.015584854045300744, + "clip_ratio/low_mean": 0.002020698119304143, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017605551629094406, + "completion_length": 85.78250122070312, + "epoch": 0.17289405436557487, + "grad_norm": 3.7394657135009766, + "kl": 1.2308152213692665, + "learning_rate": 9.354794400951942e-07, + "loss": 0.0006, + "reward": 1.3064285874366761, + "reward_std": 0.3360040634870529, + "rewards/code_format_reward": 0.9787500023841857, + "rewards/code_reward": 0.40852679312229156, + "step": 900, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.06636467641219497, + "clip_ratio/high_mean": 0.01088127460097894, + "clip_ratio/low_mean": 0.005357642179296818, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01623891657218337, + "completion_length": 86.17000122070313, + "epoch": 0.17481509941408127, + "grad_norm": 3.883023977279663, + "kl": 0.5634948700666428, + "learning_rate": 9.340708068704917e-07, + "loss": -0.0132, + "reward": 1.6946633338928223, + "reward_std": 0.2633577108383179, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6004566550254822, + "step": 910, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.12004003385081888, + "clip_ratio/high_mean": 0.01987670698435977, + "clip_ratio/low_mean": 0.00857236894662492, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028449075785465537, + "completion_length": 83.18000030517578, + "epoch": 0.17673614446258765, + "grad_norm": 5.860812187194824, + "kl": 1.0160879641771317, + "learning_rate": 9.326481766397991e-07, + "loss": -0.0011, + "reward": 1.5558514595031738, + "reward_std": 0.28839708790183066, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5344882309436798, + "step": 920, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.06109805963933468, + "clip_ratio/high_mean": 0.00847023066598922, + "clip_ratio/low_mean": 0.004858631710521877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01332886223681271, + "completion_length": 85.00750122070312, + "epoch": 0.17865718951109402, + "grad_norm": 2.287473440170288, + "kl": 0.629003182053566, + "learning_rate": 9.312116012491916e-07, + "loss": -0.0155, + "reward": 1.3984088182449341, + "reward_std": 0.38690108954906466, + "rewards/code_format_reward": 0.9787500023841857, + "rewards/code_reward": 0.45451690554618834, + "step": 930, + "zero_std_ratio": 0.275 + }, + { + "clip_ratio/high_max": 0.11440350348129869, + "clip_ratio/high_mean": 0.021249773760791867, + "clip_ratio/low_mean": 0.010212704542209395, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03146247826516628, + "completion_length": 85.56500244140625, + "epoch": 0.18057823455960043, + "grad_norm": 2.5915870666503906, + "kl": 0.6908730089664459, + "learning_rate": 9.297611330529588e-07, + "loss": -0.0019, + "reward": 1.5472615003585815, + "reward_std": 0.34995803236961365, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.529568213224411, + "step": 940, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.11480946252122522, + "clip_ratio/high_mean": 0.021491143060848115, + "clip_ratio/low_mean": 0.007519157652859576, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029010300803929568, + "completion_length": 72.10000152587891, + "epoch": 0.1824992796081068, + "grad_norm": 1.5689059495925903, + "kl": 0.7929495573043823, + "learning_rate": 9.282968249116975e-07, + "loss": -0.0054, + "reward": 1.8428637742996217, + "reward_std": 0.2614489495754242, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6745568513870239, + "step": 950, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.3971266824752092, + "clip_ratio/high_mean": 0.05282264268025756, + "clip_ratio/low_mean": 0.004530514683574438, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05735315615311265, + "completion_length": 70.86750030517578, + "epoch": 0.1844203246566132, + "grad_norm": 3.4463512897491455, + "kl": 0.8312035664916039, + "learning_rate": 9.268187301903852e-07, + "loss": 0.0003, + "reward": 1.6929683208465576, + "reward_std": 0.2562918782234192, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.600546681880951, + "step": 960, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.1673737466800958, + "clip_ratio/high_mean": 0.03157579629332759, + "clip_ratio/low_mean": 0.012752554472535848, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.044328349828720096, + "completion_length": 76.92749938964843, + "epoch": 0.18634136970511958, + "grad_norm": 3.0548853874206543, + "kl": 0.6291002959012986, + "learning_rate": 9.253269027564339e-07, + "loss": -0.005, + "reward": 1.4119353413581848, + "reward_std": 0.33177118599414823, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.46065517961978913, + "step": 970, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.15495819319039583, + "clip_ratio/high_mean": 0.022329012653790413, + "clip_ratio/low_mean": 0.006486268152366392, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02881528080906719, + "completion_length": 68.58250122070312, + "epoch": 0.18826241475362598, + "grad_norm": 7.065835952758789, + "kl": 1.0375685960054397, + "learning_rate": 9.238213969777292e-07, + "loss": -0.0046, + "reward": 1.6331373691558837, + "reward_std": 0.2626490265130997, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.5703186750411987, + "step": 980, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.10045178183354438, + "clip_ratio/high_mean": 0.020599483215482904, + "clip_ratio/low_mean": 0.007835417747264728, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02843490142840892, + "completion_length": 71.47500076293946, + "epoch": 0.19018345980213236, + "grad_norm": 4.533353328704834, + "kl": 2.011890631914139, + "learning_rate": 9.223022677206474e-07, + "loss": -0.0001, + "reward": 1.7676753044128417, + "reward_std": 0.25886805951595304, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6382126212120056, + "step": 990, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.12670890614390373, + "clip_ratio/high_mean": 0.022856980562210083, + "clip_ratio/low_mean": 0.016935013599868397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.039791994355618955, + "completion_length": 70.40500106811524, + "epoch": 0.19210450485063874, + "grad_norm": 9.587749481201172, + "kl": 1.1125446915626527, + "learning_rate": 9.207695703480562e-07, + "loss": -0.0049, + "reward": 1.5464402914047242, + "reward_std": 0.30552313327789304, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5257201135158539, + "step": 1000, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.13173274043947458, + "clip_ratio/high_mean": 0.021644592471420764, + "clip_ratio/low_mean": 0.01016495683870744, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03180954959243536, + "completion_length": 81.64500122070312, + "epoch": 0.19402554989914514, + "grad_norm": 61.59896469116211, + "kl": 1.3899411320686341, + "learning_rate": 9.192233607172973e-07, + "loss": 0.0117, + "reward": 1.5586263418197632, + "reward_std": 0.32884465754032133, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5327506422996521, + "step": 1010, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.38223748579621314, + "clip_ratio/high_mean": 0.05293128285557032, + "clip_ratio/low_mean": 0.008536407171050087, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06146768992766738, + "completion_length": 75.7425033569336, + "epoch": 0.19594659494765151, + "grad_norm": 0.8699261546134949, + "kl": 2.267198386788368, + "learning_rate": 9.17663695178151e-07, + "loss": 0.0007, + "reward": 1.4393709778785706, + "reward_std": 0.19248414039611816, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.4724979490041733, + "step": 1020, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.05449964143335819, + "clip_ratio/high_mean": 0.008484689320903271, + "clip_ratio/low_mean": 0.0017167545520351268, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010201443906407804, + "completion_length": 74.80750045776367, + "epoch": 0.19786763999615792, + "grad_norm": 3.8721530437469482, + "kl": 1.034875027090311, + "learning_rate": 9.160906305707814e-07, + "loss": -0.0065, + "reward": 1.6229804277420044, + "reward_std": 0.21886643767356873, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5624276876449585, + "step": 1030, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.057783479290083054, + "clip_ratio/high_mean": 0.008794186974409968, + "clip_ratio/low_mean": 0.01261859169753734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02141277852933854, + "completion_length": 80.94500122070312, + "epoch": 0.1997886850446643, + "grad_norm": 2.0369646549224854, + "kl": 0.47016064152121545, + "learning_rate": 9.145042242236667e-07, + "loss": -0.0016, + "reward": 1.5200274467468262, + "reward_std": 0.2379522889852524, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.5147012054920197, + "step": 1040, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.05080003601033241, + "clip_ratio/high_mean": 0.0081847107532667, + "clip_ratio/low_mean": 0.003685746184783056, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011870456766337157, + "completion_length": 86.39750213623047, + "epoch": 0.2017097300931707, + "grad_norm": 1.86152184009552, + "kl": 0.9119557231664658, + "learning_rate": 9.129045339515085e-07, + "loss": -0.0025, + "reward": 1.338998556137085, + "reward_std": 0.29172809422016144, + "rewards/code_format_reward": 0.9787500023841857, + "rewards/code_reward": 0.42481178045272827, + "step": 1050, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.11181259918957949, + "clip_ratio/high_mean": 0.01702371232677251, + "clip_ratio/low_mean": 0.003983464353950694, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021007176581770183, + "completion_length": 89.0250015258789, + "epoch": 0.20363077514167707, + "grad_norm": 1.664932370185852, + "kl": 1.7415984645485878, + "learning_rate": 9.112916180531254e-07, + "loss": -0.0009, + "reward": 1.6867451906204223, + "reward_std": 0.26216842532157897, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.5971225798130035, + "step": 1060, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.1619036693125963, + "clip_ratio/high_mean": 0.02605230761691928, + "clip_ratio/low_mean": 0.011786457896232606, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03783876644447446, + "completion_length": 80.52750091552734, + "epoch": 0.20555182019018345, + "grad_norm": 3.1480722427368164, + "kl": 2.3309426337480543, + "learning_rate": 9.096655353093286e-07, + "loss": -0.0108, + "reward": 1.7797099113464356, + "reward_std": 0.3243818938732147, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6429799437522888, + "step": 1070, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.145719656907022, + "clip_ratio/high_mean": 0.02472380215767771, + "clip_ratio/low_mean": 0.01881317695369944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.043536979146301745, + "completion_length": 75.46750183105469, + "epoch": 0.20747286523868985, + "grad_norm": 4.7426347732543945, + "kl": 0.7767296731472015, + "learning_rate": 9.080263449807788e-07, + "loss": 0.0042, + "reward": 1.5128322124481202, + "reward_std": 0.26058112680912016, + "rewards/code_format_reward": 0.9662500023841858, + "rewards/code_reward": 0.514853572845459, + "step": 1080, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.04860758520662785, + "clip_ratio/high_mean": 0.00921072952914983, + "clip_ratio/low_mean": 0.013458288778201677, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022669017571024595, + "completion_length": 77.22750244140624, + "epoch": 0.20939391028719623, + "grad_norm": 2.2836835384368896, + "kl": 0.6794285923242569, + "learning_rate": 9.063741068058278e-07, + "loss": -0.0028, + "reward": 1.5665315628051757, + "reward_std": 0.23656646013259888, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5398283064365387, + "step": 1090, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.2074673067778349, + "clip_ratio/high_mean": 0.036228268034756185, + "clip_ratio/low_mean": 0.003734398238157155, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.039962667226791385, + "completion_length": 91.18000030517578, + "epoch": 0.21131495533570263, + "grad_norm": 7.916996002197266, + "kl": 1.0919141083955766, + "learning_rate": 9.0470888099834e-07, + "loss": 0.1666, + "reward": 1.68690767288208, + "reward_std": 0.32907233834266664, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.5984537959098816, + "step": 1100, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.16652454435825348, + "clip_ratio/high_mean": 0.027045656740665436, + "clip_ratio/low_mean": 0.006342244842380751, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03338790265843272, + "completion_length": 80.02000122070312, + "epoch": 0.213236000384209, + "grad_norm": 24.34583282470703, + "kl": 1.00138920545578, + "learning_rate": 9.030307282454995e-07, + "loss": -0.0023, + "reward": 1.6111816883087158, + "reward_std": 0.24880893230438234, + "rewards/code_format_reward": 0.9724999904632569, + "rewards/code_reward": 0.5624658226966858, + "step": 1110, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.18198216175660492, + "clip_ratio/high_mean": 0.02493738690391183, + "clip_ratio/low_mean": 0.004894328210502863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029831714881584048, + "completion_length": 71.95, + "epoch": 0.2151570454327154, + "grad_norm": 2.7608304023742676, + "kl": 0.971074515581131, + "learning_rate": 9.013397097055971e-07, + "loss": -0.0022, + "reward": 1.6884326457977294, + "reward_std": 0.3369467526674271, + "rewards/code_format_reward": 0.9712499856948853, + "rewards/code_reward": 0.6014038324356079, + "step": 1120, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.16543546952307225, + "clip_ratio/high_mean": 0.02493141880258918, + "clip_ratio/low_mean": 0.007064808573340997, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03199622761458158, + "completion_length": 72.50500030517578, + "epoch": 0.21707809048122179, + "grad_norm": 7.147952556610107, + "kl": 6.163409499824047, + "learning_rate": 8.996358870058017e-07, + "loss": 0.0081, + "reward": 1.5753276348114014, + "reward_std": 0.2175431028008461, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.5395387947559357, + "step": 1130, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.05989155264105648, + "clip_ratio/high_mean": 0.009021314003621227, + "clip_ratio/low_mean": 0.014251881884410978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023273196443915366, + "completion_length": 74.57750091552734, + "epoch": 0.21899913552972816, + "grad_norm": 17.58907699584961, + "kl": 0.9839092344045639, + "learning_rate": 8.979193222399154e-07, + "loss": -0.0006, + "reward": 1.570918822288513, + "reward_std": 0.27486068904399874, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5420219123363494, + "step": 1140, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.23600016683340072, + "clip_ratio/high_mean": 0.04525289600715041, + "clip_ratio/low_mean": 0.00799154011765495, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05324443739373237, + "completion_length": 71.89750061035156, + "epoch": 0.22092018057823457, + "grad_norm": 8.010896682739258, + "kl": 1.0768774889409543, + "learning_rate": 8.961900779661095e-07, + "loss": 0.0139, + "reward": 1.5848765134811402, + "reward_std": 0.21965934410691262, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5458757638931274, + "step": 1150, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.10782922431826591, + "clip_ratio/high_mean": 0.014393238560296595, + "clip_ratio/low_mean": 0.0046036563231609765, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018996895058080554, + "completion_length": 78.2925018310547, + "epoch": 0.22284122562674094, + "grad_norm": 3.7750465869903564, + "kl": 0.5210637584328651, + "learning_rate": 8.944482172046448e-07, + "loss": -0.0065, + "reward": 1.6227028608322143, + "reward_std": 0.2484603613615036, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.5660388946533204, + "step": 1160, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.13120641289278864, + "clip_ratio/high_mean": 0.019719564472325146, + "clip_ratio/low_mean": 0.00696407729992643, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026683641644194723, + "completion_length": 81.64000091552734, + "epoch": 0.22476227067524734, + "grad_norm": 1.1691230535507202, + "kl": 0.5908193171024323, + "learning_rate": 8.926938034355751e-07, + "loss": -0.0008, + "reward": 1.6598936080932618, + "reward_std": 0.3073273479938507, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.5830717980861664, + "step": 1170, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.26425148695707323, + "clip_ratio/high_mean": 0.03642228813841939, + "clip_ratio/low_mean": 0.0025068818649742752, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03892916943877935, + "completion_length": 83.06500244140625, + "epoch": 0.22668331572375372, + "grad_norm": 5.047176361083984, + "kl": 0.8601905956864357, + "learning_rate": 8.90926900596434e-07, + "loss": 0.019, + "reward": 1.6030859470367431, + "reward_std": 0.18358819633722306, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.5549804508686066, + "step": 1180, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.20188620835542678, + "clip_ratio/high_mean": 0.03365288833156228, + "clip_ratio/low_mean": 0.012162915989756584, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04581580460071564, + "completion_length": 80.93500061035157, + "epoch": 0.2286043607722601, + "grad_norm": 3.431043863296509, + "kl": 3.284740853309631, + "learning_rate": 8.891475730799039e-07, + "loss": -0.0024, + "reward": 1.719798493385315, + "reward_std": 0.2678588882088661, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.6127117216587067, + "step": 1190, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.13805483505129815, + "clip_ratio/high_mean": 0.02111883880570531, + "clip_ratio/low_mean": 0.002508872369071469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023627711273729802, + "completion_length": 87.33250274658204, + "epoch": 0.2305254058207665, + "grad_norm": 4.731442928314209, + "kl": 1.1696231275796891, + "learning_rate": 8.873558857314706e-07, + "loss": -0.0053, + "reward": 1.7580220222473144, + "reward_std": 0.28411929309368134, + "rewards/code_format_reward": 0.9900000095367432, + "rewards/code_reward": 0.6315110087394714, + "step": 1200, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.07043634681031108, + "clip_ratio/high_mean": 0.009235845855437219, + "clip_ratio/low_mean": 0.017453379271319135, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02668922524899244, + "completion_length": 86.74250030517578, + "epoch": 0.23244645086927287, + "grad_norm": 23.686250686645508, + "kl": 1.7613270074129104, + "learning_rate": 8.855519038470587e-07, + "loss": 0.91, + "reward": 1.8096629619598388, + "reward_std": 0.2700611263513565, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6592064738273621, + "step": 1210, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.1193816315382719, + "clip_ratio/high_mean": 0.01799508691765368, + "clip_ratio/low_mean": 0.0052341839407745285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023229270869342143, + "completion_length": 91.73750152587891, + "epoch": 0.23436749591777928, + "grad_norm": 5.015241622924805, + "kl": 87723751.16166303, + "learning_rate": 8.83735693170653e-07, + "loss": 178666.875, + "reward": 1.5409840583801269, + "reward_std": 0.3586106300354004, + "rewards/code_format_reward": 0.9687500119209289, + "rewards/code_reward": 0.5283045113086701, + "step": 1220, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.15788686936721205, + "clip_ratio/high_mean": 0.02180835944600403, + "clip_ratio/low_mean": 0.004957044991897419, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026765404315665365, + "completion_length": 83.70250091552734, + "epoch": 0.23628854096628565, + "grad_norm": 2.7140953540802, + "kl": 0.755669391900301, + "learning_rate": 8.81907319891902e-07, + "loss": -0.0099, + "reward": 1.8449480056762695, + "reward_std": 0.28006095588207247, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6755990028381348, + "step": 1230, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.12416752465069295, + "clip_ratio/high_mean": 0.01972346901893616, + "clip_ratio/low_mean": 0.01847981174942106, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0382032809779048, + "completion_length": 91.47000274658203, + "epoch": 0.23820958601479206, + "grad_norm": 10.781957626342773, + "kl": 1.0129390999674797, + "learning_rate": 8.800668506437059e-07, + "loss": 0.0011, + "reward": 1.6923505306243896, + "reward_std": 0.3265227422118187, + "rewards/code_format_reward": 0.9787500023841857, + "rewards/code_reward": 0.6014877319335937, + "step": 1240, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.12042212830856443, + "clip_ratio/high_mean": 0.017916655144654216, + "clip_ratio/low_mean": 0.007017276567057707, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02493393179029226, + "completion_length": 76.6675018310547, + "epoch": 0.24013063106329843, + "grad_norm": 47.773136138916016, + "kl": 1.4071896970272064, + "learning_rate": 8.782143524997882e-07, + "loss": 0.0018, + "reward": 1.6722928285598755, + "reward_std": 0.25374017357826234, + "rewards/code_format_reward": 0.9824999809265137, + "rewards/code_reward": 0.5905213832855225, + "step": 1250, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.08169625541195273, + "clip_ratio/high_mean": 0.013112110400106758, + "clip_ratio/low_mean": 0.003914138658728916, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01702624891186133, + "completion_length": 78.23750152587891, + "epoch": 0.2420516761118048, + "grad_norm": 2688.99462890625, + "kl": 9.395949372649193, + "learning_rate": 8.76349892972251e-07, + "loss": 0.1943, + "reward": 1.5601455688476562, + "reward_std": 0.3348282665014267, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5372602701187134, + "step": 1260, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.13095853393897414, + "clip_ratio/high_mean": 0.018921413994394242, + "clip_ratio/low_mean": 0.018763081403449178, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03768449563067407, + "completion_length": 76.1500015258789, + "epoch": 0.2439727211603112, + "grad_norm": 3.0777931213378906, + "kl": 1.7352074533700943, + "learning_rate": 8.744735400091154e-07, + "loss": 0.0055, + "reward": 1.633968448638916, + "reward_std": 0.23277063071727752, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.5713592231273651, + "step": 1270, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.15694143967702984, + "clip_ratio/high_mean": 0.026766782545018943, + "clip_ratio/low_mean": 0.010570818380801938, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03733760174363852, + "completion_length": 76.13500213623047, + "epoch": 0.2458937662088176, + "grad_norm": 2.8748385906219482, + "kl": 3.007472372055054, + "learning_rate": 8.725853619918444e-07, + "loss": 0.0249, + "reward": 1.4643328666687012, + "reward_std": 0.2899716466665268, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.48716638684272767, + "step": 1280, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.07734788609668612, + "clip_ratio/high_mean": 0.013521577988285571, + "clip_ratio/low_mean": 0.002974278874171432, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01649585694540292, + "completion_length": 77.6050033569336, + "epoch": 0.247814811257324, + "grad_norm": 4.51137638092041, + "kl": 0.6521440967917442, + "learning_rate": 8.706854277328507e-07, + "loss": -0.0065, + "reward": 1.663088607788086, + "reward_std": 0.29463320076465604, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.5843567848205566, + "step": 1290, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.12926983460783958, + "clip_ratio/high_mean": 0.016393666993826626, + "clip_ratio/low_mean": 0.024948839796707034, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04134250609204173, + "completion_length": 74.63750305175782, + "epoch": 0.24973585630583037, + "grad_norm": 7.019649982452393, + "kl": 0.6837658904492855, + "learning_rate": 8.687738064729902e-07, + "loss": -0.0022, + "reward": 1.6927862167358398, + "reward_std": 0.14656674191355706, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5973306179046631, + "step": 1300, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.15073961750604212, + "clip_ratio/high_mean": 0.024888798157917336, + "clip_ratio/low_mean": 0.004707413475262001, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02959621128393337, + "completion_length": 79.17500152587891, + "epoch": 0.25165690135433677, + "grad_norm": 3.9428677558898926, + "kl": 1.0088127315044404, + "learning_rate": 8.668505678790368e-07, + "loss": 0.7445, + "reward": 1.5962260961532593, + "reward_std": 0.22741070687770842, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.5528005361557007, + "step": 1310, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.0862931152805686, + "clip_ratio/high_mean": 0.016994312894530593, + "clip_ratio/low_mean": 0.0031913593309582213, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020185671979561448, + "completion_length": 79.30500183105468, + "epoch": 0.25357794640284315, + "grad_norm": 2.810743808746338, + "kl": 2.0237294919788837, + "learning_rate": 8.649157820411451e-07, + "loss": -0.0028, + "reward": 1.6300202369689942, + "reward_std": 0.2859074264764786, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5712601006031036, + "step": 1320, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.14902311654295772, + "clip_ratio/high_mean": 0.02855427504691761, + "clip_ratio/low_mean": 0.012185945303644984, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.040740220062434676, + "completion_length": 70.88000030517578, + "epoch": 0.2554989914513495, + "grad_norm": 4.68557071685791, + "kl": 1.2288852274417876, + "learning_rate": 8.629695194702949e-07, + "loss": -0.0057, + "reward": 1.4114359855651855, + "reward_std": 0.2626632884144783, + "rewards/code_format_reward": 0.9625, + "rewards/code_reward": 0.46509301066398623, + "step": 1330, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.11323303133249282, + "clip_ratio/high_mean": 0.016216285666450857, + "clip_ratio/low_mean": 0.0045135776337701826, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02072986289858818, + "completion_length": 71.99250030517578, + "epoch": 0.2574200364998559, + "grad_norm": 43.944698333740234, + "kl": 1.446278090775013, + "learning_rate": 8.610118510957221e-07, + "loss": 0.0112, + "reward": 1.5807109117507934, + "reward_std": 0.23466840982437134, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5469179153442383, + "step": 1340, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.20504833161830902, + "clip_ratio/high_mean": 0.029384778672829272, + "clip_ratio/low_mean": 0.006734570109983906, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03611934892833233, + "completion_length": 69.60750198364258, + "epoch": 0.25934108154836233, + "grad_norm": 3.4515652656555176, + "kl": 1.288391387462616, + "learning_rate": 8.59042848262334e-07, + "loss": 0.0022, + "reward": 1.7648874998092652, + "reward_std": 0.29008678793907167, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.6340062260627747, + "step": 1350, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.18540791552513838, + "clip_ratio/high_mean": 0.030647353292442857, + "clip_ratio/low_mean": 0.0048290589373209515, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03547641267068684, + "completion_length": 73.8150016784668, + "epoch": 0.2612621265968687, + "grad_norm": 24.974191665649414, + "kl": 1.361786951869726, + "learning_rate": 8.570625827281077e-07, + "loss": -0.0015, + "reward": 1.6352276086807251, + "reward_std": 0.20483867302536965, + "rewards/code_format_reward": 0.9712500095367431, + "rewards/code_reward": 0.5748012781143188, + "step": 1360, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.25138766765594484, + "clip_ratio/high_mean": 0.043486443860456345, + "clip_ratio/low_mean": 0.006613140180706978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05009958455339074, + "completion_length": 85.41999969482421, + "epoch": 0.2631831716453751, + "grad_norm": 0.2826422452926636, + "kl": 1.1484392315149308, + "learning_rate": 8.550711266614774e-07, + "loss": -0.0015, + "reward": 1.5049166679382324, + "reward_std": 0.17118329852819442, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.5090208292007447, + "step": 1370, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.10418513733893633, + "clip_ratio/high_mean": 0.017387184244580568, + "clip_ratio/low_mean": 0.006483422458404675, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023870606115087865, + "completion_length": 78.00750274658203, + "epoch": 0.26510421669388146, + "grad_norm": 0.43826720118522644, + "kl": 0.5077251173555851, + "learning_rate": 8.530685526387023e-07, + "loss": 0.0071, + "reward": 1.5417476654052735, + "reward_std": 0.2806018695235252, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5271238267421723, + "step": 1380, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.12633447310654447, + "clip_ratio/high_mean": 0.01944113611098146, + "clip_ratio/low_mean": 0.02114583211950958, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04058696813735878, + "completion_length": 69.89499969482422, + "epoch": 0.26702526174238783, + "grad_norm": 3.222648859024048, + "kl": 0.8532382689416409, + "learning_rate": 8.510549336412227e-07, + "loss": 0.2832, + "reward": 1.4325429320335388, + "reward_std": 0.23379142954945564, + "rewards/code_format_reward": 0.95625, + "rewards/code_reward": 0.47720896899700166, + "step": 1390, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.10978957340121269, + "clip_ratio/high_mean": 0.015348212420940399, + "clip_ratio/low_mean": 0.00886362442979589, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024211836606264116, + "completion_length": 74.32000198364258, + "epoch": 0.26894630679089426, + "grad_norm": 511.98333740234375, + "kl": 6.762348529696465, + "learning_rate": 8.490303430529996e-07, + "loss": 0.0097, + "reward": 1.5433219909667968, + "reward_std": 0.3002948135137558, + "rewards/code_format_reward": 0.9787500023841857, + "rewards/code_reward": 0.5269734919071197, + "step": 1400, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.021737607452087103, + "clip_ratio/high_mean": 0.004128801400656812, + "clip_ratio/low_mean": 0.008135353482794016, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012264154804870486, + "completion_length": 70.88250122070312, + "epoch": 0.27086735183940064, + "grad_norm": 4.558300018310547, + "kl": 1.0645984336733818, + "learning_rate": 8.469948546578406e-07, + "loss": -0.002, + "reward": 1.711915636062622, + "reward_std": 0.23479849100112915, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6090827941894531, + "step": 1410, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.31115832179784775, + "clip_ratio/high_mean": 0.04542893636971712, + "clip_ratio/low_mean": 0.004167796808178537, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04959673266857863, + "completion_length": 82.51750335693359, + "epoch": 0.272788396887907, + "grad_norm": 26.85635757446289, + "kl": 0.6633755072951317, + "learning_rate": 8.449485426367113e-07, + "loss": -0.0044, + "reward": 1.8086278200149537, + "reward_std": 0.25109012275934217, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6577514052391052, + "step": 1420, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.21433292645961047, + "clip_ratio/high_mean": 0.027504962938837706, + "clip_ratio/low_mean": 0.007746222103014589, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03525118476245552, + "completion_length": 69.12750015258788, + "epoch": 0.2747094419364134, + "grad_norm": 39.272727966308594, + "kl": 2.1152508199214934, + "learning_rate": 8.428914815650318e-07, + "loss": 56.6465, + "reward": 1.5950207233428955, + "reward_std": 0.25626782774925233, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.5518853664398193, + "step": 1430, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.14325670124962925, + "clip_ratio/high_mean": 0.02268084152601659, + "clip_ratio/low_mean": 0.006528474338119849, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029209316370543092, + "completion_length": 67.30000076293945, + "epoch": 0.2766304869849198, + "grad_norm": 4.287910461425781, + "kl": 1.2686308354139328, + "learning_rate": 8.408237464099576e-07, + "loss": 9.8201, + "reward": 1.6364605188369752, + "reward_std": 0.22813104093074799, + "rewards/code_format_reward": 0.9749999880790711, + "rewards/code_reward": 0.5744802415370941, + "step": 1440, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.18851536950096487, + "clip_ratio/high_mean": 0.024719347018981354, + "clip_ratio/low_mean": 0.013444452191470191, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.038163799053290856, + "completion_length": 82.13250274658203, + "epoch": 0.2785515320334262, + "grad_norm": 0.4786536991596222, + "kl": 8.468844538927078, + "learning_rate": 8.387454125276494e-07, + "loss": 0.0456, + "reward": 1.7758944988250733, + "reward_std": 0.1511917643249035, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6410722196102142, + "step": 1450, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.15984937213361264, + "clip_ratio/high_mean": 0.025054804515093565, + "clip_ratio/low_mean": 0.01257994698244147, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03763475017622113, + "completion_length": 79.66000213623047, + "epoch": 0.2804725770819326, + "grad_norm": 3.223284959793091, + "kl": 1.7015444114804268, + "learning_rate": 8.366565556605258e-07, + "loss": 0.0276, + "reward": 1.5976650953292846, + "reward_std": 0.341750779747963, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.5566450238227845, + "step": 1460, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.27182711616624144, + "clip_ratio/high_mean": 0.040798351392732, + "clip_ratio/low_mean": 0.002200227712455671, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04299857785226777, + "completion_length": 79.22250213623047, + "epoch": 0.28239362213043895, + "grad_norm": 1.4845157861709595, + "kl": 1.693036738038063, + "learning_rate": 8.345572519345031e-07, + "loss": -0.0017, + "reward": 1.7161717653274535, + "reward_std": 0.2422049015760422, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.612460857629776, + "step": 1470, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.17126517184078693, + "clip_ratio/high_mean": 0.025960111571475864, + "clip_ratio/low_mean": 0.00444280517695006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030402917158789934, + "completion_length": 83.31750183105468, + "epoch": 0.2843146671789453, + "grad_norm": 5.96829080581665, + "kl": 0.574289733916521, + "learning_rate": 8.324475778562209e-07, + "loss": -0.0061, + "reward": 1.7776363611221313, + "reward_std": 0.2358689785003662, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6428806602954864, + "step": 1480, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.2015096817165613, + "clip_ratio/high_mean": 0.03324723746627569, + "clip_ratio/low_mean": 0.00480144299363019, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.038048680778592824, + "completion_length": 73.28000106811524, + "epoch": 0.28623571222745176, + "grad_norm": 6.496949672698975, + "kl": 0.6653359919786453, + "learning_rate": 8.30327610310254e-07, + "loss": 0.0021, + "reward": 1.6191941976547242, + "reward_std": 0.31718442738056185, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.5639720797538758, + "step": 1490, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.15795932533219456, + "clip_ratio/high_mean": 0.02212390162749216, + "clip_ratio/low_mean": 0.00480329486890696, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026927196700125933, + "completion_length": 73.78000106811524, + "epoch": 0.28815675727595813, + "grad_norm": 5.75892972946167, + "kl": 0.46196936070919037, + "learning_rate": 8.281974265563108e-07, + "loss": -0.0045, + "reward": 1.7829506158828736, + "reward_std": 0.17953601479530334, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.642725282907486, + "step": 1500, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.24582542856223882, + "clip_ratio/high_mean": 0.030850262753665446, + "clip_ratio/low_mean": 0.005616182333324104, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03646644388791174, + "completion_length": 77.69500198364258, + "epoch": 0.2900778023244645, + "grad_norm": 326340576.0, + "kl": 0.605505321919918, + "learning_rate": 8.260571042264166e-07, + "loss": 8518.9961, + "reward": 1.7113344192504882, + "reward_std": 0.18693218380212784, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6087921977043151, + "step": 1510, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.22759323129430414, + "clip_ratio/high_mean": 0.03405714362161234, + "clip_ratio/low_mean": 0.0032101303557283247, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03726727362954989, + "completion_length": 75.53250122070312, + "epoch": 0.2919988473729709, + "grad_norm": 2.2893807888031006, + "kl": 0.5214515089988708, + "learning_rate": 8.23906721322086e-07, + "loss": 0.0027, + "reward": 1.6311777591705323, + "reward_std": 0.17696685791015626, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5690263509750366, + "step": 1520, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.06725322343409061, + "clip_ratio/high_mean": 0.010706762981135398, + "clip_ratio/low_mean": 0.0018884234530560206, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012595186498947442, + "completion_length": 78.90999908447266, + "epoch": 0.29391989242147726, + "grad_norm": 2.6211440563201904, + "kl": 0.5930808052420616, + "learning_rate": 8.217463562114786e-07, + "loss": -0.0035, + "reward": 1.7637510299682617, + "reward_std": 0.209340962767601, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.6365630030632019, + "step": 1530, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.06629493543878198, + "clip_ratio/high_mean": 0.012000571249518543, + "clip_ratio/low_mean": 0.010053297760896385, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022053868882358073, + "completion_length": 77.66250152587891, + "epoch": 0.2958409374699837, + "grad_norm": 0.5937472581863403, + "kl": 0.6556157968938351, + "learning_rate": 8.195760876265438e-07, + "loss": 0.0023, + "reward": 1.4144308805465697, + "reward_std": 0.12647379338741302, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.461590439081192, + "step": 1540, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.2611659773625433, + "clip_ratio/high_mean": 0.05069012229796499, + "clip_ratio/low_mean": 0.009917778367525897, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.06060790033079684, + "completion_length": 80.60250091552734, + "epoch": 0.29776198251849006, + "grad_norm": 7.297484874725342, + "kl": 2.139972834289074, + "learning_rate": 8.173959946601519e-07, + "loss": 0.0662, + "reward": 1.6416264057159424, + "reward_std": 0.3118141442537308, + "rewards/code_format_reward": 0.9749999880790711, + "rewards/code_reward": 0.5770631790161133, + "step": 1550, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.14441705606877803, + "clip_ratio/high_mean": 0.023728324193507434, + "clip_ratio/low_mean": 0.005098688977886923, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028827012795954943, + "completion_length": 77.17750244140625, + "epoch": 0.29968302756699644, + "grad_norm": 5.614815711975098, + "kl": 0.5137595549225807, + "learning_rate": 8.152061567632108e-07, + "loss": -0.0057, + "reward": 1.5097593545913697, + "reward_std": 0.29559260606765747, + "rewards/code_format_reward": 0.9575000047683716, + "rewards/code_reward": 0.5155046641826629, + "step": 1560, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.03715956890955567, + "clip_ratio/high_mean": 0.006024846772197634, + "clip_ratio/low_mean": 0.009319488028995692, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015344334836117923, + "completion_length": 76.34000091552734, + "epoch": 0.3016040726155028, + "grad_norm": 5.059381008148193, + "kl": 0.8711868159472942, + "learning_rate": 8.130066537417707e-07, + "loss": -0.0003, + "reward": 1.4149085521697997, + "reward_std": 0.19155050422996284, + "rewards/code_format_reward": 0.9749999880790711, + "rewards/code_reward": 0.463704252243042, + "step": 1570, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.0935845285654068, + "clip_ratio/high_mean": 0.013573423656634987, + "clip_ratio/low_mean": 0.00990565216197865, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023479075590148567, + "completion_length": 83.97500152587891, + "epoch": 0.30352511766400925, + "grad_norm": 2.025956869125366, + "kl": 0.9980318561196327, + "learning_rate": 8.10797565754116e-07, + "loss": -0.0041, + "reward": 1.5444376945495606, + "reward_std": 0.19510383605957032, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.525031316280365, + "step": 1580, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.11659459788352251, + "clip_ratio/high_mean": 0.016526972700376064, + "clip_ratio/low_mean": 0.0030368489184184, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019563821679912507, + "completion_length": 90.33000335693359, + "epoch": 0.3054461627125156, + "grad_norm": 4.901747703552246, + "kl": 0.6650052145123482, + "learning_rate": 8.085789733078439e-07, + "loss": 0.9063, + "reward": 1.6000897407531738, + "reward_std": 0.20618843138217927, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5556698679924011, + "step": 1590, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.1246914654970169, + "clip_ratio/high_mean": 0.018419789243489505, + "clip_ratio/low_mean": 0.0033823222620412707, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021802110970020293, + "completion_length": 82.78250122070312, + "epoch": 0.307367207761022, + "grad_norm": 16365.4453125, + "kl": 83.84930176734925, + "learning_rate": 8.063509572569303e-07, + "loss": 0.4123, + "reward": 1.8164207458496093, + "reward_std": 0.25260339230298995, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6613353252410888, + "step": 1600, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.18187000900506972, + "clip_ratio/high_mean": 0.026620355295017363, + "clip_ratio/low_mean": 0.011157544914749452, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03777789976447821, + "completion_length": 72.65250244140626, + "epoch": 0.3092882528095284, + "grad_norm": 2.8136842250823975, + "kl": 0.9565572030842304, + "learning_rate": 8.041135987987831e-07, + "loss": 0.0037, + "reward": 1.7599462985992431, + "reward_std": 0.26825075447559354, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6324730753898621, + "step": 1610, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.03404317735694349, + "clip_ratio/high_mean": 0.006068735342705622, + "clip_ratio/low_mean": 0.010824382931605214, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016893118200823665, + "completion_length": 78.07500305175782, + "epoch": 0.31120929785803475, + "grad_norm": 31.179058074951172, + "kl": 0.560398967564106, + "learning_rate": 8.018669794712835e-07, + "loss": -0.0011, + "reward": 1.5130140781402588, + "reward_std": 0.2716240629553795, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.5115070700645447, + "step": 1620, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.06218870538286865, + "clip_ratio/high_mean": 0.008549430634593591, + "clip_ratio/low_mean": 0.007052442076383158, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01560187318827957, + "completion_length": 83.56500091552735, + "epoch": 0.3131303429065412, + "grad_norm": 0.6899747252464294, + "kl": 0.7204694971442223, + "learning_rate": 7.996111811498138e-07, + "loss": 0.0031, + "reward": 1.687961721420288, + "reward_std": 0.19512347355484963, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.5958558440208435, + "step": 1630, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.17274489336414262, + "clip_ratio/high_mean": 0.021967002666497138, + "clip_ratio/low_mean": 0.009596503502689303, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03156350784411188, + "completion_length": 80.9175033569336, + "epoch": 0.31505138795504756, + "grad_norm": 2.105334758758545, + "kl": 0.8054538488388061, + "learning_rate": 7.97346286044274e-07, + "loss": -0.0058, + "reward": 1.3176400899887084, + "reward_std": 0.20478213280439378, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.41350752413272857, + "step": 1640, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.16534482885617763, + "clip_ratio/high_mean": 0.02735080250131432, + "clip_ratio/low_mean": 0.0035748321075516286, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030925634037703275, + "completion_length": 74.01250228881835, + "epoch": 0.31697243300355393, + "grad_norm": 184916.921875, + "kl": 28.671802641451357, + "learning_rate": 7.950723766960857e-07, + "loss": 5.579, + "reward": 1.6360910892486573, + "reward_std": 0.2874180316925049, + "rewards/code_format_reward": 0.9687500119209289, + "rewards/code_reward": 0.5758580267429352, + "step": 1650, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.10983106552157551, + "clip_ratio/high_mean": 0.016536441215430388, + "clip_ratio/low_mean": 0.011150279239518567, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027686719762277788, + "completion_length": 84.17750244140625, + "epoch": 0.3188934780520603, + "grad_norm": 219305424.0, + "kl": 106.82060827612877, + "learning_rate": 7.927895359751835e-07, + "loss": 5248.6121, + "reward": 1.5329812049865723, + "reward_std": 0.22349740117788314, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5221156060695649, + "step": 1660, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.13622083119116724, + "clip_ratio/high_mean": 0.01933064509066753, + "clip_ratio/low_mean": 0.005038347843219526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024368993006646633, + "completion_length": 80.39500274658204, + "epoch": 0.3208145231005667, + "grad_norm": 9.519110679626465, + "kl": 0.7214748501777649, + "learning_rate": 7.904978470769959e-07, + "loss": -0.0025, + "reward": 1.6617871284484864, + "reward_std": 0.27498180270195005, + "rewards/code_format_reward": 0.95625, + "rewards/code_reward": 0.5918310403823852, + "step": 1670, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.09761472065001726, + "clip_ratio/high_mean": 0.01911984165199101, + "clip_ratio/low_mean": 0.010301339952275158, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02942118220962584, + "completion_length": 74.54750213623046, + "epoch": 0.3227355681490731, + "grad_norm": 6.143461227416992, + "kl": 0.7205829441547393, + "learning_rate": 7.881973935194124e-07, + "loss": 0.0015, + "reward": 1.4262179613113404, + "reward_std": 0.26740061640739443, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.4696714758872986, + "step": 1680, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.07396706650033594, + "clip_ratio/high_mean": 0.011737752065528184, + "clip_ratio/low_mean": 0.005250315659213811, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016988067945931107, + "completion_length": 75.27500228881836, + "epoch": 0.3246566131975795, + "grad_norm": 2.337491989135742, + "kl": 68.4789316162467, + "learning_rate": 7.858882591397403e-07, + "loss": 0.3045, + "reward": 1.527750849723816, + "reward_std": 0.26877219378948214, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5163754165172577, + "step": 1690, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.28693441725336016, + "clip_ratio/high_mean": 0.04205623795860447, + "clip_ratio/low_mean": 0.009473194915335626, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.051529432012466715, + "completion_length": 84.14500274658204, + "epoch": 0.32657765824608587, + "grad_norm": 20.964569091796875, + "kl": 0.5620399042963982, + "learning_rate": 7.835705280916488e-07, + "loss": -0.0051, + "reward": 1.615627408027649, + "reward_std": 0.2002291887998581, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.5590636849403381, + "step": 1700, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.2242162274196744, + "clip_ratio/high_mean": 0.036464582500047985, + "clip_ratio/low_mean": 0.010222097241785378, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046686679660342636, + "completion_length": 78.56000061035157, + "epoch": 0.32849870329459224, + "grad_norm": 3.2044875621795654, + "kl": 0.7747909784317016, + "learning_rate": 7.812442848421032e-07, + "loss": -0.0006, + "reward": 1.6169416427612304, + "reward_std": 0.24999960064888, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.5612833142280579, + "step": 1710, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.10125078996643425, + "clip_ratio/high_mean": 0.019883562461473048, + "clip_ratio/low_mean": 0.014126901775307487, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.034010464209131896, + "completion_length": 73.05000152587891, + "epoch": 0.3304197483430986, + "grad_norm": 735.9865112304688, + "kl": 2.3181345582008364, + "learning_rate": 7.789096141682851e-07, + "loss": 0.1213, + "reward": 1.371981406211853, + "reward_std": 0.17790164202451705, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.44317818284034727, + "step": 1720, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.15698121464811265, + "clip_ratio/high_mean": 0.026607585436431692, + "clip_ratio/low_mean": 0.004372719774255529, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030980303999967873, + "completion_length": 78.5425033569336, + "epoch": 0.33234079339160505, + "grad_norm": 2.3281009197235107, + "kl": 1.7815167903900146, + "learning_rate": 7.765666011545045e-07, + "loss": 0.4359, + "reward": 1.669968068599701, + "reward_std": 0.18121034651994705, + "rewards/code_format_reward": 0.9737499833106995, + "rewards/code_reward": 0.5915465235710144, + "step": 1730, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.1189429596066475, + "clip_ratio/high_mean": 0.021151045989245176, + "clip_ratio/low_mean": 0.002452358941081911, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023603405337780714, + "completion_length": 69.71000289916992, + "epoch": 0.3342618384401114, + "grad_norm": 1720.8326416015625, + "kl": 0.7967777937650681, + "learning_rate": 7.742153311890971e-07, + "loss": 0.0982, + "reward": 1.5440645456314086, + "reward_std": 0.18595425188541412, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5292197823524475, + "step": 1740, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.08902034647762776, + "clip_ratio/high_mean": 0.012681722827255725, + "clip_ratio/low_mean": 0.00311334275174886, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015795065369457007, + "completion_length": 74.49249954223633, + "epoch": 0.3361828834886178, + "grad_norm": 0.09847641736268997, + "kl": 0.8014414094388485, + "learning_rate": 7.718558899613143e-07, + "loss": 0.0099, + "reward": 1.5567015647888183, + "reward_std": 0.14754890371114016, + "rewards/code_format_reward": 0.9649999976158142, + "rewards/code_reward": 0.5371007978916168, + "step": 1750, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.15779653917998077, + "clip_ratio/high_mean": 0.030520046106539668, + "clip_ratio/low_mean": 0.009007267560809851, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03952731299214065, + "completion_length": 77.64000091552734, + "epoch": 0.3381039285371242, + "grad_norm": 16.5263729095459, + "kl": 0.7359155111014843, + "learning_rate": 7.69488363458199e-07, + "loss": -0.0085, + "reward": 1.477712869644165, + "reward_std": 0.26145162880420686, + "rewards/code_format_reward": 0.993749988079071, + "rewards/code_reward": 0.49041891694068906, + "step": 1760, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.17542534926906228, + "clip_ratio/high_mean": 0.025472976046148687, + "clip_ratio/low_mean": 0.005083448148798198, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030556425044778734, + "completion_length": 78.76000061035157, + "epoch": 0.3400249735856306, + "grad_norm": 2.440377950668335, + "kl": 1.2570879265666008, + "learning_rate": 7.671128379614524e-07, + "loss": -0.0029, + "reward": 1.697490382194519, + "reward_std": 0.21552397906780243, + "rewards/code_format_reward": 0.9887499809265137, + "rewards/code_reward": 0.6015576839447021, + "step": 1770, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.03777246242389083, + "clip_ratio/high_mean": 0.005805602658074349, + "clip_ratio/low_mean": 0.006219673785381019, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012025276734493672, + "completion_length": 78.01500091552734, + "epoch": 0.341946018634137, + "grad_norm": 3.58803129196167, + "kl": 1.3505164757370949, + "learning_rate": 7.647294000442899e-07, + "loss": -0.0008, + "reward": 1.3937680006027222, + "reward_std": 0.1832626909017563, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.44907149076461794, + "step": 1780, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.08561003021895885, + "clip_ratio/high_mean": 0.011109948102966883, + "clip_ratio/low_mean": 0.0035756964149186387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014685644480050542, + "completion_length": 76.20749969482422, + "epoch": 0.34386706368264336, + "grad_norm": 10.503286361694336, + "kl": 0.552098847925663, + "learning_rate": 7.623381365682855e-07, + "loss": -0.0015, + "reward": 1.6644479036331177, + "reward_std": 0.22849067896604539, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5847239375114441, + "step": 1790, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.06243175007402897, + "clip_ratio/high_mean": 0.009089326043613255, + "clip_ratio/low_mean": 0.005161185140605084, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014250511419959366, + "completion_length": 69.70000076293945, + "epoch": 0.34578810873114973, + "grad_norm": 4.685351371765137, + "kl": 0.3103115826845169, + "learning_rate": 7.599391346802063e-07, + "loss": -0.0003, + "reward": 1.8390909910202027, + "reward_std": 0.20207120031118392, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6726704835891724, + "step": 1800, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.046840774989686904, + "clip_ratio/high_mean": 0.007519985581166111, + "clip_ratio/low_mean": 0.004676173024927266, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01219615869631525, + "completion_length": 80.15500183105469, + "epoch": 0.3477091537796561, + "grad_norm": 21886460.0, + "kl": 0.48781016543507577, + "learning_rate": 7.575324818088367e-07, + "loss": 517.7405, + "reward": 1.6558839797973632, + "reward_std": 0.2796541228890419, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5845044732093811, + "step": 1810, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.18512438922189176, + "clip_ratio/high_mean": 0.0357341198658105, + "clip_ratio/low_mean": 0.0033004880184307694, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03903460723813623, + "completion_length": 78.84000091552734, + "epoch": 0.34963019882816254, + "grad_norm": 9.198795318603516, + "kl": 4.244446061551571, + "learning_rate": 7.551182656617924e-07, + "loss": 0.0031, + "reward": 1.5848650455474853, + "reward_std": 0.17606763169169426, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.5458700299263001, + "step": 1820, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.07551750033162534, + "clip_ratio/high_mean": 0.013169253122759983, + "clip_ratio/low_mean": 0.001537335959437769, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014706589409615844, + "completion_length": 82.8800033569336, + "epoch": 0.3515512438766689, + "grad_norm": 0.724766731262207, + "kl": 0.9274087265133858, + "learning_rate": 7.526965742223234e-07, + "loss": 0.0013, + "reward": 1.5606717586517334, + "reward_std": 0.2877893716096878, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5343983888626098, + "step": 1830, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.1388332260772586, + "clip_ratio/high_mean": 0.021653852658346295, + "clip_ratio/low_mean": 0.008576209528837354, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03023006208240986, + "completion_length": 74.26250076293945, + "epoch": 0.3534722889251753, + "grad_norm": 5.426670074462891, + "kl": 0.7045004338026046, + "learning_rate": 7.502674957461079e-07, + "loss": -0.007, + "reward": 1.5688656568527222, + "reward_std": 0.30554552264511586, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5384953856468201, + "step": 1840, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.07899991576559842, + "clip_ratio/high_mean": 0.013301478006178513, + "clip_ratio/low_mean": 0.01124582380289212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024547301628626884, + "completion_length": 74.45500106811524, + "epoch": 0.35539333397368167, + "grad_norm": 2.5104761123657227, + "kl": 0.6198086604475975, + "learning_rate": 7.478311187580363e-07, + "loss": -0.0071, + "reward": 1.5550098896026612, + "reward_std": 0.21109988391399384, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.52937992811203, + "step": 1850, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.063601403683424, + "clip_ratio/high_mean": 0.010639100335538387, + "clip_ratio/low_mean": 0.00778028266504407, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018419382628053427, + "completion_length": 71.92500152587891, + "epoch": 0.35731437902218804, + "grad_norm": 3.805928945541382, + "kl": 1.6179959252476692, + "learning_rate": 7.453875320489842e-07, + "loss": 0.3, + "reward": 1.4410953760147094, + "reward_std": 0.19501519501209258, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.47523519992828367, + "step": 1860, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.10860318611375988, + "clip_ratio/high_mean": 0.018746975070098416, + "clip_ratio/low_mean": 0.008747255423804745, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02749423016794026, + "completion_length": 69.9375015258789, + "epoch": 0.3592354240706945, + "grad_norm": 2.388782501220703, + "kl": 0.5952992506325245, + "learning_rate": 7.429368246725772e-07, + "loss": 0.0443, + "reward": 1.6972971916198731, + "reward_std": 0.17401356399059295, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.6008361041545868, + "step": 1870, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.08630747124552726, + "clip_ratio/high_mean": 0.012746809562668205, + "clip_ratio/low_mean": 0.010304910433478653, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02305172006599605, + "completion_length": 70.83000183105469, + "epoch": 0.36115646911920085, + "grad_norm": 16.255178451538086, + "kl": 0.8730347856879235, + "learning_rate": 7.40479085941945e-07, + "loss": 0.0036, + "reward": 1.467816424369812, + "reward_std": 0.17535984218120576, + "rewards/code_format_reward": 0.9925000071525574, + "rewards/code_reward": 0.48578319549560545, + "step": 1880, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.26251301234588026, + "clip_ratio/high_mean": 0.03827818045392632, + "clip_ratio/low_mean": 0.005526873719645664, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04380505495937541, + "completion_length": 64.74750213623047, + "epoch": 0.3630775141677072, + "grad_norm": 4.061140060424805, + "kl": 0.8530658036470413, + "learning_rate": 7.380144054264669e-07, + "loss": 0.0197, + "reward": 1.498781108856201, + "reward_std": 0.17463037073612214, + "rewards/code_format_reward": 0.9600000023841858, + "rewards/code_reward": 0.509390527009964, + "step": 1890, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.24850870491936802, + "clip_ratio/high_mean": 0.04144583061570302, + "clip_ratio/low_mean": 0.00702623330289498, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04847206423291937, + "completion_length": 75.8375015258789, + "epoch": 0.3649985592162136, + "grad_norm": 3.4472062587738037, + "kl": 1.6324397973716258, + "learning_rate": 7.355428729485071e-07, + "loss": -0.001, + "reward": 1.6619214057922362, + "reward_std": 0.18103656098246573, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.5840856909751893, + "step": 1900, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.09173821061849594, + "clip_ratio/high_mean": 0.014921509474515916, + "clip_ratio/low_mean": 0.002157307107700035, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017078816797584294, + "completion_length": 62.185000610351565, + "epoch": 0.36691960426472003, + "grad_norm": 2.0225422382354736, + "kl": 184.02759787738324, + "learning_rate": 7.330645785801417e-07, + "loss": 2.9496, + "reward": 1.7410502433776855, + "reward_std": 0.10668236091732979, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.6217751204967499, + "step": 1910, + "zero_std_ratio": 0.75 + }, + { + "clip_ratio/high_max": 0.16933906488120556, + "clip_ratio/high_mean": 0.02619449864141643, + "clip_ratio/low_mean": 0.014137339405715465, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04033183753490448, + "completion_length": 79.30000152587891, + "epoch": 0.3688406493132264, + "grad_norm": 2.6208443641662598, + "kl": 1.235317513346672, + "learning_rate": 7.305796126398758e-07, + "loss": -0.0012, + "reward": 1.5036948204040528, + "reward_std": 0.20645264089107512, + "rewards/code_format_reward": 0.9762499928474426, + "rewards/code_reward": 0.5077848553657531, + "step": 1920, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.2661599090555683, + "clip_ratio/high_mean": 0.03600101897318382, + "clip_ratio/low_mean": 0.009155643907433841, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.045156662538647654, + "completion_length": 78.10000152587891, + "epoch": 0.3707616943617328, + "grad_norm": 8.953734397888184, + "kl": 0.6204134523868561, + "learning_rate": 7.280880656893518e-07, + "loss": 0.0025, + "reward": 1.4915935516357421, + "reward_std": 0.2376121073961258, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.501109266281128, + "step": 1930, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.15203024838119744, + "clip_ratio/high_mean": 0.023713350854814054, + "clip_ratio/low_mean": 0.004282052081543952, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02799540273845196, + "completion_length": 74.42500076293945, + "epoch": 0.37268273941023916, + "grad_norm": 11.845942497253418, + "kl": 0.5031724810600281, + "learning_rate": 7.255900285300496e-07, + "loss": 0.5255, + "reward": 1.6400779724121093, + "reward_std": 0.22267285138368606, + "rewards/code_format_reward": 0.9649999856948852, + "rewards/code_reward": 0.5787889719009399, + "step": 1940, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.09135808227583767, + "clip_ratio/high_mean": 0.012801296508405358, + "clip_ratio/low_mean": 0.01690869364247192, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02970999013632536, + "completion_length": 69.52000198364257, + "epoch": 0.37460378445874554, + "grad_norm": 6.7441229820251465, + "kl": 1.2024895504117012, + "learning_rate": 7.230855921999769e-07, + "loss": 44.3651, + "reward": 1.6912511348724366, + "reward_std": 0.17418113350868225, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5981255412101746, + "step": 1950, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.07453169417567551, + "clip_ratio/high_mean": 0.009913802641676739, + "clip_ratio/low_mean": 0.003736039294744842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013649841863662004, + "completion_length": 74.01250228881835, + "epoch": 0.37652482950725197, + "grad_norm": 4.616723537445068, + "kl": 0.6156632959842682, + "learning_rate": 7.205748479703515e-07, + "loss": -0.0005, + "reward": 1.846400761604309, + "reward_std": 0.17167636156082153, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6757004141807557, + "step": 1960, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.09189570704475045, + "clip_ratio/high_mean": 0.013587052945513278, + "clip_ratio/low_mean": 0.004667519498616457, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0182545724324882, + "completion_length": 64.46750030517578, + "epoch": 0.37844587455575834, + "grad_norm": 0.17748567461967468, + "kl": 0.4286219261586666, + "learning_rate": 7.180578873422757e-07, + "loss": -0.0046, + "reward": 1.612094521522522, + "reward_std": 0.10822201184928418, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.5576097548007966, + "step": 1970, + "zero_std_ratio": 0.725 + }, + { + "clip_ratio/high_max": 0.2088342323899269, + "clip_ratio/high_mean": 0.028434151923283933, + "clip_ratio/low_mean": 0.005974846053868532, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.034408997558057305, + "completion_length": 69.26750106811524, + "epoch": 0.3803669196042647, + "grad_norm": 6.238914966583252, + "kl": 0.7256933867931366, + "learning_rate": 7.155348020434001e-07, + "loss": -0.0046, + "reward": 1.469704508781433, + "reward_std": 0.24035734832286834, + "rewards/code_format_reward": 0.9799999833106995, + "rewards/code_reward": 0.4898522675037384, + "step": 1980, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.05789460870437324, + "clip_ratio/high_mean": 0.007717460609273985, + "clip_ratio/low_mean": 0.003460834617726505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011178295104764402, + "completion_length": 70.19000244140625, + "epoch": 0.3822879646527711, + "grad_norm": 8.066108703613281, + "kl": 1.1788517452776432, + "learning_rate": 7.130056840245824e-07, + "loss": -0.0005, + "reward": 1.5026792764663697, + "reward_std": 0.2312860034406185, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5022771418094635, + "step": 1990, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.07602796133141965, + "clip_ratio/high_mean": 0.012856367122731171, + "clip_ratio/low_mean": 0.0035519548939191735, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016408322553616017, + "completion_length": 66.4625015258789, + "epoch": 0.38420900970127747, + "grad_norm": 3.559206962585449, + "kl": 1.225260878354311, + "learning_rate": 7.104706254565358e-07, + "loss": -0.003, + "reward": 1.742388916015625, + "reward_std": 0.12480423972010612, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.623069453239441, + "step": 2000, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.11271043051965535, + "clip_ratio/high_mean": 0.017727556044701488, + "clip_ratio/low_mean": 0.005613272835034877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02334082857705653, + "completion_length": 77.1650001525879, + "epoch": 0.3861300547497839, + "grad_norm": 3.4077274799346924, + "kl": 0.8489379599690438, + "learning_rate": 7.07929718726469e-07, + "loss": 0.0403, + "reward": 1.5602745056152343, + "reward_std": 0.2609230324625969, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.5338872492313385, + "step": 2010, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.2993799396790564, + "clip_ratio/high_mean": 0.041865267558023334, + "clip_ratio/low_mean": 0.006948894041124731, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04881416228599846, + "completion_length": 74.0150016784668, + "epoch": 0.3880510997982903, + "grad_norm": 3.2043685913085938, + "kl": 6.086115422844887, + "learning_rate": 7.053830564347206e-07, + "loss": 2.2989, + "reward": 1.5310536623001099, + "reward_std": 0.19302123934030532, + "rewards/code_format_reward": 0.9837500095367432, + "rewards/code_reward": 0.5195893287658692, + "step": 2020, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.06591402762569487, + "clip_ratio/high_mean": 0.009311116795288399, + "clip_ratio/low_mean": 0.0017412514251191169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01105236830189824, + "completion_length": 73.44250106811523, + "epoch": 0.38997214484679665, + "grad_norm": 2.137256622314453, + "kl": 3.9139866441488267, + "learning_rate": 7.028307313913838e-07, + "loss": 0.0061, + "reward": 1.8796703815460205, + "reward_std": 0.12868851274251938, + "rewards/code_format_reward": 0.9974999904632569, + "rewards/code_reward": 0.6904601573944091, + "step": 2030, + "zero_std_ratio": 0.775 + }, + { + "clip_ratio/high_max": 0.24738994101062417, + "clip_ratio/high_mean": 0.03705689987400547, + "clip_ratio/low_mean": 0.007423648721305654, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04448054819367826, + "completion_length": 67.09500198364258, + "epoch": 0.39189318989530303, + "grad_norm": 5.504507541656494, + "kl": 1.4878595262765884, + "learning_rate": 7.002728366129242e-07, + "loss": 0.0166, + "reward": 1.8640715599060058, + "reward_std": 0.22610510736703873, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.6870357990264893, + "step": 2040, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.09283696161583066, + "clip_ratio/high_mean": 0.014592013147193938, + "clip_ratio/low_mean": 0.0040809189551509915, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01867293259128928, + "completion_length": 72.88500137329102, + "epoch": 0.3938142349438094, + "grad_norm": 1.877032995223999, + "kl": 2.3534633785486223, + "learning_rate": 6.977094653187891e-07, + "loss": 0.3364, + "reward": 1.5182712078094482, + "reward_std": 0.19934598058462144, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5163230776786805, + "step": 2050, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.047755516692996026, + "clip_ratio/high_mean": 0.007312651420943439, + "clip_ratio/low_mean": 0.0007527987050707452, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008065450168214739, + "completion_length": 67.76500091552734, + "epoch": 0.39573527999231584, + "grad_norm": 1.7954281568527222, + "kl": 2.4329017847776413, + "learning_rate": 6.95140710928012e-07, + "loss": 206.5648, + "reward": 1.3761554956436157, + "reward_std": 0.21033956706523896, + "rewards/code_format_reward": 0.9762499928474426, + "rewards/code_reward": 0.44401525855064394, + "step": 2060, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.07075442476198077, + "clip_ratio/high_mean": 0.009443573304452002, + "clip_ratio/low_mean": 0.003901358728762716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013344932324253022, + "completion_length": 68.6150016784668, + "epoch": 0.3976563250408222, + "grad_norm": 1.3921815156936646, + "kl": 0.6283935949206352, + "learning_rate": 6.925666670558062e-07, + "loss": 1.5274, + "reward": 1.4756604433059692, + "reward_std": 0.2542987480759621, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.49189271330833434, + "step": 2070, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.09334485791623592, + "clip_ratio/high_mean": 0.015712386509403587, + "clip_ratio/low_mean": 0.005205962993204594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02091834945604205, + "completion_length": 75.06750183105468, + "epoch": 0.3995773700893286, + "grad_norm": 1.3997697830200195, + "kl": 0.5330163806676864, + "learning_rate": 6.899874275101538e-07, + "loss": -0.0031, + "reward": 1.7522424459457397, + "reward_std": 0.1803124487400055, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.6286212205886841, + "step": 2080, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.08916364926844836, + "clip_ratio/high_mean": 0.014017748599871992, + "clip_ratio/low_mean": 0.003948131998186, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01796588017605245, + "completion_length": 78.19750213623047, + "epoch": 0.40149841513783496, + "grad_norm": 2296.336669921875, + "kl": 1.0256180852651595, + "learning_rate": 6.874030862883879e-07, + "loss": 0.0318, + "reward": 1.2450440883636475, + "reward_std": 0.22890471369028093, + "rewards/code_format_reward": 0.9775000095367432, + "rewards/code_reward": 0.3781470343470573, + "step": 2090, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.22327043637633323, + "clip_ratio/high_mean": 0.04789549903944135, + "clip_ratio/low_mean": 0.00559167112223804, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.053487171232700345, + "completion_length": 70.61250152587891, + "epoch": 0.4034194601863414, + "grad_norm": 3.2615253925323486, + "kl": 8.218332803249359, + "learning_rate": 6.848137375737652e-07, + "loss": 0.0058, + "reward": 1.6430699110031128, + "reward_std": 0.21420457661151887, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.5793474376201629, + "step": 2100, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.1533806946128607, + "clip_ratio/high_mean": 0.02256658235564828, + "clip_ratio/low_mean": 0.002787484592408873, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025354066491127016, + "completion_length": 74.33999938964844, + "epoch": 0.40534050523484777, + "grad_norm": 4.315516471862793, + "kl": 1.0426696628332137, + "learning_rate": 6.822194757320354e-07, + "loss": 0.0019, + "reward": 1.6090970516204834, + "reward_std": 0.1758709292858839, + "rewards/code_format_reward": 0.993749988079071, + "rewards/code_reward": 0.5561110019683838, + "step": 2110, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.1336930485442281, + "clip_ratio/high_mean": 0.021989132883027195, + "clip_ratio/low_mean": 0.0070218192064203325, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0290109527297318, + "completion_length": 73.0250015258789, + "epoch": 0.40726155028335415, + "grad_norm": 18.143117904663086, + "kl": 0.4288759011775255, + "learning_rate": 6.796203953080007e-07, + "loss": 0.0005, + "reward": 1.72017080783844, + "reward_std": 0.22243313789367675, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6144603788852692, + "step": 2120, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.08061643750406802, + "clip_ratio/high_mean": 0.011467291257577016, + "clip_ratio/low_mean": 0.011395246715983376, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022862538695335388, + "completion_length": 68.66250152587891, + "epoch": 0.4091825953318605, + "grad_norm": 1.0005404949188232, + "kl": 0.47304695919156076, + "learning_rate": 6.770165910220709e-07, + "loss": 0.0006, + "reward": 1.4831626653671264, + "reward_std": 0.1916220799088478, + "rewards/code_format_reward": 0.9837499856948853, + "rewards/code_reward": 0.4956438183784485, + "step": 2130, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.06393485199660062, + "clip_ratio/high_mean": 0.011905963439494372, + "clip_ratio/low_mean": 0.0023792986408807336, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01428526220843196, + "completion_length": 74.32250137329102, + "epoch": 0.4111036403803669, + "grad_norm": 2.491830825805664, + "kl": 2.213325946778059, + "learning_rate": 6.744081577668115e-07, + "loss": 0.1532, + "reward": 1.7680244207382203, + "reward_std": 0.18317916095256806, + "rewards/code_format_reward": 0.9687499880790711, + "rewards/code_reward": 0.6418246865272522, + "step": 2140, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.03965856842696667, + "clip_ratio/high_mean": 0.00730013819411397, + "clip_ratio/low_mean": 0.0031650666729547083, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010465204739011824, + "completion_length": 73.1050018310547, + "epoch": 0.41302468542887333, + "grad_norm": 0.353427916765213, + "kl": 0.2898652456700802, + "learning_rate": 6.717951906034856e-07, + "loss": -0.0015, + "reward": 1.6113624095916748, + "reward_std": 0.09930019937455654, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5591186702251434, + "step": 2150, + "zero_std_ratio": 0.725 + }, + { + "clip_ratio/high_max": 0.03382167350500822, + "clip_ratio/high_mean": 0.005409902473911643, + "clip_ratio/low_mean": 0.0024156818573828785, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007825584337115287, + "completion_length": 68.12750091552735, + "epoch": 0.4149457304773797, + "grad_norm": 3.9950575828552246, + "kl": 0.789361334592104, + "learning_rate": 6.691777847585883e-07, + "loss": 0.048, + "reward": 1.5698497295379639, + "reward_std": 0.1552659712731838, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.5417998552322387, + "step": 2160, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.026565033989027143, + "clip_ratio/high_mean": 0.004212364956038073, + "clip_ratio/low_mean": 0.0013839059392921627, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005596270889509469, + "completion_length": 70.80999984741212, + "epoch": 0.4168667755258861, + "grad_norm": 1.3910998106002808, + "kl": 1.4257395297288895, + "learning_rate": 6.665560356203784e-07, + "loss": 0.8731, + "reward": 1.4512264728546143, + "reward_std": 0.14117379933595658, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.47748821377754214, + "step": 2170, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.09546168451197445, + "clip_ratio/high_mean": 0.01459201174438931, + "clip_ratio/low_mean": 0.006060798710677773, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020652810629690065, + "completion_length": 67.89000091552734, + "epoch": 0.41878782057439246, + "grad_norm": 0.6732813715934753, + "kl": 1.1321026906371117, + "learning_rate": 6.639300387353999e-07, + "loss": -0.0002, + "reward": 1.3501636981964111, + "reward_std": 0.21670444533228875, + "rewards/code_format_reward": 0.9924999833106994, + "rewards/code_reward": 0.42695685029029845, + "step": 2180, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.11407874876167626, + "clip_ratio/high_mean": 0.01725804756570142, + "clip_ratio/low_mean": 0.0015681478500482627, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018826195126166567, + "completion_length": 68.7525016784668, + "epoch": 0.42070886562289883, + "grad_norm": 1.5759879350662231, + "kl": 0.4211964398622513, + "learning_rate": 6.612998898050014e-07, + "loss": -0.0021, + "reward": 1.7485667228698731, + "reward_std": 0.16526954025030136, + "rewards/code_format_reward": 0.9612500071525574, + "rewards/code_reward": 0.6339708626270294, + "step": 2190, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.10751554854214192, + "clip_ratio/high_mean": 0.013745604571886361, + "clip_ratio/low_mean": 0.010064921525190585, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023810526612214743, + "completion_length": 62.88500137329102, + "epoch": 0.42262991067140526, + "grad_norm": 2.4066975116729736, + "kl": 0.7549011036753654, + "learning_rate": 6.586656846818477e-07, + "loss": 0.2999, + "reward": 1.6932018756866456, + "reward_std": 0.1608109436929226, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5991009473800659, + "step": 2200, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.019488381547853352, + "clip_ratio/high_mean": 0.003436583065195009, + "clip_ratio/low_mean": 0.002801175639615394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0062377589056268334, + "completion_length": 72.55250244140625, + "epoch": 0.42455095571991164, + "grad_norm": 2.0696611404418945, + "kl": 5.306586292386055, + "learning_rate": 6.56027519366427e-07, + "loss": 0.011, + "reward": 1.611876368522644, + "reward_std": 0.1603232156485319, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.5596881568431854, + "step": 2210, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.05246827639639377, + "clip_ratio/high_mean": 0.00732308179140091, + "clip_ratio/low_mean": 0.0034836977836675944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010806779406266287, + "completion_length": 64.31750183105468, + "epoch": 0.426472000768418, + "grad_norm": 0.12577353417873383, + "kl": 0.5850224502384662, + "learning_rate": 6.533854900035516e-07, + "loss": -0.0015, + "reward": 1.7735862731933594, + "reward_std": 0.13040905613452197, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6399181246757507, + "step": 2220, + "zero_std_ratio": 0.7 + }, + { + "clip_ratio/high_max": 0.24315445288084447, + "clip_ratio/high_mean": 0.031706276966724546, + "clip_ratio/low_mean": 0.011593326600268484, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04329960328759626, + "completion_length": 72.80000152587891, + "epoch": 0.4283930458169244, + "grad_norm": 4.765016078948975, + "kl": 1.5887107208371163, + "learning_rate": 6.507396928788548e-07, + "loss": 0.0023, + "reward": 1.6477301597595215, + "reward_std": 0.12887158915400504, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5801151037216187, + "step": 2230, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.044854282308369874, + "clip_ratio/high_mean": 0.007485381804872304, + "clip_ratio/low_mean": 0.0028356918206554837, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01032107327482663, + "completion_length": 66.63000183105468, + "epoch": 0.4303140908654308, + "grad_norm": 1.5923104286193848, + "kl": 0.9431760296225548, + "learning_rate": 6.480902244152813e-07, + "loss": -0.0021, + "reward": 1.4723083972930908, + "reward_std": 0.13776133116334677, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.48865418434143065, + "step": 2240, + "zero_std_ratio": 0.7 + }, + { + "clip_ratio/high_max": 0.08558401605114341, + "clip_ratio/high_mean": 0.01418596402509138, + "clip_ratio/low_mean": 0.005716345021210145, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019902308681048454, + "completion_length": 67.75250015258788, + "epoch": 0.4322351359139372, + "grad_norm": 4.213563442230225, + "kl": 0.7182839468121529, + "learning_rate": 6.454371811695732e-07, + "loss": -0.0032, + "reward": 1.5263491868972778, + "reward_std": 0.215225650370121, + "rewards/code_format_reward": 0.975000011920929, + "rewards/code_reward": 0.51942458152771, + "step": 2250, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.17924602022394537, + "clip_ratio/high_mean": 0.02314122476382181, + "clip_ratio/low_mean": 0.006780697987414897, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.029921922995708884, + "completion_length": 67.31500091552735, + "epoch": 0.43415618096244357, + "grad_norm": 2.018653392791748, + "kl": 0.644180704653263, + "learning_rate": 6.427806598287522e-07, + "loss": -0.0031, + "reward": 1.8284268617630004, + "reward_std": 0.1590463936328888, + "rewards/code_format_reward": 0.993749988079071, + "rewards/code_reward": 0.6657759308815002, + "step": 2260, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.26562999933958054, + "clip_ratio/high_mean": 0.04080731603316963, + "clip_ratio/low_mean": 0.002605196795775555, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.043412512401118875, + "completion_length": 64.91250076293946, + "epoch": 0.43607722601094995, + "grad_norm": 2.8014633655548096, + "kl": 1.4193657219409943, + "learning_rate": 6.401207572065942e-07, + "loss": 0.0075, + "reward": 1.6795406818389893, + "reward_std": 0.1340640414506197, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.5913328170776367, + "step": 2270, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.15035496577620505, + "clip_ratio/high_mean": 0.021555275144055485, + "clip_ratio/low_mean": 0.007308500797080342, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028863775311037898, + "completion_length": 83.20750122070312, + "epoch": 0.4379982710594563, + "grad_norm": 5.3116655349731445, + "kl": 1.7165004715323449, + "learning_rate": 6.374575702401019e-07, + "loss": -0.0031, + "reward": 1.694450354576111, + "reward_std": 0.2935485541820526, + "rewards/code_format_reward": 0.9650000095367431, + "rewards/code_reward": 0.6059751749038697, + "step": 2280, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.05178634703624994, + "clip_ratio/high_mean": 0.007199086344917305, + "clip_ratio/low_mean": 0.004959188599605114, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012158275028923526, + "completion_length": 68.35500106811523, + "epoch": 0.43991931610796275, + "grad_norm": 11.67419719696045, + "kl": 0.8460408747196198, + "learning_rate": 6.347911959859725e-07, + "loss": -0.0013, + "reward": 1.6080287456512452, + "reward_std": 0.2270718976855278, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.5615143775939941, + "step": 2290, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.07622572851832957, + "clip_ratio/high_mean": 0.011604995708330535, + "clip_ratio/low_mean": 0.0013341609621420503, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012939156647189521, + "completion_length": 68.30750274658203, + "epoch": 0.44184036115646913, + "grad_norm": 332.7762451171875, + "kl": 0.7540152728557586, + "learning_rate": 6.321217316170599e-07, + "loss": 0.1015, + "reward": 1.4850183725357056, + "reward_std": 0.1393202841281891, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.49469670057296755, + "step": 2300, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.16746083926409483, + "clip_ratio/high_mean": 0.02103413282893598, + "clip_ratio/low_mean": 0.0068577720652683635, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.027891904639545828, + "completion_length": 64.55000152587891, + "epoch": 0.4437614062049755, + "grad_norm": 0.36056017875671387, + "kl": 0.4329931303858757, + "learning_rate": 6.294492744188335e-07, + "loss": 0.0002, + "reward": 1.4963040232658387, + "reward_std": 0.07247132882475853, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.502214539051056, + "step": 2310, + "zero_std_ratio": 0.725 + }, + { + "clip_ratio/high_max": 0.05429213300812989, + "clip_ratio/high_mean": 0.007803994990536012, + "clip_ratio/low_mean": 0.008226435555843636, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01603043078503106, + "completion_length": 69.78750228881836, + "epoch": 0.4456824512534819, + "grad_norm": 0.1676941215991974, + "kl": 0.276796979829669, + "learning_rate": 6.267739217858329e-07, + "loss": -0.0028, + "reward": 1.7269956827163697, + "reward_std": 0.1742506742477417, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.6156853199005127, + "step": 2320, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.03961263382807374, + "clip_ratio/high_mean": 0.00831791803939268, + "clip_ratio/low_mean": 0.008615480939624831, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016933399019762874, + "completion_length": 70.70000228881835, + "epoch": 0.44760349630198826, + "grad_norm": 6.724217891693115, + "kl": 0.544577070325613, + "learning_rate": 6.240957712181186e-07, + "loss": -0.0041, + "reward": 1.3949034690856934, + "reward_std": 0.21950918734073638, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.45307670831680297, + "step": 2330, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.2168802363506984, + "clip_ratio/high_mean": 0.03705684195374488, + "clip_ratio/low_mean": 0.0028890643618069587, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03994590537622571, + "completion_length": 74.0625, + "epoch": 0.4495245413504947, + "grad_norm": 3.073554277420044, + "kl": 0.6307030320167542, + "learning_rate": 6.214149203177182e-07, + "loss": -0.0002, + "reward": 1.679004979133606, + "reward_std": 0.1860196329653263, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.5916899800300598, + "step": 2340, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.0932310588657856, + "clip_ratio/high_mean": 0.014429462677799165, + "clip_ratio/low_mean": 0.0065534046734683216, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020982867432758213, + "completion_length": 67.18000183105468, + "epoch": 0.45144558639900106, + "grad_norm": 3595.65576171875, + "kl": 1.140541896224022, + "learning_rate": 6.187314667850697e-07, + "loss": 0.1447, + "reward": 1.4676954984664916, + "reward_std": 0.20568167939782142, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.4875977456569672, + "step": 2350, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.036702672578394414, + "clip_ratio/high_mean": 0.006752843782305717, + "clip_ratio/low_mean": 0.008269340678816661, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015022184286499396, + "completion_length": 80.57000122070312, + "epoch": 0.45336663144750744, + "grad_norm": 2.759171724319458, + "kl": 10.568821829557418, + "learning_rate": 6.160455084154613e-07, + "loss": 1.8532, + "reward": 1.4545687198638917, + "reward_std": 0.23069845288991928, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.4813468337059021, + "step": 2360, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.08144733654335141, + "clip_ratio/high_mean": 0.014263840962667019, + "clip_ratio/low_mean": 0.0019872021744959056, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01625104369595647, + "completion_length": 71.55750045776367, + "epoch": 0.4552876764960138, + "grad_norm": 1.9088038206100464, + "kl": 1.3571255028247833, + "learning_rate": 6.133571430954667e-07, + "loss": 0.0026, + "reward": 1.5344175338745116, + "reward_std": 0.16607576459646226, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.5237712502479553, + "step": 2370, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.10026950519531966, + "clip_ratio/high_mean": 0.01315019663888961, + "clip_ratio/low_mean": 0.00221524270309601, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015365439187735318, + "completion_length": 72.40750122070312, + "epoch": 0.4572087215445202, + "grad_norm": 4.301158428192139, + "kl": 0.6290791854262352, + "learning_rate": 6.106664687993782e-07, + "loss": -0.0032, + "reward": 1.5749263525009156, + "reward_std": 0.16429235637187958, + "rewards/code_format_reward": 0.9724999785423278, + "rewards/code_reward": 0.5443381488323211, + "step": 2380, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.10308512919582427, + "clip_ratio/high_mean": 0.016338009486207738, + "clip_ratio/low_mean": 0.0017032683303114028, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018041277857264504, + "completion_length": 76.43750228881837, + "epoch": 0.4591297665930266, + "grad_norm": 6.198258876800537, + "kl": 408884378.2116049, + "learning_rate": 6.079735835856362e-07, + "loss": 1157747.0, + "reward": 1.5280384778976441, + "reward_std": 0.19424125757068395, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.517456728219986, + "step": 2390, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.24319633510895072, + "clip_ratio/high_mean": 0.037530579004669565, + "clip_ratio/low_mean": 0.004886501970031531, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04241708111949265, + "completion_length": 74.5425018310547, + "epoch": 0.461050811641533, + "grad_norm": 5.885474681854248, + "kl": 1.4351533338427545, + "learning_rate": 6.052785855932548e-07, + "loss": 0.123, + "reward": 1.4949720859527589, + "reward_std": 0.20392217636108398, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.5002985119819641, + "step": 2400, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.23103972100652753, + "clip_ratio/high_mean": 0.0305588347138837, + "clip_ratio/low_mean": 0.002339675696566701, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03289851036388427, + "completion_length": 70.01750106811524, + "epoch": 0.4629718566900394, + "grad_norm": 0.8806352615356445, + "kl": 1.6503019407391548, + "learning_rate": 6.025815730382463e-07, + "loss": 0.8832, + "reward": 1.6588483333587647, + "reward_std": 0.19124363958835602, + "rewards/code_format_reward": 0.9725000143051148, + "rewards/code_reward": 0.5862991452217102, + "step": 2410, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.02757756725186482, + "clip_ratio/high_mean": 0.005332520017691422, + "clip_ratio/low_mean": 0.019763218611478804, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025095738274103496, + "completion_length": 71.59250183105469, + "epoch": 0.46489290173854575, + "grad_norm": 1.2440141439437866, + "kl": 2.751401698589325, + "learning_rate": 5.998826442100412e-07, + "loss": 362174.725, + "reward": 1.5159764885902405, + "reward_std": 0.1902527991682291, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.5129882216453552, + "step": 2420, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.2530499072512612, + "clip_ratio/high_mean": 0.03376921496528666, + "clip_ratio/low_mean": 0.0062858725665137175, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.04005508716509212, + "completion_length": 76.0425018310547, + "epoch": 0.4668139467870522, + "grad_norm": 66.4449234008789, + "kl": 2164149.3861157326, + "learning_rate": 5.971818974679065e-07, + "loss": 2449736.0, + "reward": 1.6650853157043457, + "reward_std": 0.24712301939725875, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.585355132818222, + "step": 2430, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.2950001623481512, + "clip_ratio/high_mean": 0.042542998865246776, + "clip_ratio/low_mean": 0.0068845050991512835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.049427504313644025, + "completion_length": 75.27000198364257, + "epoch": 0.46873499183555856, + "grad_norm": 2.206911563873291, + "kl": 11.237105096876622, + "learning_rate": 5.944794312373607e-07, + "loss": 0.0298, + "reward": 1.7914002895355225, + "reward_std": 0.22826257348060608, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.649450159072876, + "step": 2440, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.07424517879262567, + "clip_ratio/high_mean": 0.010772422759328038, + "clip_ratio/low_mean": 0.010833968574297614, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02160639046342112, + "completion_length": 71.40500183105469, + "epoch": 0.47065603688406493, + "grad_norm": 76503500980224.0, + "kl": 393.06428125053645, + "learning_rate": 5.917753440065869e-07, + "loss": 909725593.6, + "reward": 1.4975883960723877, + "reward_std": 0.28928079828619957, + "rewards/code_format_reward": 0.9612499833106994, + "rewards/code_reward": 0.5084816813468933, + "step": 2450, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.14294563261792065, + "clip_ratio/high_mean": 0.019953654275741427, + "clip_ratio/low_mean": 0.004493102640844881, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024446757195983083, + "completion_length": 79.72250061035156, + "epoch": 0.4725770819325713, + "grad_norm": 0.778223991394043, + "kl": 2.2069298341870307, + "learning_rate": 5.89069734322844e-07, + "loss": -0.0085, + "reward": 1.5203648328781127, + "reward_std": 0.1896197520196438, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5173698782920837, + "step": 2460, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.0473501511849463, + "clip_ratio/high_mean": 0.006591684772865846, + "clip_ratio/low_mean": 0.0004718510695965961, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007063536025816575, + "completion_length": 76.70500183105469, + "epoch": 0.4744981269810777, + "grad_norm": 0.5978448390960693, + "kl": 0.6427325546741486, + "learning_rate": 5.863627007888745e-07, + "loss": 0.0007, + "reward": 1.7259918212890626, + "reward_std": 0.1515914086252451, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.618620878458023, + "step": 2470, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.048674007039517166, + "clip_ratio/high_mean": 0.010258768184576184, + "clip_ratio/low_mean": 0.012727768435433972, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022986536961980164, + "completion_length": 78.06500244140625, + "epoch": 0.4764191720295841, + "grad_norm": 4.168500900268555, + "kl": 0.5699560895562172, + "learning_rate": 5.836543420593119e-07, + "loss": -0.0011, + "reward": 1.6060274362564086, + "reward_std": 0.2864475339651108, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.557388699054718, + "step": 2480, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.014699839102104307, + "clip_ratio/high_mean": 0.0019118846452329309, + "clip_ratio/low_mean": 0.0005179177765967325, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024298023723531514, + "completion_length": 85.17000122070313, + "epoch": 0.4783402170780905, + "grad_norm": 4.149423599243164, + "kl": 1.3347756370902062, + "learning_rate": 5.809447568370843e-07, + "loss": 0.0102, + "reward": 1.621114158630371, + "reward_std": 0.21484595835208892, + "rewards/code_format_reward": 0.9774999856948853, + "rewards/code_reward": 0.5661820948123932, + "step": 2490, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.029943293944234027, + "clip_ratio/high_mean": 0.006927184848609613, + "clip_ratio/low_mean": 0.0035072380007477475, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010434422721300508, + "completion_length": 83.86250228881836, + "epoch": 0.48026126212659687, + "grad_norm": 5.97049617767334, + "kl": 4.178053397685289, + "learning_rate": 5.782340438698185e-07, + "loss": -0.0063, + "reward": 1.6789068222045898, + "reward_std": 0.25779220163822175, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5903908908367157, + "step": 2500, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.08102847400587052, + "clip_ratio/high_mean": 0.01392527524731122, + "clip_ratio/low_mean": 0.0045509199095249645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018476195022230968, + "completion_length": 83.11250152587891, + "epoch": 0.48218230717510324, + "grad_norm": 5.283038139343262, + "kl": 1.111867392808199, + "learning_rate": 5.755223019462401e-07, + "loss": 17.941, + "reward": 1.577300524711609, + "reward_std": 0.22725088596343995, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5442752420902253, + "step": 2510, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.06463829884305597, + "clip_ratio/high_mean": 0.008858744835015387, + "clip_ratio/low_mean": 0.0054666692391037944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01432541401591152, + "completion_length": 85.7625015258789, + "epoch": 0.4841033522236096, + "grad_norm": 8.200135231018066, + "kl": 0.4475974731147289, + "learning_rate": 5.728096298925745e-07, + "loss": -0.0057, + "reward": 1.5549763917922974, + "reward_std": 0.23400793820619584, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5331131994724274, + "step": 2520, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.031378739466890695, + "clip_ratio/high_mean": 0.00507326218066737, + "clip_ratio/low_mean": 0.010504274675622583, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015577536821365357, + "completion_length": 79.96750030517578, + "epoch": 0.48602439727211605, + "grad_norm": 2.6766583919525146, + "kl": 0.4622874528169632, + "learning_rate": 5.700961265689434e-07, + "loss": -0.0011, + "reward": 1.8167934179306031, + "reward_std": 0.30146218538284303, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.6621467113494873, + "step": 2530, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.07196793179027736, + "clip_ratio/high_mean": 0.013576928357360884, + "clip_ratio/low_mean": 0.0018521397636504845, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015429067742661572, + "completion_length": 87.03000030517578, + "epoch": 0.4879454423206224, + "grad_norm": 1.347899317741394, + "kl": 0.7047492057085037, + "learning_rate": 5.673818908657644e-07, + "loss": -0.0079, + "reward": 1.6893932342529296, + "reward_std": 0.24144218415021895, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.5981341004371643, + "step": 2540, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.03861275149974972, + "clip_ratio/high_mean": 0.005072360605117865, + "clip_ratio/low_mean": 0.0013027720240643248, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0063751326058991255, + "completion_length": 78.80750122070313, + "epoch": 0.4898664873691288, + "grad_norm": 1.7946380376815796, + "kl": 0.7765734851360321, + "learning_rate": 5.646670217001451e-07, + "loss": 0.004, + "reward": 1.8638887882232666, + "reward_std": 0.1732952728867531, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.6831943988800049, + "step": 2550, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.044772564456798135, + "clip_ratio/high_mean": 0.008199371959199198, + "clip_ratio/low_mean": 0.007188984929234721, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015388356836047024, + "completion_length": 92.03750305175781, + "epoch": 0.4917875324176352, + "grad_norm": 8241.9619140625, + "kl": 3.7090125039219854, + "learning_rate": 5.619516180122789e-07, + "loss": 0.2194, + "reward": 1.346347188949585, + "reward_std": 0.3114967554807663, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.4303610801696777, + "step": 2560, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.136108908476308, + "clip_ratio/high_mean": 0.01777363264700398, + "clip_ratio/low_mean": 0.0005986301795928739, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018372263037599625, + "completion_length": 77.66500091552734, + "epoch": 0.4937085774661416, + "grad_norm": 2.8724048137664795, + "kl": 0.30402788892388344, + "learning_rate": 5.592357787618398e-07, + "loss": -0.0095, + "reward": 1.235116672515869, + "reward_std": 0.16121466904878617, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.3706833332777023, + "step": 2570, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.21411730786785482, + "clip_ratio/high_mean": 0.02751181152416393, + "clip_ratio/low_mean": 0.005141469169757329, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03265328073175624, + "completion_length": 77.39250030517579, + "epoch": 0.495629622514648, + "grad_norm": 3.1119463443756104, + "kl": 0.516096468269825, + "learning_rate": 5.565196029243746e-07, + "loss": -0.0097, + "reward": 1.7056148529052735, + "reward_std": 0.26717675626277926, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.6065573751926422, + "step": 2580, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.06655758274719119, + "clip_ratio/high_mean": 0.00869752592407167, + "clip_ratio/low_mean": 0.0007154849590733647, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009413010929711163, + "completion_length": 78.71000061035156, + "epoch": 0.49755066756315436, + "grad_norm": 9.566883087158203, + "kl": 6.985853771865368, + "learning_rate": 5.538031894876971e-07, + "loss": 0.0154, + "reward": 1.8047074317932128, + "reward_std": 0.2406391829252243, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.655791187286377, + "step": 2590, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.04153572088107467, + "clip_ratio/high_mean": 0.0076960999285802245, + "clip_ratio/low_mean": 0.00053562533139484, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008231725194491446, + "completion_length": 87.35249938964844, + "epoch": 0.49947171261166073, + "grad_norm": 4.163487911224365, + "kl": 3.02228729724884, + "learning_rate": 5.510866374482799e-07, + "loss": 0.0014, + "reward": 1.7271404266357422, + "reward_std": 0.20059744864702225, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6170076906681061, + "step": 2600, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.09242721796035766, + "clip_ratio/high_mean": 0.01333312913775444, + "clip_ratio/low_mean": 0.0022988114287727512, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01563194077461958, + "completion_length": 86.66250152587891, + "epoch": 0.5013927576601671, + "grad_norm": 1.7816847562789917, + "kl": 2.135231140255928, + "learning_rate": 5.48370045807647e-07, + "loss": -0.0043, + "reward": 1.5687429666519166, + "reward_std": 0.22490316033363342, + "rewards/code_format_reward": 0.9524999976158142, + "rewards/code_reward": 0.5462464988231659, + "step": 2610, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.10432031177915632, + "clip_ratio/high_mean": 0.01704162026871927, + "clip_ratio/low_mean": 0.0019667694039526397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01900838967994787, + "completion_length": 98.58000183105469, + "epoch": 0.5033138027086735, + "grad_norm": 2.1069369316101074, + "kl": 2.131927290558815, + "learning_rate": 5.456535135687656e-07, + "loss": -0.0069, + "reward": 1.6628828048706055, + "reward_std": 0.23133169412612914, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.5858163475990296, + "step": 2620, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.04196612173691392, + "clip_ratio/high_mean": 0.0064837948535569016, + "clip_ratio/low_mean": 0.0025595034239813685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009043298207689076, + "completion_length": 86.6, + "epoch": 0.5052348477571799, + "grad_norm": 15.670801162719727, + "kl": 2.1330361180007458, + "learning_rate": 5.429371397324378e-07, + "loss": -0.0054, + "reward": 1.4884859561920165, + "reward_std": 0.3388957381248474, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.497055447101593, + "step": 2630, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.036724881688132885, + "clip_ratio/high_mean": 0.005309284973191097, + "clip_ratio/low_mean": 0.003665669827023521, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00897495478275232, + "completion_length": 84.73250122070313, + "epoch": 0.5071558928056863, + "grad_norm": 6.480928421020508, + "kl": 0.9241176024079323, + "learning_rate": 5.402210232936934e-07, + "loss": -0.0009, + "reward": 1.792254877090454, + "reward_std": 0.29597480297088624, + "rewards/code_format_reward": 0.9974999904632569, + "rewards/code_reward": 0.646752405166626, + "step": 2640, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.109601711621508, + "clip_ratio/high_mean": 0.015215938963228837, + "clip_ratio/low_mean": 0.0034228902019094675, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018638829072006046, + "completion_length": 88.82750244140625, + "epoch": 0.5090769378541927, + "grad_norm": 5.080334186553955, + "kl": 0.6404796183109284, + "learning_rate": 5.37505263238181e-07, + "loss": -0.0032, + "reward": 1.7266260623931884, + "reward_std": 0.27733459770679475, + "rewards/code_format_reward": 0.993749988079071, + "rewards/code_reward": 0.6148755311965942, + "step": 2650, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.0589832967845723, + "clip_ratio/high_mean": 0.009531341239926406, + "clip_ratio/low_mean": 0.00046608211705461143, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00999742336862255, + "completion_length": 88.69250183105468, + "epoch": 0.510997982902699, + "grad_norm": 7.949027061462402, + "kl": 0.6428510576486588, + "learning_rate": 5.347899585385619e-07, + "loss": -0.0028, + "reward": 1.8208046436309815, + "reward_std": 0.32592435777187345, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6647772669792176, + "step": 2660, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.15465617645531893, + "clip_ratio/high_mean": 0.022943795099854468, + "clip_ratio/low_mean": 0.0016213681577937678, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024565163621446118, + "completion_length": 87.13250274658203, + "epoch": 0.5129190279512055, + "grad_norm": 34.059959411621094, + "kl": 0.5654895901679993, + "learning_rate": 5.320752081509019e-07, + "loss": -0.0048, + "reward": 1.7013320207595826, + "reward_std": 0.27322621941566466, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6037909984588623, + "step": 2670, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.04982744911685586, + "clip_ratio/high_mean": 0.007483145385049283, + "clip_ratio/low_mean": 0.0010201202865573577, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008503265725448728, + "completion_length": 91.09750213623047, + "epoch": 0.5148400729997118, + "grad_norm": 3.5055086612701416, + "kl": 0.5736653476953506, + "learning_rate": 5.293611110110661e-07, + "loss": -0.0032, + "reward": 1.672940969467163, + "reward_std": 0.24722242057323457, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.5902204990386963, + "step": 2680, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.06388061246834695, + "clip_ratio/high_mean": 0.008427212300011888, + "clip_ratio/low_mean": 0.000504276818537619, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008931488974485546, + "completion_length": 84.06750183105468, + "epoch": 0.5167611180482182, + "grad_norm": 1.1749032735824585, + "kl": 0.6257337100803853, + "learning_rate": 5.266477660311123e-07, + "loss": -0.0049, + "reward": 1.883350706100464, + "reward_std": 0.1923319399356842, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.6929253697395324, + "step": 2690, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.032716090651229025, + "clip_ratio/high_mean": 0.004722256149398163, + "clip_ratio/low_mean": 0.00025295682498835956, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0049752129940316085, + "completion_length": 101.05250244140625, + "epoch": 0.5186821630967247, + "grad_norm": 2.250870704650879, + "kl": 0.3336128618568182, + "learning_rate": 5.239352720956869e-07, + "loss": -0.0014, + "reward": 1.803996729850769, + "reward_std": 0.3182943195104599, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.6548108696937561, + "step": 2700, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.057230279734358194, + "clip_ratio/high_mean": 0.01016470161266625, + "clip_ratio/low_mean": 0.001601585964090191, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01176628761459142, + "completion_length": 92.2125015258789, + "epoch": 0.520603208145231, + "grad_norm": 1.7528822422027588, + "kl": 0.30482072457671167, + "learning_rate": 5.212237280584214e-07, + "loss": -0.0012, + "reward": 1.6862072706222535, + "reward_std": 0.2419889122247696, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5971661269664764, + "step": 2710, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.060608417179901154, + "clip_ratio/high_mean": 0.00894762706157053, + "clip_ratio/low_mean": 0.0007068538383464329, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009654480942117516, + "completion_length": 92.53000183105469, + "epoch": 0.5225242531937374, + "grad_norm": 274.7859802246094, + "kl": 1.1552282243967056, + "learning_rate": 5.185132327383284e-07, + "loss": 0.1157, + "reward": 1.7673757076263428, + "reward_std": 0.3102965742349625, + "rewards/code_format_reward": 0.9887499809265137, + "rewards/code_reward": 0.6365003228187561, + "step": 2720, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.15797494337894022, + "clip_ratio/high_mean": 0.02083307456341572, + "clip_ratio/low_mean": 0.009061275536078028, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02989435092313215, + "completion_length": 88.4500015258789, + "epoch": 0.5244452982422437, + "grad_norm": 4.456059455871582, + "kl": 1.3563814774155616, + "learning_rate": 5.158038849162024e-07, + "loss": 0.0014, + "reward": 1.5090751886367797, + "reward_std": 0.23531495928764343, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.5098500728607178, + "step": 2730, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.054899439518339935, + "clip_ratio/high_mean": 0.008722224002121947, + "clip_ratio/low_mean": 0.0002903619286371395, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009012586006429046, + "completion_length": 85.53750305175781, + "epoch": 0.5263663432907502, + "grad_norm": 1.951745867729187, + "kl": 0.5144835211336612, + "learning_rate": 5.130957833310177e-07, + "loss": -0.0017, + "reward": 1.7648489713668822, + "reward_std": 0.1646851196885109, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6355494737625123, + "step": 2740, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.10675688227638602, + "clip_ratio/high_mean": 0.016114802553784103, + "clip_ratio/low_mean": 0.0011672286826069466, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0172820313135162, + "completion_length": 97.54000244140624, + "epoch": 0.5282873883392566, + "grad_norm": 2.7782888412475586, + "kl": 0.484642443805933, + "learning_rate": 5.103890266763317e-07, + "loss": -0.0017, + "reward": 1.7005881071090698, + "reward_std": 0.17179570347070694, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6046690165996551, + "step": 2750, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.04718098001321778, + "clip_ratio/high_mean": 0.0069286267200368455, + "clip_ratio/low_mean": 0.0022616338639636522, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009190260457398836, + "completion_length": 91.71500091552734, + "epoch": 0.5302084333877629, + "grad_norm": 1.6982417106628418, + "kl": 0.40430613309144975, + "learning_rate": 5.076837135966868e-07, + "loss": -0.0001, + "reward": 1.7166170120239257, + "reward_std": 0.12425057031214237, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.6111209750175476, + "step": 2760, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.01563614197075367, + "clip_ratio/high_mean": 0.0026440696616191416, + "clip_ratio/low_mean": 0.0005518664722330869, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0031959360814653335, + "completion_length": 93.39000091552734, + "epoch": 0.5321294784362693, + "grad_norm": 0.12160471081733704, + "kl": 0.3728202864527702, + "learning_rate": 5.049799426840166e-07, + "loss": -0.0008, + "reward": 1.8690509557724, + "reward_std": 0.20764816105365752, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6889004349708557, + "step": 2770, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.08729656506329775, + "clip_ratio/high_mean": 0.013787648268043995, + "clip_ratio/low_mean": 0.0016946192088653333, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015482267551124095, + "completion_length": 83.49000091552735, + "epoch": 0.5340505234847757, + "grad_norm": 2.061514377593994, + "kl": 0.2805942878127098, + "learning_rate": 5.02277812474052e-07, + "loss": -0.0005, + "reward": 1.5558062076568604, + "reward_std": 0.18851915150880813, + "rewards/code_format_reward": 0.9924999833106994, + "rewards/code_reward": 0.529778128862381, + "step": 2780, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.047503374773077665, + "clip_ratio/high_mean": 0.007121381178149022, + "clip_ratio/low_mean": 0.003943280148087069, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011064661786076613, + "completion_length": 90.33500213623047, + "epoch": 0.5359715685332821, + "grad_norm": 2.8578014373779297, + "kl": 0.9348004341125489, + "learning_rate": 4.995774214427299e-07, + "loss": -0.0083, + "reward": 1.5787676095962524, + "reward_std": 0.24208036959171295, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.5412587821483612, + "step": 2790, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.0682177669601515, + "clip_ratio/high_mean": 0.010770523789688013, + "clip_ratio/low_mean": 0.0030192029429599644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01378972665697802, + "completion_length": 97.35500030517578, + "epoch": 0.5378926135817885, + "grad_norm": 3.7523272037506104, + "kl": 0.49133365601301193, + "learning_rate": 4.968788680026062e-07, + "loss": 0.0019, + "reward": 1.8675085306167603, + "reward_std": 0.3084888607263565, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.6887542605400085, + "step": 2800, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.051895615691319105, + "clip_ratio/high_mean": 0.007092137623112648, + "clip_ratio/low_mean": 0.0009049189888173714, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007997056643944234, + "completion_length": 84.84000244140626, + "epoch": 0.5398136586302948, + "grad_norm": 6879.29296875, + "kl": 41.48030465692282, + "learning_rate": 4.941822504992665e-07, + "loss": 0.3058, + "reward": 1.8456867456436157, + "reward_std": 0.17148398756980895, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.6750308394432067, + "step": 2810, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.055018832255154845, + "clip_ratio/high_mean": 0.009382021031342447, + "clip_ratio/low_mean": 0.0012206103128846735, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0106026312103495, + "completion_length": 92.16250152587891, + "epoch": 0.5417347036788013, + "grad_norm": 1.7369046211242676, + "kl": 39.203208688646555, + "learning_rate": 4.914876672077444e-07, + "loss": 0.0739, + "reward": 1.7605399131774901, + "reward_std": 0.22667703181505203, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6337074398994446, + "step": 2820, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.07783090919256211, + "clip_ratio/high_mean": 0.013414820143952965, + "clip_ratio/low_mean": 0.004346576618263498, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017761396686546506, + "completion_length": 86.75750122070312, + "epoch": 0.5436557487273077, + "grad_norm": 1.3669426441192627, + "kl": 0.6254852950572968, + "learning_rate": 4.887952163289387e-07, + "loss": -0.0037, + "reward": 1.7524815320968627, + "reward_std": 0.18003067299723624, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.6271782517433167, + "step": 2830, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.032230034773238006, + "clip_ratio/high_mean": 0.005299290179391391, + "clip_ratio/low_mean": 0.002357826306251809, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007657116468180902, + "completion_length": 92.63250122070312, + "epoch": 0.545576793775814, + "grad_norm": 6.607495307922363, + "kl": 0.6308505192399025, + "learning_rate": 4.861049959860352e-07, + "loss": -0.0026, + "reward": 1.879476284980774, + "reward_std": 0.21936110258102418, + "rewards/code_format_reward": 0.9787499785423279, + "rewards/code_reward": 0.6950506567955017, + "step": 2840, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.07762792855501174, + "clip_ratio/high_mean": 0.012513539101928473, + "clip_ratio/low_mean": 0.0019065381304244511, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014420077262911946, + "completion_length": 79.06750183105468, + "epoch": 0.5474978388243205, + "grad_norm": 2.1427297592163086, + "kl": 0.7649303644895553, + "learning_rate": 4.834171042209299e-07, + "loss": -0.0016, + "reward": 1.7679643869400024, + "reward_std": 0.2242477983236313, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.640857207775116, + "step": 2850, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.10583647433668375, + "clip_ratio/high_mean": 0.015110303135588764, + "clip_ratio/low_mean": 0.0026529163093073293, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017763219110202046, + "completion_length": 89.09499969482422, + "epoch": 0.5494188838728268, + "grad_norm": 5.39391565322876, + "kl": 1.1404950305819512, + "learning_rate": 4.807316389906573e-07, + "loss": 0.0011, + "reward": 1.6588359355926514, + "reward_std": 0.23765334486961365, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.5822304427623749, + "step": 2860, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.13843556838110088, + "clip_ratio/high_mean": 0.022050847904756664, + "clip_ratio/low_mean": 0.006549120438285172, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028599968180060387, + "completion_length": 85.02750091552734, + "epoch": 0.5513399289213332, + "grad_norm": 6.328859329223633, + "kl": 1.3457766875624657, + "learning_rate": 4.780486981638194e-07, + "loss": 0.004, + "reward": 1.4554174661636352, + "reward_std": 0.291735103726387, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.4839587390422821, + "step": 2870, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.03891797037795186, + "clip_ratio/high_mean": 0.005045192840043455, + "clip_ratio/low_mean": 0.0029750549525488167, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008020247751846909, + "completion_length": 85.00500183105468, + "epoch": 0.5532609739698396, + "grad_norm": 3.746497392654419, + "kl": 1.5130328834056854, + "learning_rate": 4.75368379517019e-07, + "loss": -0.0033, + "reward": 1.8564167737960815, + "reward_std": 0.14603331089019775, + "rewards/code_format_reward": 0.9987499952316284, + "rewards/code_reward": 0.6785208344459533, + "step": 2880, + "zero_std_ratio": 0.7 + }, + { + "clip_ratio/high_max": 0.23034826815128326, + "clip_ratio/high_mean": 0.037423617928288876, + "clip_ratio/low_mean": 0.0014674156729597599, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03889103210531175, + "completion_length": 81.40750122070312, + "epoch": 0.555182019018346, + "grad_norm": 7.282290458679199, + "kl": 0.5326755799353122, + "learning_rate": 4.7269078073129696e-07, + "loss": 0.0032, + "reward": 1.700506567955017, + "reward_std": 0.3424434006214142, + "rewards/code_format_reward": 0.9700000047683716, + "rewards/code_reward": 0.6077533006668091, + "step": 2890, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.06711816978640854, + "clip_ratio/high_mean": 0.008929741784231737, + "clip_ratio/low_mean": 0.0021304662863258273, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011060208058916032, + "completion_length": 75.07750244140625, + "epoch": 0.5571030640668524, + "grad_norm": 3.718710422515869, + "kl": 0.3807241953909397, + "learning_rate": 4.7001599938857204e-07, + "loss": -0.0016, + "reward": 1.6593467235565185, + "reward_std": 0.2742844566702843, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.5821733415126801, + "step": 2900, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.10134089784696698, + "clip_ratio/high_mean": 0.014033923938404769, + "clip_ratio/low_mean": 0.0036910680413711817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01772499195067212, + "completion_length": 74.57250061035157, + "epoch": 0.5590241091153587, + "grad_norm": 18.590866088867188, + "kl": 0.9125221639871597, + "learning_rate": 4.673441329680844e-07, + "loss": 0.0044, + "reward": 1.6198436498641968, + "reward_std": 0.1470041409134865, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.5621092915534973, + "step": 2910, + "zero_std_ratio": 0.7 + }, + { + "clip_ratio/high_max": 0.042256729071959855, + "clip_ratio/high_mean": 0.007948629459133372, + "clip_ratio/low_mean": 0.001496748169302009, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009445377223892137, + "completion_length": 77.4625015258789, + "epoch": 0.5609451541638651, + "grad_norm": 0.18645010888576508, + "kl": 0.4780749522149563, + "learning_rate": 4.6467527884284365e-07, + "loss": 0.0006, + "reward": 1.8204985857009888, + "reward_std": 0.19856311585754155, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.6649368166923523, + "step": 2920, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.1271044396329671, + "clip_ratio/high_mean": 0.016186495171859862, + "clip_ratio/low_mean": 0.0011034044640837238, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01728989938274026, + "completion_length": 82.72500305175781, + "epoch": 0.5628661992123716, + "grad_norm": 6.4396162033081055, + "kl": 0.30610462203621863, + "learning_rate": 4.6200953427607927e-07, + "loss": -0.0021, + "reward": 1.7915108680725098, + "reward_std": 0.22729050666093825, + "rewards/code_format_reward": 0.9700000047683716, + "rewards/code_reward": 0.6532554149627685, + "step": 2930, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.23429102210793645, + "clip_ratio/high_mean": 0.03006269016477745, + "clip_ratio/low_mean": 0.001874277341994457, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0319369669421576, + "completion_length": 88.97500152587891, + "epoch": 0.5647872442608779, + "grad_norm": 43.83531951904297, + "kl": 0.5952823750674725, + "learning_rate": 4.5934699641769747e-07, + "loss": -0.0032, + "reward": 1.837431001663208, + "reward_std": 0.3392215400934219, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.6730904817581177, + "step": 2940, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.06733583421446383, + "clip_ratio/high_mean": 0.00863017894444056, + "clip_ratio/low_mean": 0.005618994176620618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014249173022108153, + "completion_length": 77.96000061035156, + "epoch": 0.5667082893093843, + "grad_norm": 2.642043352127075, + "kl": 0.56968834400177, + "learning_rate": 4.566877623007389e-07, + "loss": 0.0049, + "reward": 1.7328413248062133, + "reward_std": 0.21620932817459107, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.6229831516742707, + "step": 2950, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.04462944087572396, + "clip_ratio/high_mean": 0.007475414098007604, + "clip_ratio/low_mean": 0.002004683316772571, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009480097430059686, + "completion_length": 85.0875015258789, + "epoch": 0.5686293343578906, + "grad_norm": 3.8512065410614014, + "kl": 0.33709155321121215, + "learning_rate": 4.540319288378439e-07, + "loss": -0.0057, + "reward": 1.6900140762329101, + "reward_std": 0.21961634010076522, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.5978195071220398, + "step": 2960, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.06556220971979201, + "clip_ratio/high_mean": 0.01001431758631952, + "clip_ratio/low_mean": 0.003507485325098969, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01352180291141849, + "completion_length": 92.67500152587891, + "epoch": 0.5705503794063971, + "grad_norm": 2.966658592224121, + "kl": 0.5968067184090614, + "learning_rate": 4.513795928177193e-07, + "loss": 0.0007, + "reward": 1.4343469619750977, + "reward_std": 0.16000542044639587, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.4681109845638275, + "step": 2970, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.006220952537842095, + "clip_ratio/high_mean": 0.0009992636245442555, + "clip_ratio/low_mean": 0.0029718225210672244, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0039710860582999885, + "completion_length": 92.65750274658203, + "epoch": 0.5724714244549035, + "grad_norm": 9.493338584899902, + "kl": 0.5875692501664161, + "learning_rate": 4.4873085090161266e-07, + "loss": -0.0009, + "reward": 1.4061829090118407, + "reward_std": 0.20027331858873368, + "rewards/code_format_reward": 0.9762499928474426, + "rewards/code_reward": 0.45902894139289857, + "step": 2980, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.0330589919583872, + "clip_ratio/high_mean": 0.004416179939289578, + "clip_ratio/low_mean": 0.002115111546299886, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006531291425926611, + "completion_length": 79.80750274658203, + "epoch": 0.5743924695034098, + "grad_norm": 1.592423915863037, + "kl": 0.6846940219402313, + "learning_rate": 4.460857996197879e-07, + "loss": -0.0088, + "reward": 1.8656628370285033, + "reward_std": 0.24907293021678925, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.6850189208984375, + "step": 2990, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.10685318629257382, + "clip_ratio/high_mean": 0.014238242123974487, + "clip_ratio/low_mean": 0.0005060926268924959, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014744334877468646, + "completion_length": 75.28500213623047, + "epoch": 0.5763135145519163, + "grad_norm": 11.023285865783691, + "kl": 1.773244822025299, + "learning_rate": 4.434445353680084e-07, + "loss": -0.0004, + "reward": 1.6719849348068236, + "reward_std": 0.23447352051734924, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.5888049364089966, + "step": 3000, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.09909297195263207, + "clip_ratio/high_mean": 0.014624686987372116, + "clip_ratio/low_mean": 0.0008105992455966771, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015435286390129477, + "completion_length": 80.88750228881835, + "epoch": 0.5782345596004226, + "grad_norm": 3.5932817459106445, + "kl": 1.2866470351815225, + "learning_rate": 4.4080715440402417e-07, + "loss": 0.0028, + "reward": 1.7477641582489014, + "reward_std": 0.27256832718849183, + "rewards/code_format_reward": 0.9800000071525574, + "rewards/code_reward": 0.628882086277008, + "step": 3010, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.061281491769477725, + "clip_ratio/high_mean": 0.008347922342363746, + "clip_ratio/low_mean": 0.00354889674927108, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011896818911191076, + "completion_length": 75.85250091552734, + "epoch": 0.580155604648929, + "grad_norm": 4.849332809448242, + "kl": 0.476963010430336, + "learning_rate": 4.381737528440624e-07, + "loss": -0.0002, + "reward": 1.5080678462982178, + "reward_std": 0.1984383262693882, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.5099714159965515, + "step": 3020, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.02014698493294418, + "clip_ratio/high_mean": 0.0029024946445133535, + "clip_ratio/low_mean": 0.001273224765463965, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0041757194034289565, + "completion_length": 86.35750122070313, + "epoch": 0.5820766496974354, + "grad_norm": 5.408311367034912, + "kl": 1.1033611692488194, + "learning_rate": 4.3554442665932664e-07, + "loss": -0.0044, + "reward": 1.7480007410049438, + "reward_std": 0.20548871904611588, + "rewards/code_format_reward": 0.9674999952316284, + "rewards/code_reward": 0.6321253478527069, + "step": 3030, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.07207567039877176, + "clip_ratio/high_mean": 0.010315603285562247, + "clip_ratio/low_mean": 0.0024311804067110644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012746783741749822, + "completion_length": 87.45250091552734, + "epoch": 0.5839976947459418, + "grad_norm": 5.45907735824585, + "kl": 0.7388446770608426, + "learning_rate": 4.329192716724974e-07, + "loss": -0.0134, + "reward": 1.617799663543701, + "reward_std": 0.28184359073638915, + "rewards/code_format_reward": 0.9900000095367432, + "rewards/code_reward": 0.5613998055458069, + "step": 3040, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.03178482772782445, + "clip_ratio/high_mean": 0.00484326797304675, + "clip_ratio/low_mean": 0.0010359384352341295, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005879206501413136, + "completion_length": 83.70250091552734, + "epoch": 0.5859187397944482, + "grad_norm": 6.244964122772217, + "kl": 0.8223805136978626, + "learning_rate": 4.3029838355424165e-07, + "loss": -0.0028, + "reward": 1.5551699638366698, + "reward_std": 0.23868169337511064, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5300849676132202, + "step": 3050, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.031123768421821296, + "clip_ratio/high_mean": 0.0042093763331649825, + "clip_ratio/low_mean": 0.00023920949752209708, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0044485858496045695, + "completion_length": 90.6500015258789, + "epoch": 0.5878397848429545, + "grad_norm": 1.844166874885559, + "kl": 0.9453303083777428, + "learning_rate": 4.2768185781972433e-07, + "loss": 0.0038, + "reward": 1.7277095794677735, + "reward_std": 0.22161270976066588, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.6176047682762146, + "step": 3060, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.06112067271023989, + "clip_ratio/high_mean": 0.008206171146593989, + "clip_ratio/low_mean": 0.0006491162814199925, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008855287660844624, + "completion_length": 81.27750091552734, + "epoch": 0.589760829891461, + "grad_norm": 3.0321500301361084, + "kl": 0.4705409877002239, + "learning_rate": 4.2506978982512964e-07, + "loss": -0.0002, + "reward": 1.9011548519134522, + "reward_std": 0.2363950289785862, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.7037024021148681, + "step": 3070, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.028480308945290744, + "clip_ratio/high_mean": 0.00514335140469484, + "clip_ratio/low_mean": 0.0035089968900138047, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008652348211035133, + "completion_length": 88.05750274658203, + "epoch": 0.5916818749399674, + "grad_norm": 4.498425483703613, + "kl": 0.9383749194443226, + "learning_rate": 4.224622747641835e-07, + "loss": -0.0068, + "reward": 1.2419449806213378, + "reward_std": 0.1959183931350708, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.37597247362136843, + "step": 3080, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.02165755571331829, + "clip_ratio/high_mean": 0.003493850605445914, + "clip_ratio/low_mean": 0.0001163623295724392, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003610212981584482, + "completion_length": 83.10500030517578, + "epoch": 0.5936029199884737, + "grad_norm": 1.0221151113510132, + "kl": 1.614695566892624, + "learning_rate": 4.1985940766468663e-07, + "loss": 0.1048, + "reward": 1.8437815666198731, + "reward_std": 0.12033854126930237, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.6731407642364502, + "step": 3090, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.05757335813250393, + "clip_ratio/high_mean": 0.0107182093168376, + "clip_ratio/low_mean": 0.004042259410198312, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014760468708118424, + "completion_length": 86.6625, + "epoch": 0.5955239650369801, + "grad_norm": 3.0221967697143555, + "kl": 0.4662696644663811, + "learning_rate": 4.1726128338504997e-07, + "loss": 0.0059, + "reward": 1.6797678232192994, + "reward_std": 0.23598156571388246, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.5930089056491852, + "step": 3100, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.1632944119395688, + "clip_ratio/high_mean": 0.02386658971372526, + "clip_ratio/low_mean": 0.000367270597780589, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02423386004229542, + "completion_length": 87.04000244140624, + "epoch": 0.5974450100854864, + "grad_norm": 3124.911865234375, + "kl": 1.4825018651783466, + "learning_rate": 4.146679966108374e-07, + "loss": 0.109, + "reward": 1.7368038177490235, + "reward_std": 0.2290027320384979, + "rewards/code_format_reward": 0.9912499785423279, + "rewards/code_reward": 0.620589405298233, + "step": 3110, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.011806602030992508, + "clip_ratio/high_mean": 0.00222149578621611, + "clip_ratio/low_mean": 0.001782867594738491, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004004363450803794, + "completion_length": 76.50500183105468, + "epoch": 0.5993660551339929, + "grad_norm": 5.609122276306152, + "kl": 1.2381610602140427, + "learning_rate": 4.120796418513165e-07, + "loss": 0.0687, + "reward": 1.6538613319396973, + "reward_std": 0.2478315144777298, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.5813056170940399, + "step": 3120, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.04111457797698677, + "clip_ratio/high_mean": 0.006102612579707056, + "clip_ratio/low_mean": 0.0006678692123387008, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006770481838611886, + "completion_length": 90.63500213623047, + "epoch": 0.6012871001824993, + "grad_norm": 1.7537983655929565, + "kl": 0.8379382207989693, + "learning_rate": 4.094963134360129e-07, + "loss": 3.0713, + "reward": 1.8111864566802978, + "reward_std": 0.23444892466068268, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6599682211875916, + "step": 3130, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.07487462717108428, + "clip_ratio/high_mean": 0.009757341054501012, + "clip_ratio/low_mean": 0.002470593445468694, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012227934325346723, + "completion_length": 84.99250183105468, + "epoch": 0.6032081452310056, + "grad_norm": 7.498387336730957, + "kl": 0.5894037500023842, + "learning_rate": 4.0691810551127327e-07, + "loss": 0.0462, + "reward": 1.6221882104873657, + "reward_std": 0.25462802946567537, + "rewards/code_format_reward": 0.9975000023841858, + "rewards/code_reward": 0.5617190957069397, + "step": 3140, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.015196678857319058, + "clip_ratio/high_mean": 0.0022096226894063875, + "clip_ratio/low_mean": 0.002686911600176245, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004896534324507229, + "completion_length": 88.44750213623047, + "epoch": 0.6051291902795121, + "grad_norm": 0.7371006011962891, + "kl": 1.5165767412632705, + "learning_rate": 4.0434511203683386e-07, + "loss": 0.0113, + "reward": 1.958918571472168, + "reward_std": 0.17050198167562486, + "rewards/code_format_reward": 0.9962499856948852, + "rewards/code_reward": 0.7303967714309693, + "step": 3150, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.047509356867522, + "clip_ratio/high_mean": 0.0060635729460045695, + "clip_ratio/low_mean": 0.0037405278504593297, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009804100578185171, + "completion_length": 93.71500091552734, + "epoch": 0.6070502353280185, + "grad_norm": 4.062532424926758, + "kl": 164.7577206812799, + "learning_rate": 4.017774267823967e-07, + "loss": 0.3479, + "reward": 1.8433427095413208, + "reward_std": 0.20897280275821686, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6760463416576385, + "step": 3160, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.007103513111360371, + "clip_ratio/high_mean": 0.0009442454349482432, + "clip_ratio/low_mean": 0.0005656339257257059, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015098793461220338, + "completion_length": 97.03500061035156, + "epoch": 0.6089712803765248, + "grad_norm": 0.3194718658924103, + "kl": 19.38877977654338, + "learning_rate": 3.9921514332421193e-07, + "loss": 0.1279, + "reward": 1.3801440358161927, + "reward_std": 0.26880781557410954, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.44757200181484225, + "step": 3170, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.027826181857381015, + "clip_ratio/high_mean": 0.004423137854610104, + "clip_ratio/low_mean": 0.000519216748944018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004942354625381995, + "completion_length": 99.6375015258789, + "epoch": 0.6108923254250312, + "grad_norm": 133.27151489257812, + "kl": 91.96420569866896, + "learning_rate": 3.966583550416676e-07, + "loss": 284.3821, + "reward": 1.6065278768539428, + "reward_std": 0.2671674907207489, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.5598264217376709, + "step": 3180, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.03810381339862943, + "clip_ratio/high_mean": 0.005511091940570622, + "clip_ratio/low_mean": 0.00701818183879368, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01252927360474132, + "completion_length": 90.77000122070312, + "epoch": 0.6128133704735376, + "grad_norm": 2.931155204772949, + "kl": 4.587994083762169, + "learning_rate": 3.9410715511388647e-07, + "loss": 28143.1688, + "reward": 1.7186223268508911, + "reward_std": 0.2031429558992386, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.6139986515045166, + "step": 3190, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.18900979291647674, + "clip_ratio/high_mean": 0.025313075329177082, + "clip_ratio/low_mean": 0.00013794690457871183, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02545102240983397, + "completion_length": 88.60750122070313, + "epoch": 0.614734415522044, + "grad_norm": 3.9914708137512207, + "kl": 0.678571529686451, + "learning_rate": 3.915616365163304e-07, + "loss": 0.0002, + "reward": 1.818918228149414, + "reward_std": 0.24608779847621917, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.6653966069221496, + "step": 3200, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.05510081194806844, + "clip_ratio/high_mean": 0.008429678474203683, + "clip_ratio/low_mean": 0.0015389235399197788, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0099686019733781, + "completion_length": 85.05250091552735, + "epoch": 0.6166554605705504, + "grad_norm": 2.0297534465789795, + "kl": 0.5190044179558754, + "learning_rate": 3.890218920174122e-07, + "loss": -0.0056, + "reward": 1.938026785850525, + "reward_std": 0.2829041987657547, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.7218258857727051, + "step": 3210, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.04617450258228928, + "clip_ratio/high_mean": 0.007303895291988738, + "clip_ratio/low_mean": 0.002542783234093804, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009846678806934506, + "completion_length": 92.52000122070312, + "epoch": 0.6185765056190567, + "grad_norm": 3.2283730506896973, + "kl": 0.5362374372780323, + "learning_rate": 3.86488014175114e-07, + "loss": 0.0003, + "reward": 1.7741312742233277, + "reward_std": 0.20447308868169783, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6395656108856201, + "step": 3220, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.059750709845684466, + "clip_ratio/high_mean": 0.00790787541482132, + "clip_ratio/low_mean": 0.0012954409321537241, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009203316466300748, + "completion_length": 90.4375, + "epoch": 0.6204975506675632, + "grad_norm": 2.409045934677124, + "kl": 0.553566773980856, + "learning_rate": 3.8396009533361486e-07, + "loss": -0.0, + "reward": 1.6513851642608643, + "reward_std": 0.24081393480300903, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.580692571401596, + "step": 3230, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.03564520282670856, + "clip_ratio/high_mean": 0.004964679945260286, + "clip_ratio/low_mean": 0.004444090686592972, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009408770385198295, + "completion_length": 79.08000183105469, + "epoch": 0.6224185957160695, + "grad_norm": 7.759763717651367, + "kl": 1.2998816877603532, + "learning_rate": 3.814382276199251e-07, + "loss": -0.0006, + "reward": 1.6336610555648803, + "reward_std": 0.1691926121711731, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.5680804908275604, + "step": 3240, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.011579358880408109, + "clip_ratio/high_mean": 0.002202258622855879, + "clip_ratio/low_mean": 0.0003946456956327893, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025969043519580735, + "completion_length": 88.7375, + "epoch": 0.6243396407645759, + "grad_norm": 9.489768981933594, + "kl": 4.286054483056068, + "learning_rate": 3.7892250294052853e-07, + "loss": 31.2761, + "reward": 1.8622464895248414, + "reward_std": 0.2547990471124649, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.6858106970787048, + "step": 3250, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.07659143296186813, + "clip_ratio/high_mean": 0.010122461079299682, + "clip_ratio/low_mean": 0.0019954566974774933, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012117917880095775, + "completion_length": 99.80750274658203, + "epoch": 0.6262606858130824, + "grad_norm": 2.884183168411255, + "kl": 1.2840011775493623, + "learning_rate": 3.764130129780341e-07, + "loss": 0.0383, + "reward": 1.6670962572097778, + "reward_std": 0.34920003414154055, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5907356142997742, + "step": 3260, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.029844516195589678, + "clip_ratio/high_mean": 0.004244843772175955, + "clip_ratio/low_mean": 0.0002169124811189249, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004461756230011815, + "completion_length": 100.70250091552734, + "epoch": 0.6281817308615887, + "grad_norm": 4.036985397338867, + "kl": 2.1118960954248904, + "learning_rate": 3.7390984918783286e-07, + "loss": 0.9419, + "reward": 1.6084105730056764, + "reward_std": 0.17128639966249465, + "rewards/code_format_reward": 0.9712500095367431, + "rewards/code_reward": 0.5613927602767944, + "step": 3270, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.07067356873303651, + "clip_ratio/high_mean": 0.00971948360092938, + "clip_ratio/low_mean": 0.0006290240438829642, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010348507657181472, + "completion_length": 88.9000015258789, + "epoch": 0.6301027759100951, + "grad_norm": 1.543152928352356, + "kl": 0.5742107287049294, + "learning_rate": 3.714131027947669e-07, + "loss": 0.0006, + "reward": 1.808586883544922, + "reward_std": 0.20984979271888732, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.6564809083938599, + "step": 3280, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.060896387742832306, + "clip_ratio/high_mean": 0.00765781793743372, + "clip_ratio/low_mean": 0.01029690281720832, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017954721208661796, + "completion_length": 80.08500213623047, + "epoch": 0.6320238209586014, + "grad_norm": 2.127617359161377, + "kl": 0.6725200928747654, + "learning_rate": 3.689228647898034e-07, + "loss": 0.1143, + "reward": 1.678031039237976, + "reward_std": 0.19750893712043763, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.5921404898166657, + "step": 3290, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.05414980174973607, + "clip_ratio/high_mean": 0.007520435960032046, + "clip_ratio/low_mean": 0.00011696565634338185, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007637401651300025, + "completion_length": 92.725, + "epoch": 0.6339448660071079, + "grad_norm": 8.315914154052734, + "kl": 0.30459046363830566, + "learning_rate": 3.6643922592671904e-07, + "loss": -0.0066, + "reward": 1.5898099780082702, + "reward_std": 0.1832955375313759, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.5464674949645996, + "step": 3300, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.02745349882170558, + "clip_ratio/high_mean": 0.004275670822244138, + "clip_ratio/low_mean": 0.001036624335392844, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005312295141629874, + "completion_length": 86.80250091552735, + "epoch": 0.6358659110556143, + "grad_norm": 4.2797441482543945, + "kl": 2.398578557372093, + "learning_rate": 3.6396227671879267e-07, + "loss": 0.028, + "reward": 1.7730424404144287, + "reward_std": 0.3175764262676239, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.6387087047100067, + "step": 3310, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.03558065614197403, + "clip_ratio/high_mean": 0.004970578508800827, + "clip_ratio/low_mean": 0.0008951228694058955, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0058657014247728515, + "completion_length": 91.0875015258789, + "epoch": 0.6377869561041206, + "grad_norm": 5.376333713531494, + "kl": 1.4305558323860168, + "learning_rate": 3.614921074355067e-07, + "loss": 0.0034, + "reward": 1.7029305696487427, + "reward_std": 0.34837333858013153, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.6052152514457703, + "step": 3320, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.04346057323855348, + "clip_ratio/high_mean": 0.005737839776702458, + "clip_ratio/low_mean": 0.001675139949657023, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00741297956337803, + "completion_length": 88.75250396728515, + "epoch": 0.639708001152627, + "grad_norm": 2.969228744506836, + "kl": 0.7607076019048691, + "learning_rate": 3.5902880809925704e-07, + "loss": -0.0001, + "reward": 1.6762405157089233, + "reward_std": 0.2515918217599392, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.5909327387809753, + "step": 3330, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.05080293011851609, + "clip_ratio/high_mean": 0.006427765643456951, + "clip_ratio/low_mean": 0.00040708604792598634, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006834851735038683, + "completion_length": 88.0750015258789, + "epoch": 0.6416290462011334, + "grad_norm": 12.137472152709961, + "kl": 0.31881698705255984, + "learning_rate": 3.565724684820727e-07, + "loss": 3.6118, + "reward": 1.8916306495666504, + "reward_std": 0.1850387692451477, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.6973778128623962, + "step": 3340, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.14287711144424975, + "clip_ratio/high_mean": 0.019231261435197666, + "clip_ratio/low_mean": 0.0020263168029487134, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021257578127551822, + "completion_length": 94.19000091552735, + "epoch": 0.6435500912496398, + "grad_norm": 6.10810661315918, + "kl": 0.8296034529805183, + "learning_rate": 3.541231781023436e-07, + "loss": -0.0004, + "reward": 1.6248144626617431, + "reward_std": 0.2219874605536461, + "rewards/code_format_reward": 0.9887499809265137, + "rewards/code_reward": 0.5652197122573852, + "step": 3350, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.07329725201707334, + "clip_ratio/high_mean": 0.009671362905646675, + "clip_ratio/low_mean": 0.005342914546781685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015014277724549174, + "completion_length": 97.74500274658203, + "epoch": 0.6454711362981462, + "grad_norm": 2.801866054534912, + "kl": 0.5770246163010597, + "learning_rate": 3.5168102622155894e-07, + "loss": 0.0, + "reward": 1.6838999271392823, + "reward_std": 0.2707583636045456, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.594137442111969, + "step": 3360, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.005975091701839119, + "clip_ratio/high_mean": 0.0011488659016322344, + "clip_ratio/low_mean": 0.001098146109143272, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022470120195066555, + "completion_length": 89.36750030517578, + "epoch": 0.6473921813466526, + "grad_norm": 34.13050079345703, + "kl": 2.2693535044789312, + "learning_rate": 3.492461018410535e-07, + "loss": 0.0028, + "reward": 1.8977232933044434, + "reward_std": 0.2937870219349861, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.7022991299629211, + "step": 3370, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.04565345844021067, + "clip_ratio/high_mean": 0.009023689541209023, + "clip_ratio/low_mean": 0.00031946374219842254, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009343153254303616, + "completion_length": 84.38249969482422, + "epoch": 0.649313226395159, + "grad_norm": 0.9170461893081665, + "kl": 108.80695619434118, + "learning_rate": 3.468184936987645e-07, + "loss": 920.5057, + "reward": 1.7496967315673828, + "reward_std": 0.2916144669055939, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.626410859823227, + "step": 3380, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.04991705315187574, + "clip_ratio/high_mean": 0.007881995162460954, + "clip_ratio/low_mean": 0.00031314246589317916, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008195137570146472, + "completion_length": 88.86000061035156, + "epoch": 0.6512342714436653, + "grad_norm": 3.084516763687134, + "kl": 1331.8980419039726, + "learning_rate": 3.4439829026599765e-07, + "loss": 2.6994, + "reward": 1.7110779523849486, + "reward_std": 0.22298349142074586, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.6071014523506164, + "step": 3390, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.03301922780228779, + "clip_ratio/high_mean": 0.005802097530977335, + "clip_ratio/low_mean": 0.002479181956732646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00828127931599738, + "completion_length": 77.6500015258789, + "epoch": 0.6531553164921717, + "grad_norm": 3643.742919921875, + "kl": 629.580971956253, + "learning_rate": 3.4198557974420236e-07, + "loss": 1.3601, + "reward": 1.9027020692825318, + "reward_std": 0.23692196756601333, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.7038509964942932, + "step": 3400, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.026931732892990112, + "clip_ratio/high_mean": 0.004060871619731188, + "clip_ratio/low_mean": 0.0013453641964588313, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00540623576962389, + "completion_length": 81.77250213623047, + "epoch": 0.6550763615406782, + "grad_norm": 3.2221176624298096, + "kl": 17.398655989021062, + "learning_rate": 3.3958045006175804e-07, + "loss": 0.0552, + "reward": 1.7479909420013429, + "reward_std": 0.22741918563842772, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.6308704853057862, + "step": 3410, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.06737710665911437, + "clip_ratio/high_mean": 0.008767830353463069, + "clip_ratio/low_mean": 0.0005067014892119915, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009274532069684937, + "completion_length": 92.88500061035157, + "epoch": 0.6569974065891845, + "grad_norm": 4.063995838165283, + "kl": 2.0011128395795823, + "learning_rate": 3.3718298887077003e-07, + "loss": 0.0159, + "reward": 1.7235053777694702, + "reward_std": 0.2168472334742546, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.616440212726593, + "step": 3420, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.019622295489534737, + "clip_ratio/high_mean": 0.003191170998616144, + "clip_ratio/low_mean": 0.0015002752974396572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0046914463368011635, + "completion_length": 80.16000213623047, + "epoch": 0.6589184516376909, + "grad_norm": 1.253300428390503, + "kl": 0.48067781031131745, + "learning_rate": 3.3479328354387286e-07, + "loss": 0.0008, + "reward": 1.7450715541839599, + "reward_std": 0.1590050458908081, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6244107484817505, + "step": 3430, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.03181373123079538, + "clip_ratio/high_mean": 0.0046242739539593455, + "clip_ratio/low_mean": 0.012107005770667456, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016731279762461783, + "completion_length": 84.32750244140625, + "epoch": 0.6608394966861972, + "grad_norm": 1.5854672193527222, + "kl": 0.42518851198256014, + "learning_rate": 3.324114211710498e-07, + "loss": 0.0, + "reward": 1.6541699171066284, + "reward_std": 0.1113172210752964, + "rewards/code_format_reward": 0.9962499856948852, + "rewards/code_reward": 0.5780224561691284, + "step": 3440, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.13535575959831475, + "clip_ratio/high_mean": 0.018421862670220435, + "clip_ratio/low_mean": 0.0012572539155371488, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019679116318002343, + "completion_length": 91.63250122070312, + "epoch": 0.6627605417347037, + "grad_norm": 4.593750476837158, + "kl": 0.7388513803482055, + "learning_rate": 3.300374885564553e-07, + "loss": -0.0, + "reward": 1.5408308625221252, + "reward_std": 0.29571940898895266, + "rewards/code_format_reward": 0.9749999880790711, + "rewards/code_reward": 0.5266654074192048, + "step": 3450, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.04671933995559811, + "clip_ratio/high_mean": 0.0062255718978121875, + "clip_ratio/low_mean": 0.003391482085862663, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009617053843976464, + "completion_length": 78.45250091552734, + "epoch": 0.6646815867832101, + "grad_norm": 2.5849409103393555, + "kl": 10.90581871420145, + "learning_rate": 3.2767157221525437e-07, + "loss": 0.0178, + "reward": 1.5087457418441772, + "reward_std": 0.19353876560926436, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.5074978828430176, + "step": 3460, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.0318772604689002, + "clip_ratio/high_mean": 0.004644899175036699, + "clip_ratio/low_mean": 0.0032211030862526967, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007866002165246754, + "completion_length": 75.36500091552735, + "epoch": 0.6666026318317164, + "grad_norm": 1.8117616176605225, + "kl": 187030.33910432606, + "learning_rate": 3.253137583704673e-07, + "loss": 374.1458, + "reward": 1.6825225114822389, + "reward_std": 0.2058879092335701, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5921987533569336, + "step": 3470, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.09497236199676991, + "clip_ratio/high_mean": 0.015650217607617378, + "clip_ratio/low_mean": 0.0006928690614586231, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016343086725100875, + "completion_length": 89.13250198364258, + "epoch": 0.6685236768802229, + "grad_norm": 5.850868225097656, + "kl": 0.5080707125365734, + "learning_rate": 3.229641329498296e-07, + "loss": 0.0463, + "reward": 1.6678599119186401, + "reward_std": 0.2830047011375427, + "rewards/code_format_reward": 0.9724999904632569, + "rewards/code_reward": 0.5908049464225769, + "step": 3480, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.050425410037860274, + "clip_ratio/high_mean": 0.006465365196345374, + "clip_ratio/low_mean": 0.0006157927738968283, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0070811579586006704, + "completion_length": 81.40749969482422, + "epoch": 0.6704447219287293, + "grad_norm": 10.526844024658203, + "kl": 1.5019532606005668, + "learning_rate": 3.2062278158265866e-07, + "loss": -0.0021, + "reward": 1.7323597908020019, + "reward_std": 0.15349715426564217, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6202423751354218, + "step": 3490, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.07051093662157655, + "clip_ratio/high_mean": 0.009594869159627706, + "clip_ratio/low_mean": 0.001997971232049167, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011592840391676873, + "completion_length": 96.4625, + "epoch": 0.6723657669772356, + "grad_norm": 12.833992958068848, + "kl": 0.37389371246099473, + "learning_rate": 3.182897895967338e-07, + "loss": 0.0008, + "reward": 1.6037346363067626, + "reward_std": 0.329493448138237, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.556554788351059, + "step": 3500, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.017038265755400062, + "clip_ratio/high_mean": 0.0027443476661574095, + "clip_ratio/low_mean": 0.000347714369854657, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003092062100768089, + "completion_length": 84.05, + "epoch": 0.674286812025742, + "grad_norm": 6.119350910186768, + "kl": 0.4559432238340378, + "learning_rate": 3.15965242015187e-07, + "loss": 0.0298, + "reward": 1.6935490131378175, + "reward_std": 0.26772548258304596, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6002120196819305, + "step": 3510, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.03435098186600953, + "clip_ratio/high_mean": 0.005973302901838906, + "clip_ratio/low_mean": 0.0006568559459992684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006630158764892258, + "completion_length": 95.0, + "epoch": 0.6762078570742484, + "grad_norm": 4.796656608581543, + "kl": 0.3851431407034397, + "learning_rate": 3.1364922355340346e-07, + "loss": 0.0214, + "reward": 1.8059131860733033, + "reward_std": 0.18592590391635894, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6554565787315368, + "step": 3520, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.024073917022906243, + "clip_ratio/high_mean": 0.0035194387339288367, + "clip_ratio/low_mean": 0.0002464063392835669, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0037658450222807006, + "completion_length": 86.9800018310547, + "epoch": 0.6781289021227548, + "grad_norm": 7.799978256225586, + "kl": 0.2617302156984806, + "learning_rate": 3.113418186159349e-07, + "loss": -0.0088, + "reward": 1.515157699584961, + "reward_std": 0.2593328535556793, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5138288617134095, + "step": 3530, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.014256173744797707, + "clip_ratio/high_mean": 0.002001363394083455, + "clip_ratio/low_mean": 0.001286455297667999, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0032878186670131982, + "completion_length": 93.04500274658203, + "epoch": 0.6800499471712612, + "grad_norm": 1.323721170425415, + "kl": 0.32287237197160723, + "learning_rate": 3.090431112934235e-07, + "loss": -0.0056, + "reward": 1.8219903230667114, + "reward_std": 0.28862411081790923, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6641201436519623, + "step": 3540, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.0389601900940761, + "clip_ratio/high_mean": 0.005975415915600024, + "clip_ratio/low_mean": 0.0006638197373831645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006639235676266253, + "completion_length": 95.84250183105469, + "epoch": 0.6819709922197675, + "grad_norm": 4.850042819976807, + "kl": 1.8627108559012413, + "learning_rate": 3.067531853595369e-07, + "loss": 1.6968, + "reward": 1.8796481132507323, + "reward_std": 0.13976119682192803, + "rewards/code_format_reward": 0.9837500095367432, + "rewards/code_reward": 0.6938865780830383, + "step": 3550, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.01971529610455036, + "clip_ratio/high_mean": 0.002580236754147336, + "clip_ratio/low_mean": 0.0005060694311396219, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0030863061954732986, + "completion_length": 85.99000091552735, + "epoch": 0.683892037268274, + "grad_norm": 499.5800476074219, + "kl": 3.8079170405864717, + "learning_rate": 3.0447212426791546e-07, + "loss": 0.0153, + "reward": 1.73906729221344, + "reward_std": 0.21255102157592773, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.622658634185791, + "step": 3560, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.03546578506939113, + "clip_ratio/high_mean": 0.005373837990919128, + "clip_ratio/low_mean": 0.0011442803021054714, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0065181183628737925, + "completion_length": 93.75249938964843, + "epoch": 0.6858130823167803, + "grad_norm": 3.144973039627075, + "kl": 0.7828342400491237, + "learning_rate": 3.022000111491309e-07, + "loss": 0.0001, + "reward": 1.8471190690994264, + "reward_std": 0.27725095450878146, + "rewards/code_format_reward": 0.9487499952316284, + "rewards/code_reward": 0.686372023820877, + "step": 3570, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.01367393396794796, + "clip_ratio/high_mean": 0.001831050164764747, + "clip_ratio/low_mean": 0.0008013980732357595, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002632448251824826, + "completion_length": 96.29500122070313, + "epoch": 0.6877341273652867, + "grad_norm": 3.9027657508850098, + "kl": 0.8669951900839805, + "learning_rate": 2.99936928807657e-07, + "loss": -0.0007, + "reward": 1.6410433769226074, + "reward_std": 0.25681858956813813, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.5736466705799103, + "step": 3580, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.028569919406436384, + "clip_ratio/high_mean": 0.0037314103537937626, + "clip_ratio/low_mean": 0.0012738955876557157, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005005306130624376, + "completion_length": 84.3875015258789, + "epoch": 0.6896551724137931, + "grad_norm": 1.8412340879440308, + "kl": 0.6606554225087166, + "learning_rate": 2.976829597188506e-07, + "loss": -0.0007, + "reward": 1.6131571292877198, + "reward_std": 0.15807003602385522, + "rewards/code_format_reward": 0.9950000047683716, + "rewards/code_reward": 0.5578285574913024, + "step": 3590, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.045477401558309795, + "clip_ratio/high_mean": 0.007051247591152787, + "clip_ratio/low_mean": 0.00021358822996262461, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007264835678506642, + "completion_length": 92.69250030517578, + "epoch": 0.6915762174622995, + "grad_norm": 4.787570953369141, + "kl": 0.2786871612071991, + "learning_rate": 2.9543818602594826e-07, + "loss": 0.0001, + "reward": 1.6197675943374634, + "reward_std": 0.2863120764493942, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.5651962697505951, + "step": 3600, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.047238136362284425, + "clip_ratio/high_mean": 0.006483453582040966, + "clip_ratio/low_mean": 0.0015064548759255558, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007989908382296563, + "completion_length": 83.11750183105468, + "epoch": 0.6934972625108059, + "grad_norm": 1.5795401334762573, + "kl": 0.512858135998249, + "learning_rate": 2.932026895370697e-07, + "loss": 0.0021, + "reward": 1.6763751983642579, + "reward_std": 0.12559455148875714, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.5903751432895661, + "step": 3610, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.031613614642992616, + "clip_ratio/high_mean": 0.00453935784753412, + "clip_ratio/low_mean": 0.0026672417909139766, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007206599647179246, + "completion_length": 89.80250244140625, + "epoch": 0.6954183075593122, + "grad_norm": 0.9828081130981445, + "kl": 2.053369848430157, + "learning_rate": 2.909765517222392e-07, + "loss": -0.0015, + "reward": 1.6560463190078736, + "reward_std": 0.2526627391576767, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5820856630802155, + "step": 3620, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.00962083850754425, + "clip_ratio/high_mean": 0.0013845860186847859, + "clip_ratio/low_mean": 0.00100141861839802, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023860046290792524, + "completion_length": 93.0125015258789, + "epoch": 0.6973393526078187, + "grad_norm": 1.4326051473617554, + "kl": 0.7425350762903691, + "learning_rate": 2.887598537104141e-07, + "loss": 0.017, + "reward": 1.608488416671753, + "reward_std": 0.18181688338518143, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.557056725025177, + "step": 3630, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.10694800971541554, + "clip_ratio/high_mean": 0.016866487907827833, + "clip_ratio/low_mean": 0.000146488708560355, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017012976511614396, + "completion_length": 86.96750183105469, + "epoch": 0.6992603976563251, + "grad_norm": 5.3714776039123535, + "kl": 0.5909165881574154, + "learning_rate": 2.8655267628653044e-07, + "loss": 0.0005, + "reward": 1.6461472749710082, + "reward_std": 0.22788509875535964, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.5765111327171326, + "step": 3640, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.015923467138782142, + "clip_ratio/high_mean": 0.0022047571546863765, + "clip_ratio/low_mean": 0.0014544774603564292, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0036592346790712328, + "completion_length": 91.53250122070312, + "epoch": 0.7011814427048314, + "grad_norm": 7.581000328063965, + "kl": 3.2652564592659474, + "learning_rate": 2.8435509988855683e-07, + "loss": -0.0019, + "reward": 1.6843700885772706, + "reward_std": 0.20299706608057022, + "rewards/code_format_reward": 0.993749988079071, + "rewards/code_reward": 0.5937475442886353, + "step": 3650, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.04569347179494798, + "clip_ratio/high_mean": 0.00580012007849291, + "clip_ratio/low_mean": 0.003195645064988639, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008995765156578272, + "completion_length": 82.49500122070313, + "epoch": 0.7031024877533378, + "grad_norm": 10.031012535095215, + "kl": 0.3446802504360676, + "learning_rate": 2.821672046045642e-07, + "loss": -0.003, + "reward": 1.9148546934127808, + "reward_std": 0.15906044691801072, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.7089898109436035, + "step": 3660, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.01652005296200514, + "clip_ratio/high_mean": 0.0032194001134485005, + "clip_ratio/low_mean": 0.0004348491333075799, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003654249230748974, + "completion_length": 87.48000030517578, + "epoch": 0.7050235328018442, + "grad_norm": 4.5817551612854, + "kl": 0.5381794683635235, + "learning_rate": 2.799890701698068e-07, + "loss": -0.0018, + "reward": 1.4432553768157959, + "reward_std": 0.19258553311228752, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.4744401514530182, + "step": 3670, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.03230769606307149, + "clip_ratio/high_mean": 0.004254726751241833, + "clip_ratio/low_mean": 0.0003341716161230579, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0045888983644545075, + "completion_length": 91.96000366210937, + "epoch": 0.7069445778503506, + "grad_norm": 3.1825077533721924, + "kl": 0.5493438571691514, + "learning_rate": 2.7782077596381596e-07, + "loss": 0.0032, + "reward": 1.8943065643310546, + "reward_std": 0.22485891729593277, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6990282416343689, + "step": 3680, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.017006529681384563, + "clip_ratio/high_mean": 0.0026059710187837483, + "clip_ratio/low_mean": 0.00022266755404416473, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0028286385582759976, + "completion_length": 92.6875015258789, + "epoch": 0.708865622898857, + "grad_norm": 3.126534938812256, + "kl": 2.302929486706853, + "learning_rate": 2.7566240100750794e-07, + "loss": 0.0024, + "reward": 1.6279277324676513, + "reward_std": 0.3058730036020279, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.568026351928711, + "step": 3690, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.020053896540775894, + "clip_ratio/high_mean": 0.0029980215302202852, + "clip_ratio/low_mean": 0.0004860887274844572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003484110155841336, + "completion_length": 97.92250061035156, + "epoch": 0.7107866679473633, + "grad_norm": 4.224461555480957, + "kl": 4.42233342602849, + "learning_rate": 2.735140239603034e-07, + "loss": -0.0003, + "reward": 1.960454559326172, + "reward_std": 0.24239360094070433, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.7349147796630859, + "step": 3700, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.02752018291503191, + "clip_ratio/high_mean": 0.005125764373224229, + "clip_ratio/low_mean": 0.00023403638042509555, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005359800613950938, + "completion_length": 101.37250061035157, + "epoch": 0.7127077129958698, + "grad_norm": 4.285885334014893, + "kl": 0.952894814312458, + "learning_rate": 2.713757231172611e-07, + "loss": -0.0013, + "reward": 1.6773537874221802, + "reward_std": 0.2778655707836151, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5927394092082977, + "step": 3710, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.021348989009857176, + "clip_ratio/high_mean": 0.0030060237273573875, + "clip_ratio/low_mean": 0.0013181588088627904, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004324182611890137, + "completion_length": 95.14250183105469, + "epoch": 0.7146287580443761, + "grad_norm": 2.7202091217041016, + "kl": 2.8931914918124675, + "learning_rate": 2.692475764062245e-07, + "loss": -0.0021, + "reward": 1.8867613315582275, + "reward_std": 0.18746355026960373, + "rewards/code_format_reward": 0.9987499952316284, + "rewards/code_reward": 0.6936931371688843, + "step": 3720, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.007143327506491914, + "clip_ratio/high_mean": 0.0009208801442582626, + "clip_ratio/low_mean": 0.00037053466949146243, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001291414822480874, + "completion_length": 94.1875015258789, + "epoch": 0.7165498030928825, + "grad_norm": 2.7853496074676514, + "kl": 0.6755535811185837, + "learning_rate": 2.6712966138498174e-07, + "loss": -0.003, + "reward": 1.723927640914917, + "reward_std": 0.2750594407320023, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.6163387894630432, + "step": 3730, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.019027433777227997, + "clip_ratio/high_mean": 0.002618219889700413, + "clip_ratio/low_mean": 0.0018478537182090803, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004466073628282175, + "completion_length": 102.04000091552734, + "epoch": 0.718470848141389, + "grad_norm": 5.998534202575684, + "kl": 0.9062080264091492, + "learning_rate": 2.650220552384391e-07, + "loss": 0.0289, + "reward": 1.8737354516983031, + "reward_std": 0.34540517926216124, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6912427186965943, + "step": 3740, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.056439303827937694, + "clip_ratio/high_mean": 0.007310985976073425, + "clip_ratio/low_mean": 0.0005420514833531342, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007853037484164816, + "completion_length": 92.48250122070313, + "epoch": 0.7203918931898953, + "grad_norm": 5.3343424797058105, + "kl": 0.3819971337914467, + "learning_rate": 2.6292483477580816e-07, + "loss": -0.011, + "reward": 1.672910475730896, + "reward_std": 0.2516419067978859, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5920802116394043, + "step": 3750, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.13834233868401496, + "clip_ratio/high_mean": 0.018591971611022017, + "clip_ratio/low_mean": 0.0006771487518562935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.019269120390526952, + "completion_length": 99.33000030517579, + "epoch": 0.7223129382384017, + "grad_norm": 1.4892189502716064, + "kl": 0.9441468060016632, + "learning_rate": 2.6083807642780644e-07, + "loss": -0.0084, + "reward": 1.5579908847808839, + "reward_std": 0.272139647603035, + "rewards/code_format_reward": 0.9787500023841857, + "rewards/code_reward": 0.5343079507350922, + "step": 3760, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.023900310718454422, + "clip_ratio/high_mean": 0.005545906673069112, + "clip_ratio/low_mean": 0.0007872088695876301, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006333115603774786, + "completion_length": 90.84000396728516, + "epoch": 0.724233983286908, + "grad_norm": 12.181316375732422, + "kl": 8.179486125707626, + "learning_rate": 2.5876185624387225e-07, + "loss": 0.0398, + "reward": 1.743166995048523, + "reward_std": 0.3216101437807083, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6259585380554199, + "step": 3770, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.00846199265215546, + "clip_ratio/high_mean": 0.0012625553936231881, + "clip_ratio/low_mean": 0.00030621195983258074, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001568767352728173, + "completion_length": 118.35750122070313, + "epoch": 0.7261550283354145, + "grad_norm": 1.6517783403396606, + "kl": 0.968211068212986, + "learning_rate": 2.5669624988939287e-07, + "loss": 0.1551, + "reward": 1.7871047019958497, + "reward_std": 0.21420088410377502, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.644802349805832, + "step": 3780, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.02817701958119869, + "clip_ratio/high_mean": 0.0037564294645562766, + "clip_ratio/low_mean": 0.011859719056519679, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015616148672415875, + "completion_length": 93.14750213623047, + "epoch": 0.7280760733839209, + "grad_norm": 11.322369575500488, + "kl": 0.45075275003910065, + "learning_rate": 2.5464133264294705e-07, + "loss": -0.0008, + "reward": 1.662767267227173, + "reward_std": 0.24967537969350814, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.5848211228847504, + "step": 3790, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.05006286900024861, + "clip_ratio/high_mean": 0.007249254969065077, + "clip_ratio/low_mean": 0.00040258544613607227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007651840391918086, + "completion_length": 110.32750396728515, + "epoch": 0.7299971184324272, + "grad_norm": 16.862590789794922, + "kl": 0.3901309326291084, + "learning_rate": 2.5259717939356175e-07, + "loss": -0.0019, + "reward": 1.7777814149856568, + "reward_std": 0.25982470586895945, + "rewards/code_format_reward": 0.987500011920929, + "rewards/code_reward": 0.6420157194137573, + "step": 3800, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.007159786019474268, + "clip_ratio/high_mean": 0.0011859470629133283, + "clip_ratio/low_mean": 0.0021440873795654626, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0033300343551672996, + "completion_length": 96.07000122070312, + "epoch": 0.7319181634809336, + "grad_norm": 2.4953460693359375, + "kl": 0.3146058402955532, + "learning_rate": 2.505638646379831e-07, + "loss": -0.0042, + "reward": 1.7296765804290772, + "reward_std": 0.3011175274848938, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6189007639884949, + "step": 3810, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.03548359724227339, + "clip_ratio/high_mean": 0.004679994014441036, + "clip_ratio/low_mean": 0.00017329893162241206, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004853292935877107, + "completion_length": 101.32000122070312, + "epoch": 0.7338392085294401, + "grad_norm": 3.954063892364502, + "kl": 0.34448319524526594, + "learning_rate": 2.485414624779603e-07, + "loss": -0.0051, + "reward": 1.690654444694519, + "reward_std": 0.24299487322568894, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.6000146985054016, + "step": 3820, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.00636189088691026, + "clip_ratio/high_mean": 0.0008543322241166606, + "clip_ratio/low_mean": 0.00028777473780792204, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011421069371863267, + "completion_length": 94.74000244140625, + "epoch": 0.7357602535779464, + "grad_norm": 1.0420587062835693, + "kl": 0.28902386128902435, + "learning_rate": 2.4653004661754703e-07, + "loss": 0.0021, + "reward": 1.929768443107605, + "reward_std": 0.19695264101028442, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.7173841595649719, + "step": 3830, + "zero_std_ratio": 0.7 + }, + { + "clip_ratio/high_max": 0.0385974693344906, + "clip_ratio/high_mean": 0.0054892279236810285, + "clip_ratio/low_mean": 0.0004371934803202748, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005926421421463601, + "completion_length": 100.23250122070313, + "epoch": 0.7376812986264528, + "grad_norm": 6.22709846496582, + "kl": 0.39053357392549515, + "learning_rate": 2.445296903604131e-07, + "loss": -0.0123, + "reward": 1.7683161497116089, + "reward_std": 0.4236398935317993, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.6413455486297608, + "step": 3840, + "zero_std_ratio": 0.3 + }, + { + "clip_ratio/high_max": 0.013776408764533699, + "clip_ratio/high_mean": 0.0019065461441641674, + "clip_ratio/low_mean": 0.0035487653221935034, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005455311315017752, + "completion_length": 91.36000213623046, + "epoch": 0.7396023436749591, + "grad_norm": 3.84639573097229, + "kl": 9.267435324192046, + "learning_rate": 2.4254046660717555e-07, + "loss": 0.0107, + "reward": 1.7194789409637452, + "reward_std": 0.23012096285820008, + "rewards/code_format_reward": 0.98125, + "rewards/code_reward": 0.6144269347190857, + "step": 3850, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.019276025268482044, + "clip_ratio/high_mean": 0.0034578723403683397, + "clip_ratio/low_mean": 0.0028569042566232382, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006314776389626786, + "completion_length": 96.3250015258789, + "epoch": 0.7415233887234656, + "grad_norm": 4.765519142150879, + "kl": 0.5375766545534134, + "learning_rate": 2.4056244785273895e-07, + "loss": -0.0038, + "reward": 1.713827419281006, + "reward_std": 0.28884910941123965, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6112887144088746, + "step": 3860, + "zero_std_ratio": 0.35 + }, + { + "clip_ratio/high_max": 0.06692883024225012, + "clip_ratio/high_mean": 0.008779459849756676, + "clip_ratio/low_mean": 0.0002708235711907037, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009050283460237552, + "completion_length": 103.41250152587891, + "epoch": 0.743444433771972, + "grad_norm": 2.68007493019104, + "kl": 0.34222877621650694, + "learning_rate": 2.3859570618365614e-07, + "loss": -0.0009, + "reward": 1.74418466091156, + "reward_std": 0.20953620076179505, + "rewards/code_format_reward": 0.9912499785423279, + "rewards/code_reward": 0.6242798089981079, + "step": 3870, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.01482260066550225, + "clip_ratio/high_mean": 0.0023997865355340764, + "clip_ratio/low_mean": 0.00038790585967944934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002787692387937568, + "completion_length": 98.42250061035156, + "epoch": 0.7453654788204783, + "grad_norm": 4.816893100738525, + "kl": 0.4661983668804169, + "learning_rate": 2.366403132754995e-07, + "loss": -0.0019, + "reward": 1.6338875532150268, + "reward_std": 0.21452725008130075, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.5697562634944916, + "step": 3880, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.02494101980701089, + "clip_ratio/high_mean": 0.003492716047912836, + "clip_ratio/low_mean": 0.00024301124794874341, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0037357273045927285, + "completion_length": 97.44000091552735, + "epoch": 0.7472865238689848, + "grad_norm": 82.46282958984375, + "kl": 0.5981974095106125, + "learning_rate": 2.3469634039024927e-07, + "loss": 0.0024, + "reward": 1.8161945581436156, + "reward_std": 0.17759706005454062, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6621597528457641, + "step": 3890, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.0019452353473752737, + "clip_ratio/high_mean": 0.00039936143439263106, + "clip_ratio/low_mean": 0.0002315789126441814, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006309403397608548, + "completion_length": 94.07750091552734, + "epoch": 0.7492075689174911, + "grad_norm": 6.090396404266357, + "kl": 0.8421477146446705, + "learning_rate": 2.3276385837369632e-07, + "loss": 0.014, + "reward": 1.4471250534057618, + "reward_std": 0.25895166750997306, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.4773125171661377, + "step": 3900, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.02143551183398813, + "clip_ratio/high_mean": 0.002903820894425735, + "clip_ratio/low_mean": 0.00011704202042892576, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0030208629119442775, + "completion_length": 89.32750091552734, + "epoch": 0.7511286139659975, + "grad_norm": 7.675207614898682, + "kl": 4.630686198174954, + "learning_rate": 2.3084293765286074e-07, + "loss": 0.0109, + "reward": 1.7639801740646361, + "reward_std": 0.32505679726600645, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6360525727272034, + "step": 3910, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.007216949050780385, + "clip_ratio/high_mean": 0.0012314463703660295, + "clip_ratio/low_mean": 0.000596191274235025, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018276376475114375, + "completion_length": 93.16250152587891, + "epoch": 0.7530496590145039, + "grad_norm": 3.4967644214630127, + "kl": 0.9979558669030666, + "learning_rate": 2.2893364823342454e-07, + "loss": 0.0016, + "reward": 1.5569410085678101, + "reward_std": 0.2807903170585632, + "rewards/code_format_reward": 0.9674999952316284, + "rewards/code_reward": 0.5365955173969269, + "step": 3920, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.01850514723919332, + "clip_ratio/high_mean": 0.003044746146770194, + "clip_ratio/low_mean": 0.0006967324326978997, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0037414785125292837, + "completion_length": 95.95500183105469, + "epoch": 0.7549707040630103, + "grad_norm": 2.8742544651031494, + "kl": 0.44021010398864746, + "learning_rate": 2.270360596971809e-07, + "loss": -0.0037, + "reward": 1.823073673248291, + "reward_std": 0.24968771934509276, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.663411819934845, + "step": 3930, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.015126590803265571, + "clip_ratio/high_mean": 0.0023361636558547616, + "clip_ratio/low_mean": 0.00015787699958309532, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002494040655437857, + "completion_length": 91.19500122070312, + "epoch": 0.7568917491115167, + "grad_norm": 3.40413236618042, + "kl": 0.386103405430913, + "learning_rate": 2.2515024119949826e-07, + "loss": -0.011, + "reward": 1.5718731164932251, + "reward_std": 0.2807211749255657, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.5415615499019623, + "step": 3940, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.015597179555334151, + "clip_ratio/high_mean": 0.0027747701620683073, + "clip_ratio/low_mean": 0.00042166481143794953, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003196435049176216, + "completion_length": 98.425, + "epoch": 0.758812794160023, + "grad_norm": 4.560734272003174, + "kl": 0.4831135801970959, + "learning_rate": 2.2327626146679974e-07, + "loss": -0.0022, + "reward": 1.7759766340255738, + "reward_std": 0.2547271862626076, + "rewards/code_format_reward": 0.9625, + "rewards/code_reward": 0.6473633050918579, + "step": 3950, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.008683394081890583, + "clip_ratio/high_mean": 0.0011145618045702577, + "clip_ratio/low_mean": 0.0009394719265401364, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020540336496196686, + "completion_length": 102.31250305175782, + "epoch": 0.7607338392085294, + "grad_norm": 0.1577247530221939, + "kl": 1.2770531885325909, + "learning_rate": 2.2141418879405855e-07, + "loss": 0.0032, + "reward": 1.7324957370758056, + "reward_std": 0.19914634823799132, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.6199978470802308, + "step": 3960, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.004086668835952878, + "clip_ratio/high_mean": 0.0005708287237212062, + "clip_ratio/low_mean": 2.821670495904982e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005990454228594899, + "completion_length": 95.21750335693359, + "epoch": 0.7626548842570359, + "grad_norm": 268.0164794921875, + "kl": 3.985953611135483, + "learning_rate": 2.1956409104230986e-07, + "loss": 0.0127, + "reward": 1.7277408480644225, + "reward_std": 0.19516595900058747, + "rewards/code_format_reward": 0.9737500071525573, + "rewards/code_reward": 0.6204329133033752, + "step": 3970, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.02115430913399905, + "clip_ratio/high_mean": 0.003100222998182289, + "clip_ratio/low_mean": 0.00045731081045232715, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0035575337868067438, + "completion_length": 99.47250213623047, + "epoch": 0.7645759293055422, + "grad_norm": 4.087578773498535, + "kl": 0.2619202695786953, + "learning_rate": 2.1772603563617603e-07, + "loss": -0.0024, + "reward": 1.6976868152618407, + "reward_std": 0.31094631999731065, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.6041558861732483, + "step": 3980, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.022111613873858006, + "clip_ratio/high_mean": 0.0033171431292430497, + "clip_ratio/low_mean": 0.00019350402581039817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003510647150687873, + "completion_length": 93.09000091552734, + "epoch": 0.7664969743540486, + "grad_norm": 2.557553291320801, + "kl": 0.4590821463614702, + "learning_rate": 2.1590008956141137e-07, + "loss": -0.0014, + "reward": 1.7825278520584107, + "reward_std": 0.26515288949012755, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.6440764307975769, + "step": 3990, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.03076116186566651, + "clip_ratio/high_mean": 0.004437833256088197, + "clip_ratio/low_mean": 0.0004819675668841228, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004919800782226957, + "completion_length": 89.73500061035156, + "epoch": 0.7684180194025549, + "grad_norm": 2.5422067642211914, + "kl": 0.26607592329382895, + "learning_rate": 2.1408631936245908e-07, + "loss": 0.0026, + "reward": 1.8288384914398192, + "reward_std": 0.2508297085762024, + "rewards/code_format_reward": 0.9837499856948853, + "rewards/code_reward": 0.6684817314147949, + "step": 4000, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.020826040930114687, + "clip_ratio/high_mean": 0.0040985049330629405, + "clip_ratio/low_mean": 0.000369196553947404, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004467701492831111, + "completion_length": 97.69500122070312, + "epoch": 0.7703390644510614, + "grad_norm": 2.079371929168701, + "kl": 0.3304180882871151, + "learning_rate": 2.122847911400278e-07, + "loss": 0.0019, + "reward": 1.693557620048523, + "reward_std": 0.21333991810679437, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5977162718772888, + "step": 4010, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.009364375309087337, + "clip_ratio/high_mean": 0.0013745424774242565, + "clip_ratio/low_mean": 0.0020853754234849476, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034599179547512905, + "completion_length": 94.00750274658203, + "epoch": 0.7722601094995678, + "grad_norm": 3.2660512924194336, + "kl": 0.6432372182607651, + "learning_rate": 2.1049557054868082e-07, + "loss": 0.0073, + "reward": 1.8483120203018188, + "reward_std": 0.316910046339035, + "rewards/code_format_reward": 0.9649999856948852, + "rewards/code_reward": 0.6829060018062592, + "step": 4020, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.08980275879148394, + "clip_ratio/high_mean": 0.011746273408061825, + "clip_ratio/low_mean": 0.000331929670937825, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012078202966949902, + "completion_length": 92.7925018310547, + "epoch": 0.7741811545480741, + "grad_norm": 3.004549503326416, + "kl": 0.74478175714612, + "learning_rate": 2.0871872279444554e-07, + "loss": -0.0021, + "reward": 1.7010861873626708, + "reward_std": 0.25111902356147764, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.6071055889129638, + "step": 4030, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.0778543038177304, + "clip_ratio/high_mean": 0.00988141688721953, + "clip_ratio/low_mean": 0.0002543082577176392, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01013572499359725, + "completion_length": 105.63250122070312, + "epoch": 0.7761021995965806, + "grad_norm": 6.268821716308594, + "kl": 0.32837071269750595, + "learning_rate": 2.0695431263243512e-07, + "loss": -0.0003, + "reward": 1.716653084754944, + "reward_std": 0.2870768278837204, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6108265280723572, + "step": 4040, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.07360692555084825, + "clip_ratio/high_mean": 0.009302017895970493, + "clip_ratio/low_mean": 0.0003425976261496544, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009644615522120148, + "completion_length": 91.73750152587891, + "epoch": 0.7780232446450869, + "grad_norm": 4.801341533660889, + "kl": 13.291237189993263, + "learning_rate": 2.052024043644897e-07, + "loss": 0.0294, + "reward": 1.7232446193695068, + "reward_std": 0.24269133806228638, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6134972870349884, + "step": 4050, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.012731208954937756, + "clip_ratio/high_mean": 0.00180651948612649, + "clip_ratio/low_mean": 0.00015854310477152466, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019650626258226112, + "completion_length": 92.22000274658203, + "epoch": 0.7799442896935933, + "grad_norm": 0.6561126112937927, + "kl": 0.4966626279056072, + "learning_rate": 2.0346306183683254e-07, + "loss": 0.0001, + "reward": 1.8969059467315674, + "reward_std": 0.33292114436626435, + "rewards/code_format_reward": 0.9800000071525574, + "rewards/code_reward": 0.7034529447555542, + "step": 4060, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.015003547444939614, + "clip_ratio/high_mean": 0.002088976529194042, + "clip_ratio/low_mean": 0.0003269152017310262, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002415891730925068, + "completion_length": 88.61250152587891, + "epoch": 0.7818653347420997, + "grad_norm": 3.062511920928955, + "kl": 27.40203034952283, + "learning_rate": 2.0173634843774363e-07, + "loss": 0.0554, + "reward": 1.7011754512786865, + "reward_std": 0.3188599109649658, + "rewards/code_format_reward": 0.981250011920929, + "rewards/code_reward": 0.6052752196788788, + "step": 4070, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.006837105128215626, + "clip_ratio/high_mean": 0.0008925169277063105, + "clip_ratio/low_mean": 0.0005512935545993969, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014438104728469626, + "completion_length": 91.84750213623047, + "epoch": 0.7837863797906061, + "grad_norm": 3.0254440307617188, + "kl": 1.3981286019086838, + "learning_rate": 2.0002232709524897e-07, + "loss": 0.0033, + "reward": 1.6401101350784302, + "reward_std": 0.26853239685297015, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.5738050699234009, + "step": 4080, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.03983018643921241, + "clip_ratio/high_mean": 0.005185264609463047, + "clip_ratio/low_mean": 0.0019072047754889355, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007092469278723002, + "completion_length": 88.79250030517578, + "epoch": 0.7857074248391125, + "grad_norm": 2.8119072914123535, + "kl": 0.41205914914608, + "learning_rate": 1.983210602748279e-07, + "loss": -0.0029, + "reward": 1.9083050966262818, + "reward_std": 0.29446094632148745, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.7085274815559387, + "step": 4090, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.013765955006238072, + "clip_ratio/high_mean": 0.0018926289907540196, + "clip_ratio/low_mean": 0.0033484802523162218, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00524110905098496, + "completion_length": 85.72500305175781, + "epoch": 0.7876284698876188, + "grad_norm": 9.436022758483887, + "kl": 0.5864221028983593, + "learning_rate": 1.966326099771361e-07, + "loss": -0.0013, + "reward": 1.8478533029556274, + "reward_std": 0.2244624227285385, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6770516157150268, + "step": 4100, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.008409230364486575, + "clip_ratio/high_mean": 0.0011749810015317052, + "clip_ratio/low_mean": 0.00043775633239420133, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001612737326649949, + "completion_length": 91.16000213623047, + "epoch": 0.7895495149361252, + "grad_norm": 6.15724515914917, + "kl": 19.288723162561656, + "learning_rate": 1.9495703773574628e-07, + "loss": 0.0383, + "reward": 1.6099607944488525, + "reward_std": 0.30300846993923186, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.5599803984165191, + "step": 4110, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.009142859559506177, + "clip_ratio/high_mean": 0.001581054090638645, + "clip_ratio/low_mean": 0.0003552554393536411, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019363095459993928, + "completion_length": 91.99500122070313, + "epoch": 0.7914705599846317, + "grad_norm": 6.634824752807617, + "kl": 6.53539779484272, + "learning_rate": 1.9329440461490576e-07, + "loss": 0.0342, + "reward": 1.647179627418518, + "reward_std": 0.2863168239593506, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5770273089408875, + "step": 4120, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.002549535338766873, + "clip_ratio/high_mean": 0.0003399143257411197, + "clip_ratio/low_mean": 0.00017536718805786223, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000515281516709365, + "completion_length": 90.89250183105469, + "epoch": 0.793391605033138, + "grad_norm": 2.817605972290039, + "kl": 2.417458937317133, + "learning_rate": 1.9164477120731066e-07, + "loss": 0.0066, + "reward": 1.7660948038101196, + "reward_std": 0.2769928514957428, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.640859854221344, + "step": 4130, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.037738511635689066, + "clip_ratio/high_mean": 0.0050403060296957845, + "clip_ratio/low_mean": 0.0007518758837250061, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005792181929427898, + "completion_length": 96.35750122070313, + "epoch": 0.7953126500816444, + "grad_norm": 4.240172386169434, + "kl": 0.28425633125007155, + "learning_rate": 1.900081976318983e-07, + "loss": 0.002, + "reward": 1.6942025184631349, + "reward_std": 0.3146607309579849, + "rewards/code_format_reward": 0.9737499833106995, + "rewards/code_reward": 0.6036637306213379, + "step": 4140, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.005694918753579259, + "clip_ratio/high_mean": 0.0007544978521764279, + "clip_ratio/low_mean": 0.000571403895446565, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001325901737436652, + "completion_length": 91.79500122070313, + "epoch": 0.7972336951301509, + "grad_norm": 3.9649434089660645, + "kl": 0.5314306125044823, + "learning_rate": 1.8838474353165547e-07, + "loss": -0.0054, + "reward": 1.7638010501861572, + "reward_std": 0.2793388396501541, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.6362755179405213, + "step": 4150, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.07577090607956052, + "clip_ratio/high_mean": 0.009897856542374938, + "clip_ratio/low_mean": 0.00011142043076688424, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010009276978962589, + "completion_length": 94.04000244140624, + "epoch": 0.7991547401786572, + "grad_norm": 2.2340188026428223, + "kl": 0.524626237899065, + "learning_rate": 1.8677446807144554e-07, + "loss": -0.0045, + "reward": 1.7472325563430786, + "reward_std": 0.3027869775891304, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.6289287328720092, + "step": 4160, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.012572024948894978, + "clip_ratio/high_mean": 0.0020916348788887263, + "clip_ratio/low_mean": 0.00022255638323258607, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023141912854043765, + "completion_length": 94.53000183105469, + "epoch": 0.8010757852271636, + "grad_norm": 10.561907768249512, + "kl": 2.102495136484504, + "learning_rate": 1.8517742993585178e-07, + "loss": 0.0137, + "reward": 1.7456205368041993, + "reward_std": 0.2167625606060028, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.6262477397918701, + "step": 4170, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.06855184989399277, + "clip_ratio/high_mean": 0.008778795686521335, + "clip_ratio/low_mean": 5.122950533404946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008830025191855384, + "completion_length": 101.0425018310547, + "epoch": 0.8029968302756699, + "grad_norm": 5.673184871673584, + "kl": 0.428597304970026, + "learning_rate": 1.835936873270389e-07, + "loss": -0.0078, + "reward": 1.818405318260193, + "reward_std": 0.23994216322898865, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6626401782035828, + "step": 4180, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.002845590282231569, + "clip_ratio/high_mean": 0.0004983038117643446, + "clip_ratio/low_mean": 0.00037282529519870876, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008711291156942025, + "completion_length": 92.31250305175782, + "epoch": 0.8049178753241764, + "grad_norm": 6.281589508056641, + "kl": 0.4346353754401207, + "learning_rate": 1.8202329796263172e-07, + "loss": -0.0009, + "reward": 1.8768694639205932, + "reward_std": 0.21767425537109375, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6918722629547119, + "step": 4190, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.003876271191984415, + "clip_ratio/high_mean": 0.0004845338989980519, + "clip_ratio/low_mean": 0.0001560977878398262, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006406316824723035, + "completion_length": 75.7275016784668, + "epoch": 0.8068389203726828, + "grad_norm": 1.0423272848129272, + "kl": 0.9193772681057453, + "learning_rate": 1.8046631907361226e-07, + "loss": 0.0041, + "reward": 1.8756553649902343, + "reward_std": 0.18836807161569596, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.6893901348114013, + "step": 4200, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.004609162057749927, + "clip_ratio/high_mean": 0.0007380300055956468, + "clip_ratio/low_mean": 0.00015018127305665985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000888211271376349, + "completion_length": 86.45750122070312, + "epoch": 0.8087599654211891, + "grad_norm": 4.096966743469238, + "kl": 0.45643181502819063, + "learning_rate": 1.7892280740223303e-07, + "loss": -0.004, + "reward": 1.5836501359939574, + "reward_std": 0.2258547842502594, + "rewards/code_format_reward": 0.9799999952316284, + "rewards/code_reward": 0.5468250632286071, + "step": 4210, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.007562826108187437, + "clip_ratio/high_mean": 0.0010166528285481037, + "clip_ratio/low_mean": 0.000701455632224679, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017181085073389112, + "completion_length": 90.34250335693359, + "epoch": 0.8106810104696955, + "grad_norm": 0.29423439502716064, + "kl": 0.2636001568287611, + "learning_rate": 1.7739281919995045e-07, + "loss": 0.0161, + "reward": 1.5646157741546631, + "reward_std": 0.12648468129336835, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5363703727722168, + "step": 4220, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.02073557274416089, + "clip_ratio/high_mean": 0.002738419675733894, + "clip_ratio/low_mean": 0.001565844019933138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004304263507947326, + "completion_length": 85.92750091552735, + "epoch": 0.8126020555182019, + "grad_norm": 3.8796801567077637, + "kl": 0.6587013073265553, + "learning_rate": 1.7587641022537335e-07, + "loss": -0.0031, + "reward": 1.598485040664673, + "reward_std": 0.23664331436157227, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5526800036430359, + "step": 4230, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.0027086240705102684, + "clip_ratio/high_mean": 0.0003605463745770976, + "clip_ratio/low_mean": 0.0004666288397856988, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008271752245491371, + "completion_length": 86.6875, + "epoch": 0.8145231005667083, + "grad_norm": 6.270168781280518, + "kl": 3.9651204235851765, + "learning_rate": 1.7437363574223244e-07, + "loss": 0.0141, + "reward": 1.8213656187057494, + "reward_std": 0.2221561223268509, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.66474529504776, + "step": 4240, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.009644670388661325, + "clip_ratio/high_mean": 0.0013658979878528044, + "clip_ratio/low_mean": 0.0006750999338692055, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002040997930453159, + "completion_length": 86.42250061035156, + "epoch": 0.8164441456152147, + "grad_norm": 4.402440071105957, + "kl": 0.27487861886620524, + "learning_rate": 1.7288455051736474e-07, + "loss": -0.0005, + "reward": 1.6581492662429809, + "reward_std": 0.14444592781364918, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5825121104717255, + "step": 4250, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.017426467640325426, + "clip_ratio/high_mean": 0.0023936200188472865, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023936200188472865, + "completion_length": 88.66250152587891, + "epoch": 0.818365190663721, + "grad_norm": 15.625293731689453, + "kl": 0.5453658372163772, + "learning_rate": 1.7140920881871927e-07, + "loss": 0.0001, + "reward": 1.9025921821594238, + "reward_std": 0.1951783686876297, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.7037960886955261, + "step": 4260, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.024276501801796257, + "clip_ratio/high_mean": 0.0037041545001557097, + "clip_ratio/low_mean": 0.0004929742426611483, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004197128777741454, + "completion_length": 94.525, + "epoch": 0.8202862357122275, + "grad_norm": 19.728607177734375, + "kl": 3.983573118597269, + "learning_rate": 1.699476644133778e-07, + "loss": 0.0122, + "reward": 1.7488954544067383, + "reward_std": 0.2558127373456955, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6269477069377899, + "step": 4270, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.00913497168221511, + "clip_ratio/high_mean": 0.0011987716374278535, + "clip_ratio/low_mean": 0.0007998564062290825, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019986280538432767, + "completion_length": 87.73999938964843, + "epoch": 0.8222072807607338, + "grad_norm": 4.567457675933838, + "kl": 0.6975361555814743, + "learning_rate": 1.6849997056559662e-07, + "loss": -0.0116, + "reward": 1.7202219009399413, + "reward_std": 0.27057143300771713, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.6169859290122985, + "step": 4280, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.016106344643048942, + "clip_ratio/high_mean": 0.002376156343962066, + "clip_ratio/low_mean": 5.540161509998143e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024315579648828134, + "completion_length": 94.21750030517578, + "epoch": 0.8241283258092402, + "grad_norm": 17.505773544311523, + "kl": 1.1381098613142968, + "learning_rate": 1.670661800348644e-07, + "loss": -0.0006, + "reward": 1.7664429664611816, + "reward_std": 0.283676877617836, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.6369715094566345, + "step": 4290, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.05613061334006488, + "clip_ratio/high_mean": 0.0073514855874236675, + "clip_ratio/low_mean": 0.00012296391359996052, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0074744494573678825, + "completion_length": 94.24500122070313, + "epoch": 0.8260493708577467, + "grad_norm": 36.729576110839844, + "kl": 2.2444246262311935, + "learning_rate": 1.656463450739801e-07, + "loss": 0.0024, + "reward": 1.7431164741516114, + "reward_std": 0.29661422967910767, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.6268707036972045, + "step": 4300, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.003962649451568723, + "clip_ratio/high_mean": 0.0005655559070874006, + "clip_ratio/low_mean": 0.00023454214970115573, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008000980655197054, + "completion_length": 91.94250030517578, + "epoch": 0.827970415906253, + "grad_norm": 5.331088066101074, + "kl": 0.6558065637946129, + "learning_rate": 1.6424051742714851e-07, + "loss": 0.0002, + "reward": 1.76786208152771, + "reward_std": 0.17127570807933806, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.6361185550689697, + "step": 4310, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.007816581195220352, + "clip_ratio/high_mean": 0.001516599569004029, + "clip_ratio/low_mean": 4.7630388871766625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015642299549654126, + "completion_length": 82.7000015258789, + "epoch": 0.8298914609547594, + "grad_norm": 9.622750282287598, + "kl": 0.9509772717952728, + "learning_rate": 1.6284874832809436e-07, + "loss": 0.0023, + "reward": 1.9346927881240845, + "reward_std": 0.3074748650193214, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.7189089298248291, + "step": 4320, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.02933923137607053, + "clip_ratio/high_mean": 0.004640504893905018, + "clip_ratio/low_mean": 0.00011013215407729149, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004750637047982309, + "completion_length": 88.08000183105469, + "epoch": 0.8318125060032657, + "grad_norm": 1.8961539268493652, + "kl": 1.2761327236890794, + "learning_rate": 1.614710884981951e-07, + "loss": 0.0002, + "reward": 1.5815791606903076, + "reward_std": 0.24661691784858703, + "rewards/code_format_reward": 0.9862499833106995, + "rewards/code_reward": 0.5442270636558533, + "step": 4330, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.035589413810521366, + "clip_ratio/high_mean": 0.005689902242738754, + "clip_ratio/low_mean": 0.00015636042662663384, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005846262606792152, + "completion_length": 89.16500244140624, + "epoch": 0.8337335510517722, + "grad_norm": 1.6006284952163696, + "kl": 0.6420656457543373, + "learning_rate": 1.6010758814463287e-07, + "loss": 0.0027, + "reward": 1.643228530883789, + "reward_std": 0.2129346549510956, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.5741142451763153, + "step": 4340, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.007347659638617188, + "clip_ratio/high_mean": 0.001005946182704065, + "clip_ratio/low_mean": 0.00028780620195902886, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012937523904838599, + "completion_length": 98.85000152587891, + "epoch": 0.8356545961002786, + "grad_norm": 5.479083061218262, + "kl": 0.3409851986914873, + "learning_rate": 1.5875829695856406e-07, + "loss": -0.0007, + "reward": 1.882705855369568, + "reward_std": 0.22037020921707154, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.6938528895378113, + "step": 4350, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.032100778096355496, + "clip_ratio/high_mean": 0.004409284892608412, + "clip_ratio/low_mean": 4.9924499762710184e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004459209358901717, + "completion_length": 90.73500213623046, + "epoch": 0.8375756411487849, + "grad_norm": 56.100852966308594, + "kl": 0.22566271349787712, + "learning_rate": 1.5742326411330942e-07, + "loss": 0.0011, + "reward": 1.8064903020858765, + "reward_std": 0.1691088706254959, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.65418261885643, + "step": 4360, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.005500979837961495, + "clip_ratio/high_mean": 0.0008934643206885085, + "clip_ratio/low_mean": 0.0005694760067854077, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001462940318742767, + "completion_length": 93.03750305175781, + "epoch": 0.8394966861972913, + "grad_norm": 7.828958034515381, + "kl": 0.6565275602042675, + "learning_rate": 1.5610253826256036e-07, + "loss": 0.003, + "reward": 1.7732144832611083, + "reward_std": 0.33924323320388794, + "rewards/code_format_reward": 0.9825000047683716, + "rewards/code_reward": 0.6409822225570678, + "step": 4370, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.0038046793546527625, + "clip_ratio/high_mean": 0.0004755849193315953, + "clip_ratio/low_mean": 0.0006387827248545364, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011143676441861317, + "completion_length": 85.95250244140625, + "epoch": 0.8414177312457977, + "grad_norm": 3.0064802169799805, + "kl": 9.46174124404788, + "learning_rate": 1.5479616753860792e-07, + "loss": 0.0195, + "reward": 1.8130270481109618, + "reward_std": 0.1679749459028244, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6583885312080383, + "step": 4380, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.020394568890333177, + "clip_ratio/high_mean": 0.002549321111291647, + "clip_ratio/low_mean": 0.0012285682838410138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003777889395132661, + "completion_length": 94.72500305175781, + "epoch": 0.8433387762943041, + "grad_norm": 8.23426342010498, + "kl": 0.3538756832480431, + "learning_rate": 1.5350419955058645e-07, + "loss": -0.0046, + "reward": 1.6075192928314208, + "reward_std": 0.16927714347839357, + "rewards/code_format_reward": 0.9962499976158142, + "rewards/code_reward": 0.5546970963478088, + "step": 4390, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.06588131491444074, + "clip_ratio/high_mean": 0.009102540424646578, + "clip_ratio/low_mean": 0.0007730728961178101, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009875613666372374, + "completion_length": 90.04500274658203, + "epoch": 0.8452598213428105, + "grad_norm": 7.7215776443481445, + "kl": 0.2362464390695095, + "learning_rate": 1.522266813827407e-07, + "loss": 0.0036, + "reward": 1.8586368560791016, + "reward_std": 0.2194239765405655, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.6805683970451355, + "step": 4400, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.004174966411665082, + "clip_ratio/high_mean": 0.0007423789938911796, + "clip_ratio/low_mean": 7.375134955509566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008161303412634879, + "completion_length": 88.90750122070312, + "epoch": 0.8471808663913168, + "grad_norm": 2.829716920852661, + "kl": 1.5581397600471973, + "learning_rate": 1.509636595927078e-07, + "loss": 0.003, + "reward": 1.9052275657653808, + "reward_std": 0.256375952064991, + "rewards/code_format_reward": 0.9787499904632568, + "rewards/code_reward": 0.7079262495040893, + "step": 4410, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.07640773041639477, + "clip_ratio/high_mean": 0.009898414360941387, + "clip_ratio/low_mean": 9.467430354561656e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00999308866157662, + "completion_length": 95.14500122070312, + "epoch": 0.8491019114398233, + "grad_norm": 0.3017069101333618, + "kl": 0.8497596487402916, + "learning_rate": 1.4971518020982232e-07, + "loss": -0.0017, + "reward": 1.5574845552444458, + "reward_std": 0.1220448928885162, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5321797609329224, + "step": 4420, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.02955477687064558, + "clip_ratio/high_mean": 0.00414732932113111, + "clip_ratio/low_mean": 4.42216987721622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00419155100826174, + "completion_length": 99.7, + "epoch": 0.8510229564883296, + "grad_norm": 5.942404747009277, + "kl": 0.5168043114244938, + "learning_rate": 1.4848128873343773e-07, + "loss": -0.0003, + "reward": 1.6633994817733764, + "reward_std": 0.2619109332561493, + "rewards/code_format_reward": 0.9762499928474426, + "rewards/code_reward": 0.5876372039318085, + "step": 4430, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.014663098810706288, + "clip_ratio/high_mean": 0.0024809099428239278, + "clip_ratio/low_mean": 3.1672295881435276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002512582238705363, + "completion_length": 100.20750122070312, + "epoch": 0.852944001536836, + "grad_norm": 3.1078836917877197, + "kl": 0.39873379915952684, + "learning_rate": 1.4726203013126844e-07, + "loss": 0.006, + "reward": 1.7631917238235473, + "reward_std": 0.22433922737836837, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.6350333511829376, + "step": 4440, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.014706605696119368, + "clip_ratio/high_mean": 0.00257694432802964, + "clip_ratio/low_mean": 0.00032811136916279795, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002905055697192438, + "completion_length": 99.6875, + "epoch": 0.8548650465853425, + "grad_norm": 8.708415985107422, + "kl": 0.4677444875240326, + "learning_rate": 1.4605744883775122e-07, + "loss": -0.0036, + "reward": 1.8840698957443238, + "reward_std": 0.2510286644101143, + "rewards/code_format_reward": 0.9850000143051147, + "rewards/code_reward": 0.6957849264144897, + "step": 4450, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.008664844953455032, + "clip_ratio/high_mean": 0.0019024941400857642, + "clip_ratio/low_mean": 0.002259151160251349, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004161645277054049, + "completion_length": 89.9000015258789, + "epoch": 0.8567860916338488, + "grad_norm": 7.514847755432129, + "kl": 0.3879747323691845, + "learning_rate": 1.4486758875242557e-07, + "loss": -0.0046, + "reward": 1.9147763013839723, + "reward_std": 0.2857444554567337, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.7102006316184998, + "step": 4460, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.01213214877061546, + "clip_ratio/high_mean": 0.0017360628451569937, + "clip_ratio/low_mean": 0.0008027118048630655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002538774654385634, + "completion_length": 100.34250183105469, + "epoch": 0.8587071366823552, + "grad_norm": 4.4957380294799805, + "kl": 0.7120470233261585, + "learning_rate": 1.436924932383341e-07, + "loss": -0.0029, + "reward": 1.7210463523864745, + "reward_std": 0.348609185218811, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.6164606809616089, + "step": 4470, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio/high_max": 0.04909939672797918, + "clip_ratio/high_mean": 0.006639666750561446, + "clip_ratio/low_mean": 0.0001961415633559227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006835808313917368, + "completion_length": 89.65500030517578, + "epoch": 0.8606281817308616, + "grad_norm": 0.6291245818138123, + "kl": 0.914973171055317, + "learning_rate": 1.4253220512044194e-07, + "loss": 0.0052, + "reward": 1.5310453414916991, + "reward_std": 0.2040669571608305, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.519272655248642, + "step": 4480, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.023789329756982624, + "clip_ratio/high_mean": 0.0034216867323266344, + "clip_ratio/low_mean": 6.479026051238179e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003486476981197484, + "completion_length": 94.68500061035157, + "epoch": 0.862549226779368, + "grad_norm": 3.6435203552246094, + "kl": 0.24871882200241088, + "learning_rate": 1.4138676668407637e-07, + "loss": -0.004, + "reward": 1.7728254079818726, + "reward_std": 0.21846108362078667, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.6386001646518707, + "step": 4490, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.02858473571250215, + "clip_ratio/high_mean": 0.004445229801058303, + "clip_ratio/low_mean": 0.005347848287783563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009793078135407996, + "completion_length": 94.14000091552734, + "epoch": 0.8644702718278744, + "grad_norm": 7.250815391540527, + "kl": 1.268965845555067, + "learning_rate": 1.402562196733855e-07, + "loss": 0.1222, + "reward": 1.6482325553894044, + "reward_std": 0.321136474609375, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5813037693500519, + "step": 4500, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.0016861034324392675, + "clip_ratio/high_mean": 0.00025714511721162123, + "clip_ratio/low_mean": 8.047257215366699e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003376176857273094, + "completion_length": 89.82750244140625, + "epoch": 0.8663913168763807, + "grad_norm": 1.5697243213653564, + "kl": 0.3187939524650574, + "learning_rate": 1.3914060528981713e-07, + "loss": -0.0008, + "reward": 1.6549904108047486, + "reward_std": 0.15924324840307236, + "rewards/code_format_reward": 0.9912499785423279, + "rewards/code_reward": 0.5796826839447021, + "step": 4510, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.005441831634379923, + "clip_ratio/high_mean": 0.0007462791429134086, + "clip_ratio/low_mean": 0.0008039395906962454, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015502187496167607, + "completion_length": 97.60250091552734, + "epoch": 0.8683123619248871, + "grad_norm": 2.864607334136963, + "kl": 0.36184127181768416, + "learning_rate": 1.38039964190617e-07, + "loss": -0.0068, + "reward": 1.5000358819961548, + "reward_std": 0.22264644205570222, + "rewards/code_format_reward": 0.9850000023841858, + "rewards/code_reward": 0.5037679553031922, + "step": 4520, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.045888486225157975, + "clip_ratio/high_mean": 0.006488210440147668, + "clip_ratio/low_mean": 4.380840982776135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006532018849975429, + "completion_length": 107.06000061035157, + "epoch": 0.8702334069733936, + "grad_norm": 3.5723934173583984, + "kl": 0.21280892938375473, + "learning_rate": 1.369543364873474e-07, + "loss": 0.0008, + "reward": 1.8976154088974, + "reward_std": 0.22375442534685136, + "rewards/code_format_reward": 0.9737499952316284, + "rewards/code_reward": 0.7053701996803283, + "step": 4530, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.021734172268770634, + "clip_ratio/high_mean": 0.00285033899708651, + "clip_ratio/low_mean": 0.00015430593703058548, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003004644898464903, + "completion_length": 90.5125, + "epoch": 0.8721544520218999, + "grad_norm": 26.33332633972168, + "kl": 16.64756402745843, + "learning_rate": 1.3588376174442495e-07, + "loss": 0.0407, + "reward": 1.8465018033981324, + "reward_std": 0.26863393038511274, + "rewards/code_format_reward": 0.9900000095367432, + "rewards/code_reward": 0.6757509171962738, + "step": 4540, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.01540006476570852, + "clip_ratio/high_mean": 0.00195431642132462, + "clip_ratio/low_mean": 0.0003294220077805221, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022837384400190785, + "completion_length": 91.96750183105469, + "epoch": 0.8740754970704063, + "grad_norm": 5.637061595916748, + "kl": 0.5615961387753486, + "learning_rate": 1.348282789776792e-07, + "loss": 0.0006, + "reward": 1.7335857629776001, + "reward_std": 0.16677757501602172, + "rewards/code_format_reward": 0.9712500095367431, + "rewards/code_reward": 0.6239803791046142, + "step": 4550, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.012607228197157382, + "clip_ratio/high_mean": 0.0017772652208805084, + "clip_ratio/low_mean": 0.00020470973395276816, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019819749519228934, + "completion_length": 90.42000122070313, + "epoch": 0.8759965421189126, + "grad_norm": 4.87404727935791, + "kl": 0.5051522366702557, + "learning_rate": 1.3378792665293032e-07, + "loss": -0.0007, + "reward": 1.8114176988601685, + "reward_std": 0.27143858969211576, + "rewards/code_format_reward": 0.9687499880790711, + "rewards/code_reward": 0.6635213375091553, + "step": 4560, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.003521555650513619, + "clip_ratio/high_mean": 0.0005311336179147474, + "clip_ratio/low_mean": 0.00039719248161418363, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009283260951633565, + "completion_length": 96.31000061035157, + "epoch": 0.8779175871674191, + "grad_norm": 3.5294971466064453, + "kl": 0.44891551434993743, + "learning_rate": 1.3276274268458749e-07, + "loss": -0.0011, + "reward": 1.8015916109085084, + "reward_std": 0.23535949736833572, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.65454580783844, + "step": 4570, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.016813984792679548, + "clip_ratio/high_mean": 0.0026305554260034115, + "clip_ratio/low_mean": 0.00013278115511639044, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027633365796646105, + "completion_length": 92.12000122070313, + "epoch": 0.8798386322159255, + "grad_norm": 3.3064281940460205, + "kl": 147.6638460204005, + "learning_rate": 1.3175276443426704e-07, + "loss": 0.3018, + "reward": 1.8557111263275146, + "reward_std": 0.21927002370357512, + "rewards/code_format_reward": 0.9924999833106994, + "rewards/code_reward": 0.6797305464744567, + "step": 4580, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.004338507051579654, + "clip_ratio/high_mean": 0.0005835221760207787, + "clip_ratio/low_mean": 9.975638386094943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006832785555161535, + "completion_length": 96.14250183105469, + "epoch": 0.8817596772644318, + "grad_norm": 5.933443546295166, + "kl": 0.7469953082501888, + "learning_rate": 1.3075802870943102e-07, + "loss": -0.0005, + "reward": 1.7140401601791382, + "reward_std": 0.32567469477653505, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.6145200908184052, + "step": 4590, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.008221420878544449, + "clip_ratio/high_mean": 0.0010466745734447613, + "clip_ratio/low_mean": 0.00021989296365063638, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012665675370953978, + "completion_length": 96.91000213623047, + "epoch": 0.8836807223129383, + "grad_norm": 3.6585068702697754, + "kl": 0.2884219281375408, + "learning_rate": 1.2977857176204554e-07, + "loss": -0.0014, + "reward": 1.745366358757019, + "reward_std": 0.28437634110450744, + "rewards/code_format_reward": 0.9612499952316285, + "rewards/code_reward": 0.6323706865310669, + "step": 4600, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.013106013461947442, + "clip_ratio/high_mean": 0.001983167743310332, + "clip_ratio/low_mean": 0.0010545071098022162, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0030376748647540806, + "completion_length": 95.46000366210937, + "epoch": 0.8856017673614446, + "grad_norm": 3.166572332382202, + "kl": 0.7999920375645161, + "learning_rate": 1.2881442928725997e-07, + "loss": 0.0024, + "reward": 1.7604058027267455, + "reward_std": 0.1588110476732254, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6342653870582581, + "step": 4610, + "zero_std_ratio": 0.725 + }, + { + "clip_ratio/high_max": 0.03971324802841991, + "clip_ratio/high_mean": 0.005240282195154577, + "clip_ratio/low_mean": 0.00012449334171833472, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005364775560155977, + "completion_length": 91.37250213623047, + "epoch": 0.887522812409951, + "grad_norm": 1.2688926458358765, + "kl": 52.058202140033245, + "learning_rate": 1.2786563642210536e-07, + "loss": 0.1059, + "reward": 1.6578764081001283, + "reward_std": 0.1922210179269314, + "rewards/code_format_reward": 0.9724999904632569, + "rewards/code_reward": 0.5858131945133209, + "step": 4620, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.014312215382233262, + "clip_ratio/high_mean": 0.002295189391588792, + "clip_ratio/low_mean": 0.0010927254450507462, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0033879148249980062, + "completion_length": 92.44750061035157, + "epoch": 0.8894438574584574, + "grad_norm": 1.0473991632461548, + "kl": 0.48051133900880816, + "learning_rate": 1.269322277442151e-07, + "loss": 0.0015, + "reward": 1.8454564094543457, + "reward_std": 0.23949076235294342, + "rewards/code_format_reward": 0.9824999809265137, + "rewards/code_reward": 0.6771032094955445, + "step": 4630, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.040262592025101185, + "clip_ratio/high_mean": 0.005372756696306169, + "clip_ratio/low_mean": 0.0007898360927356407, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006162592757027597, + "completion_length": 84.6050018310547, + "epoch": 0.8913649025069638, + "grad_norm": 6.553028106689453, + "kl": 0.6895815744996071, + "learning_rate": 1.2601423727056346e-07, + "loss": -0.0001, + "reward": 1.6561978340148926, + "reward_std": 0.36703028678894045, + "rewards/code_format_reward": 0.975, + "rewards/code_reward": 0.5843489110469818, + "step": 4640, + "zero_std_ratio": 0.325 + }, + { + "clip_ratio/high_max": 0.06538669131696224, + "clip_ratio/high_mean": 0.009138646663632243, + "clip_ratio/low_mean": 0.0017342485502013006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010872895480133593, + "completion_length": 88.39750061035156, + "epoch": 0.8932859475554702, + "grad_norm": 4.167427062988281, + "kl": 1.728559673577547, + "learning_rate": 1.2511169845622699e-07, + "loss": 0.0019, + "reward": 1.6277015209197998, + "reward_std": 0.21625073552131652, + "rewards/code_format_reward": 0.975000011920929, + "rewards/code_reward": 0.5701007604598999, + "step": 4650, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.042488472175318745, + "clip_ratio/high_mean": 0.005791870540997479, + "clip_ratio/low_mean": 2.0525451691355557e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005812396005785559, + "completion_length": 92.57250213623047, + "epoch": 0.8952069926039765, + "grad_norm": 6.026858806610107, + "kl": 0.7588046140968799, + "learning_rate": 1.2422464419316432e-07, + "loss": 0.0034, + "reward": 1.7008742094039917, + "reward_std": 0.27438378930091856, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.6079370617866516, + "step": 4660, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.013316971366293728, + "clip_ratio/high_mean": 0.0019765587523579596, + "clip_ratio/low_mean": 8.563735173083842e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002062196109909564, + "completion_length": 93.35000305175781, + "epoch": 0.897128037652483, + "grad_norm": 4.863064765930176, + "kl": 7.6627843722701074, + "learning_rate": 1.233531068090184e-07, + "loss": 0.011, + "reward": 1.8806322813034058, + "reward_std": 0.28162118047475815, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.6931286633014679, + "step": 4670, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.004586372757330537, + "clip_ratio/high_mean": 0.0006299379543634132, + "clip_ratio/low_mean": 1.7313018906861545e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0006472509849118069, + "completion_length": 92.36500091552735, + "epoch": 0.8990490827009894, + "grad_norm": 2.1931703090667725, + "kl": 0.2519014351069927, + "learning_rate": 1.2249711806593762e-07, + "loss": 0.0034, + "reward": 1.8040930509567261, + "reward_std": 0.24223610311746596, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.6561090111732483, + "step": 4680, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.006883417209610343, + "clip_ratio/high_mean": 0.0009808192204218357, + "clip_ratio/low_mean": 0.00029269491278682835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001273514133208664, + "completion_length": 91.60500030517578, + "epoch": 0.9009701277494957, + "grad_norm": 21.0294132232666, + "kl": 0.25964570268988607, + "learning_rate": 1.2165670915941866e-07, + "loss": -0.0043, + "reward": 1.9244711637496947, + "reward_std": 0.1629927098751068, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.7156730651855469, + "step": 4690, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.010230390657670795, + "clip_ratio/high_mean": 0.0014585633180104196, + "clip_ratio/low_mean": 3.0266345129348336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001488829671870917, + "completion_length": 87.84500274658203, + "epoch": 0.9028911727980021, + "grad_norm": 1.7218002080917358, + "kl": 16.447690600901844, + "learning_rate": 1.2083191071716937e-07, + "loss": 0.0339, + "reward": 1.940086579322815, + "reward_std": 0.16455088555812836, + "rewards/code_format_reward": 0.993749988079071, + "rewards/code_reward": 0.7216057777404785, + "step": 4700, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.025706328079104425, + "clip_ratio/high_mean": 0.003573437442537397, + "clip_ratio/low_mean": 3.392130311112851e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0036073587427381424, + "completion_length": 81.84500274658203, + "epoch": 0.9048122178465084, + "grad_norm": 0.22543705999851227, + "kl": 0.31741214692592623, + "learning_rate": 1.2002275279799288e-07, + "loss": -0.0056, + "reward": 1.8292718410491944, + "reward_std": 0.12828939855098725, + "rewards/code_format_reward": 0.9987499952316284, + "rewards/code_reward": 0.6649484276771546, + "step": 4710, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.009241180948447437, + "clip_ratio/high_mean": 0.0013442957555525937, + "clip_ratio/low_mean": 4.643963038688526e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013907353932154365, + "completion_length": 95.63500213623047, + "epoch": 0.9067332628950149, + "grad_norm": 5.23514986038208, + "kl": 0.804936108738184, + "learning_rate": 1.192292648906918e-07, + "loss": 0.0031, + "reward": 1.925449275970459, + "reward_std": 0.2213977299630642, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.7152246475219727, + "step": 4720, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.021669640118489042, + "clip_ratio/high_mean": 0.003889294656983111, + "clip_ratio/low_mean": 0.00044275675172684715, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004332051414530724, + "completion_length": 92.25250244140625, + "epoch": 0.9086543079435213, + "grad_norm": 66.00515747070312, + "kl": 2.1086502872407435, + "learning_rate": 1.1845147591299378e-07, + "loss": 0.0162, + "reward": 1.5327723979949952, + "reward_std": 0.2872114762663841, + "rewards/code_format_reward": 0.9725000023841858, + "rewards/code_reward": 0.5232611894607544, + "step": 4730, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.006079713994404301, + "clip_ratio/high_mean": 0.0010439059922646265, + "clip_ratio/low_mean": 0.0013928397551353556, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002436745767045068, + "completion_length": 97.43000030517578, + "epoch": 0.9105753529920276, + "grad_norm": 2.8770546913146973, + "kl": 3.1866038836538793, + "learning_rate": 1.1768941421049768e-07, + "loss": 0.0069, + "reward": 1.7776832818984984, + "reward_std": 0.29561240673065187, + "rewards/code_format_reward": 0.9949999928474427, + "rewards/code_reward": 0.6400915861129761, + "step": 4740, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.005499497149139642, + "clip_ratio/high_mean": 0.0006874371436424553, + "clip_ratio/low_mean": 0.000482194940559566, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011696320783812554, + "completion_length": 88.9500015258789, + "epoch": 0.9124963980405341, + "grad_norm": 8.540057182312012, + "kl": 0.9072364956140518, + "learning_rate": 1.1694310755564014e-07, + "loss": -0.0021, + "reward": 1.6791202545166015, + "reward_std": 0.326928648352623, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.595185148715973, + "step": 4750, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.008684736292343587, + "clip_ratio/high_mean": 0.001148225087672472, + "clip_ratio/low_mean": 0.0005504319502506405, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016986570524750277, + "completion_length": 95.68250122070313, + "epoch": 0.9144174430890404, + "grad_norm": 4.539205551147461, + "kl": 0.860013198107481, + "learning_rate": 1.1621258314668402e-07, + "loss": 0.0, + "reward": 1.7214089155197143, + "reward_std": 0.1847836285829544, + "rewards/code_format_reward": 0.9674999952316284, + "rewards/code_reward": 0.6188294351100921, + "step": 4760, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.012012088089250028, + "clip_ratio/high_mean": 0.0020424059097422288, + "clip_ratio/low_mean": 7.006222731433808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021124681399669496, + "completion_length": 93.21750183105469, + "epoch": 0.9163384881375468, + "grad_norm": 6.60590124130249, + "kl": 0.45315413996577264, + "learning_rate": 1.1549786760672676e-07, + "loss": -0.0013, + "reward": 1.7664082288742065, + "reward_std": 0.24015129953622819, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.6369540929794312, + "step": 4770, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.028257530624978246, + "clip_ratio/high_mean": 0.00573968501703348, + "clip_ratio/low_mean": 0.00028595919138751924, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006025644272449426, + "completion_length": 93.59000244140626, + "epoch": 0.9182595331860532, + "grad_norm": 3.693448781967163, + "kl": 0.5863466400653123, + "learning_rate": 1.1479898698273037e-07, + "loss": 0.0001, + "reward": 1.7522862911224366, + "reward_std": 0.24038469642400742, + "rewards/code_format_reward": 0.9762500047683715, + "rewards/code_reward": 0.6320806205272674, + "step": 4780, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.006044295988976956, + "clip_ratio/high_mean": 0.0008545084856450558, + "clip_ratio/low_mean": 0.0004956311546266079, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013501396053470672, + "completion_length": 100.22000122070312, + "epoch": 0.9201805782345596, + "grad_norm": 17.894886016845703, + "kl": 0.33935268595814705, + "learning_rate": 1.1411596674457193e-07, + "loss": -0.0019, + "reward": 1.697510004043579, + "reward_std": 0.16087576895952224, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6018799901008606, + "step": 4790, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.005959878279827535, + "clip_ratio/high_mean": 0.0009170519857434556, + "clip_ratio/low_mean": 7.972503372002392e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009967770281946286, + "completion_length": 98.54750213623046, + "epoch": 0.922101623283066, + "grad_norm": 3.242460250854492, + "kl": 0.46977903619408606, + "learning_rate": 1.1344883178411565e-07, + "loss": -0.0036, + "reward": 1.7927821159362793, + "reward_std": 0.24044746458530425, + "rewards/code_format_reward": 0.9699999809265136, + "rewards/code_reward": 0.6538910627365112, + "step": 4800, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.00756604690104723, + "clip_ratio/high_mean": 0.0010185762541368604, + "clip_ratio/low_mean": 0.00016034738000598737, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001178923639236018, + "completion_length": 99.49000091552735, + "epoch": 0.9240226683315724, + "grad_norm": 7.225472927093506, + "kl": 0.2285786397755146, + "learning_rate": 1.1279760641430568e-07, + "loss": 0.0001, + "reward": 1.7233760595321654, + "reward_std": 0.22306990921497344, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6148130118846893, + "step": 4810, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.011060118256136776, + "clip_ratio/high_mean": 0.0017047788191121072, + "clip_ratio/low_mean": 0.00028281604463700207, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019875948812114073, + "completion_length": 92.36500091552735, + "epoch": 0.9259437133800787, + "grad_norm": 4.390386581420898, + "kl": 0.8032988727092742, + "learning_rate": 1.1216231436827974e-07, + "loss": 0.0005, + "reward": 1.7829072952270508, + "reward_std": 0.21434771865606309, + "rewards/code_format_reward": 0.9862500071525574, + "rewards/code_reward": 0.6448911607265473, + "step": 4820, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.013947398256277665, + "clip_ratio/high_mean": 0.0018392194229818414, + "clip_ratio/low_mean": 0.0004060613617184572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002245280790521065, + "completion_length": 103.5125015258789, + "epoch": 0.9278647584285852, + "grad_norm": 6.774899482727051, + "kl": 0.34710453301668165, + "learning_rate": 1.1154297879850462e-07, + "loss": 0.0003, + "reward": 1.7023445606231689, + "reward_std": 0.23593612909317016, + "rewards/code_format_reward": 0.96875, + "rewards/code_reward": 0.60898477435112, + "step": 4830, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.00909699429757893, + "clip_ratio/high_mean": 0.0014705892943311482, + "clip_ratio/low_mean": 0.0004355724740889855, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019061617698753253, + "completion_length": 91.47000274658203, + "epoch": 0.9297858034770915, + "grad_norm": 1.7924318313598633, + "kl": 0.5235365644097328, + "learning_rate": 1.1093962227593214e-07, + "loss": 0.0017, + "reward": 1.823938512802124, + "reward_std": 0.18318418860435487, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.6650941967964172, + "step": 4840, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.008868952537886799, + "clip_ratio/high_mean": 0.0013198618631577118, + "clip_ratio/low_mean": 6.896776030771435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013888296205550432, + "completion_length": 97.04750061035156, + "epoch": 0.9317068485255979, + "grad_norm": 5.492427825927734, + "kl": 0.27957614585757257, + "learning_rate": 1.1035226678917662e-07, + "loss": 0.0001, + "reward": 1.7743586778640748, + "reward_std": 0.19067177027463914, + "rewards/code_format_reward": 0.9699999928474426, + "rewards/code_reward": 0.6446793019771576, + "step": 4850, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.00021865542512387038, + "clip_ratio/high_mean": 2.7331928140483797e-05, + "clip_ratio/low_mean": 0.00022580694640055298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002531388745410368, + "completion_length": 91.65750274658203, + "epoch": 0.9336278935741044, + "grad_norm": 8.045164108276367, + "kl": 0.20759812816977502, + "learning_rate": 1.0978093374371373e-07, + "loss": -0.0004, + "reward": 1.7663999795913696, + "reward_std": 0.281513449549675, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.6353874802589417, + "step": 4860, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.02832627217285335, + "clip_ratio/high_mean": 0.0035600741393864155, + "clip_ratio/low_mean": 0.00011176664993399754, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0036718408693559466, + "completion_length": 84.46500244140626, + "epoch": 0.9355489386226107, + "grad_norm": 4.819484233856201, + "kl": 0.5664212189614772, + "learning_rate": 1.0922564396109993e-07, + "loss": -0.0008, + "reward": 1.7755849838256836, + "reward_std": 0.20761601328849794, + "rewards/code_format_reward": 0.9899999856948852, + "rewards/code_reward": 0.640292489528656, + "step": 4870, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.006872700434178114, + "clip_ratio/high_mean": 0.0009693403088022023, + "clip_ratio/low_mean": 3.415665923967026e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010034969680418726, + "completion_length": 92.47500152587891, + "epoch": 0.9374699836711171, + "grad_norm": 2.605060338973999, + "kl": 0.6489929877221584, + "learning_rate": 1.0868641767821432e-07, + "loss": -0.0041, + "reward": 1.9151075601577758, + "reward_std": 0.2566168040037155, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.7113037467002868, + "step": 4880, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.018258474441245197, + "clip_ratio/high_mean": 0.003355332469800487, + "clip_ratio/low_mean": 0.000607103164657019, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003962435649009421, + "completion_length": 90.51000366210937, + "epoch": 0.9393910287196234, + "grad_norm": 4.408846378326416, + "kl": 0.35625301077961924, + "learning_rate": 1.0816327454652044e-07, + "loss": -0.0018, + "reward": 1.7154739379882813, + "reward_std": 0.2987362504005432, + "rewards/code_format_reward": 0.9612499952316285, + "rewards/code_reward": 0.6174244284629822, + "step": 4890, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.010325380798894912, + "clip_ratio/high_mean": 0.0015298718310077675, + "clip_ratio/low_mean": 0.000294900168228196, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018247719475766645, + "completion_length": 100.19250183105468, + "epoch": 0.9413120737681299, + "grad_norm": 9.08279037475586, + "kl": 0.23486268445849418, + "learning_rate": 1.0765623363135061e-07, + "loss": -0.0011, + "reward": 1.5800267338752747, + "reward_std": 0.26311944872140886, + "rewards/code_format_reward": 0.9862499952316284, + "rewards/code_reward": 0.5434508502483368, + "step": 4900, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.004708675656002015, + "clip_ratio/high_mean": 0.0008911975004593842, + "clip_ratio/low_mean": 0.0001348661200609058, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001026063623430673, + "completion_length": 85.6300033569336, + "epoch": 0.9432331188166363, + "grad_norm": 2.5798628330230713, + "kl": 0.5353534445166588, + "learning_rate": 1.071653134112109e-07, + "loss": -0.0018, + "reward": 1.7293733358383179, + "reward_std": 0.23426424115896224, + "rewards/code_format_reward": 0.9862499833106995, + "rewards/code_reward": 0.6181241631507873, + "step": 4910, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.031931064534001054, + "clip_ratio/high_mean": 0.004416047394624911, + "clip_ratio/low_mean": 0.00037934551510261373, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0047953929373761636, + "completion_length": 93.08500061035156, + "epoch": 0.9451541638651426, + "grad_norm": 3.0407347679138184, + "kl": 0.3617399115115404, + "learning_rate": 1.0669053177710766e-07, + "loss": -0.0023, + "reward": 1.602178120613098, + "reward_std": 0.23843889832496643, + "rewards/code_format_reward": 0.987499988079071, + "rewards/code_reward": 0.5542140543460846, + "step": 4920, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.008546069997828453, + "clip_ratio/high_mean": 0.0011838132908451372, + "clip_ratio/low_mean": 6.596306338906288e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012497763542341999, + "completion_length": 102.79500122070313, + "epoch": 0.947075208913649, + "grad_norm": 5.987438678741455, + "kl": 0.28761252388358116, + "learning_rate": 1.0623190603189566e-07, + "loss": 0.0011, + "reward": 1.5471005201339723, + "reward_std": 0.28855718672275543, + "rewards/code_format_reward": 0.9674999952316284, + "rewards/code_reward": 0.5316752552986145, + "step": 4930, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.03786106104962528, + "clip_ratio/high_mean": 0.005264365172479302, + "clip_ratio/low_mean": 0.001157468621386215, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00642183352902066, + "completion_length": 95.2625015258789, + "epoch": 0.9489962539621554, + "grad_norm": 4.088647842407227, + "kl": 9114.393886435031, + "learning_rate": 1.0578945288964734e-07, + "loss": 18.226, + "reward": 1.5625978589057923, + "reward_std": 0.22688832581043245, + "rewards/code_format_reward": 0.9762499928474426, + "rewards/code_reward": 0.5372364044189453, + "step": 4940, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.009068883489817381, + "clip_ratio/high_mean": 0.0015044378931634128, + "clip_ratio/low_mean": 8.003948896657675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015844773850403726, + "completion_length": 88.91999969482421, + "epoch": 0.9509172990106618, + "grad_norm": 4.558302879333496, + "kl": 0.322134206071496, + "learning_rate": 1.0536318847504383e-07, + "loss": 0.0008, + "reward": 1.683999252319336, + "reward_std": 0.15837213546037673, + "rewards/code_format_reward": 0.9887500047683716, + "rewards/code_reward": 0.5948120951652527, + "step": 4950, + "zero_std_ratio": 0.65 + }, + { + "clip_ratio/high_max": 0.004135725944070146, + "clip_ratio/high_mean": 0.0006349837080051657, + "clip_ratio/low_mean": 0.0002028582151979208, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008378419290238526, + "completion_length": 88.58000030517579, + "epoch": 0.9528383440591682, + "grad_norm": 1.3143569231033325, + "kl": 0.32492467686533927, + "learning_rate": 1.0495312832278721e-07, + "loss": 0.001, + "reward": 1.757376217842102, + "reward_std": 0.18446292728185654, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.6318130671977997, + "step": 4960, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.004577422246802599, + "clip_ratio/high_mean": 0.00067823924619006, + "clip_ratio/low_mean": 0.00020590101485140622, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008841402595862746, + "completion_length": 91.77249908447266, + "epoch": 0.9547593891076745, + "grad_norm": 2.7616970539093018, + "kl": 0.6282597549259663, + "learning_rate": 1.0455928737703441e-07, + "loss": 0.0001, + "reward": 1.665701198577881, + "reward_std": 0.1566584974527359, + "rewards/code_format_reward": 0.99375, + "rewards/code_reward": 0.5844130754470825, + "step": 4970, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.012442531622946262, + "clip_ratio/high_mean": 0.0018589732819236815, + "clip_ratio/low_mean": 6.720430101267994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019261775829363613, + "completion_length": 90.92250213623046, + "epoch": 0.956680434156181, + "grad_norm": 2.84462308883667, + "kl": 0.3018207371234894, + "learning_rate": 1.0418167999085259e-07, + "loss": 0.0041, + "reward": 1.7472755432128906, + "reward_std": 0.24319706559181214, + "rewards/code_format_reward": 0.9774999856948853, + "rewards/code_reward": 0.6292627692222595, + "step": 4980, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.01709002295974642, + "clip_ratio/high_mean": 0.002679444645764306, + "clip_ratio/low_mean": 0.0003698979213368148, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003049342567101121, + "completion_length": 92.97750091552734, + "epoch": 0.9586014792046873, + "grad_norm": 11.978320121765137, + "kl": 1.2294385731220245, + "learning_rate": 1.0382031992569592e-07, + "loss": 0.0036, + "reward": 1.739167046546936, + "reward_std": 0.29275294244289396, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.622708535194397, + "step": 4990, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.007942511793226003, + "clip_ratio/high_mean": 0.001185902243014425, + "clip_ratio/low_mean": 5.571418441832066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012416164390742779, + "completion_length": 93.31250305175782, + "epoch": 0.9605225242531937, + "grad_norm": 3.364788055419922, + "kl": 0.35085868686437605, + "learning_rate": 1.0347522035090446e-07, + "loss": -0.0003, + "reward": 1.9564055442810058, + "reward_std": 0.2229623466730118, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.7303902268409729, + "step": 5000, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.015932422177866102, + "clip_ratio/high_mean": 0.0028564550855662675, + "clip_ratio/low_mean": 0.00020086783915758134, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003057322936365381, + "completion_length": 96.12750091552735, + "epoch": 0.9624435693017002, + "grad_norm": 5.283419609069824, + "kl": 0.3115640334784985, + "learning_rate": 1.0314639384322356e-07, + "loss": -0.0037, + "reward": 1.6293291807174684, + "reward_std": 0.2581008836627007, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.5693520545959473, + "step": 5010, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.003986756759695708, + "clip_ratio/high_mean": 0.0006319725507637486, + "clip_ratio/low_mean": 0.000603693921584636, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012356664577964694, + "completion_length": 86.04750366210938, + "epoch": 0.9643646143502065, + "grad_norm": 8.70874309539795, + "kl": 0.47548493221402166, + "learning_rate": 1.0283385238634632e-07, + "loss": 0.0041, + "reward": 1.622909712791443, + "reward_std": 0.2179076835513115, + "rewards/code_format_reward": 0.9712499976158142, + "rewards/code_reward": 0.5686423420906067, + "step": 5020, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.005893218703567982, + "clip_ratio/high_mean": 0.000819433806464076, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000819433806464076, + "completion_length": 88.9375015258789, + "epoch": 0.9662856593987129, + "grad_norm": 6.6337199211120605, + "kl": 0.5933880299329758, + "learning_rate": 1.0253760737047606e-07, + "loss": -0.0043, + "reward": 1.7307970523834229, + "reward_std": 0.1557233951985836, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6172735095024109, + "step": 5030, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.010898534208536148, + "clip_ratio/high_mean": 0.0015226851450279356, + "clip_ratio/low_mean": 0.009100449224933981, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010623134509660303, + "completion_length": 87.65500183105469, + "epoch": 0.9682067044472192, + "grad_norm": 12.837902069091797, + "kl": 0.1521947119385004, + "learning_rate": 1.0225766959191187e-07, + "loss": 0.0007, + "reward": 1.766017746925354, + "reward_std": 0.1697022169828415, + "rewards/code_format_reward": 0.9924999952316285, + "rewards/code_reward": 0.6348838567733764, + "step": 5040, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.039930257271043955, + "clip_ratio/high_mean": 0.005228024450480007, + "clip_ratio/low_mean": 0.0012023555900668725, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006430380133679137, + "completion_length": 99.63500213623047, + "epoch": 0.9701277494957257, + "grad_norm": 3.0135834217071533, + "kl": 0.5389343507587909, + "learning_rate": 1.0199404925265473e-07, + "loss": -0.0011, + "reward": 1.5655887126922607, + "reward_std": 0.1425598829984665, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.5365443468093872, + "step": 5050, + "zero_std_ratio": 0.575 + }, + { + "clip_ratio/high_max": 0.013292990019544959, + "clip_ratio/high_mean": 0.0019570814620237797, + "clip_ratio/low_mean": 0.0004139953598496504, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023710768204182387, + "completion_length": 92.15750122070312, + "epoch": 0.9720487945442321, + "grad_norm": 8.622629165649414, + "kl": 0.3708019584417343, + "learning_rate": 1.0174675596003588e-07, + "loss": -0.0037, + "reward": 1.6285043001174926, + "reward_std": 0.21171441301703453, + "rewards/code_format_reward": 0.9675000071525574, + "rewards/code_reward": 0.5723771452903748, + "step": 5060, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.011086594103835523, + "clip_ratio/high_mean": 0.001482552892412059, + "clip_ratio/low_mean": 7.31003499822691e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001555653239483945, + "completion_length": 92.72000122070312, + "epoch": 0.9739698395927384, + "grad_norm": 10.519503593444824, + "kl": 0.42225370053201916, + "learning_rate": 1.0151579872636673e-07, + "loss": 0.0073, + "reward": 1.9428821086883545, + "reward_std": 0.2824172407388687, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.7261285543441772, + "step": 5070, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.02070889645256102, + "clip_ratio/high_mean": 0.0035216436022892593, + "clip_ratio/low_mean": 0.0003085655207542004, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0038302090688375756, + "completion_length": 105.0050048828125, + "epoch": 0.9758908846412448, + "grad_norm": 4.139841079711914, + "kl": 0.3159520372748375, + "learning_rate": 1.0130118596861028e-07, + "loss": -0.0044, + "reward": 1.6708447217941285, + "reward_std": 0.30501508712768555, + "rewards/code_format_reward": 0.9837499976158142, + "rewards/code_reward": 0.5894848227500915, + "step": 5080, + "zero_std_ratio": 0.4 + }, + { + "clip_ratio/high_max": 0.008058706868905575, + "clip_ratio/high_mean": 0.0012073565638274885, + "clip_ratio/low_mean": 0.00031636476196581496, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015237213257933036, + "completion_length": 84.28750152587891, + "epoch": 0.9778119296897512, + "grad_norm": 4.015879154205322, + "kl": 0.2918614260852337, + "learning_rate": 1.0110292550807451e-07, + "loss": -0.0012, + "reward": 1.7721335172653199, + "reward_std": 0.286711610853672, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.6385667800903321, + "step": 5090, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.019471552316099407, + "clip_ratio/high_mean": 0.0026317643467336895, + "clip_ratio/low_mean": 0.0003316317946882918, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029633961850777267, + "completion_length": 90.81000213623047, + "epoch": 0.9797329747382576, + "grad_norm": 1.132954716682434, + "kl": 0.2704964060336351, + "learning_rate": 1.0092102457012717e-07, + "loss": -0.0022, + "reward": 1.6570582151412965, + "reward_std": 0.21210518777370452, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5810291051864624, + "step": 5100, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio/high_max": 0.011018617497757077, + "clip_ratio/high_mean": 0.0013860319217201323, + "clip_ratio/low_mean": 3.4722223062999547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014207541418727488, + "completion_length": 93.61250305175781, + "epoch": 0.981654019786764, + "grad_norm": 16.08737564086914, + "kl": 0.26382347345352175, + "learning_rate": 1.0075548978393277e-07, + "loss": -0.0002, + "reward": 1.8070130348205566, + "reward_std": 0.1673865035176277, + "rewards/code_format_reward": 0.9912500023841858, + "rewards/code_reward": 0.6556940078735352, + "step": 5110, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio/high_max": 0.010004310857038946, + "clip_ratio/high_mean": 0.0012777127660228871, + "clip_ratio/low_mean": 0.0003342236072057858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016119363936013542, + "completion_length": 89.3, + "epoch": 0.9835750648352704, + "grad_norm": 0.4556010961532593, + "kl": 0.4934497371315956, + "learning_rate": 1.0060632718221066e-07, + "loss": 0.0026, + "reward": 1.3408710062503815, + "reward_std": 0.16168890111148357, + "rewards/code_format_reward": 0.9875, + "rewards/code_reward": 0.42356050610542295, + "step": 5120, + "zero_std_ratio": 0.7 + }, + { + "clip_ratio/high_max": 0.05311971204355359, + "clip_ratio/high_mean": 0.0075716287479735914, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0075716287479735914, + "completion_length": 102.92249908447266, + "epoch": 0.9854961098837768, + "grad_norm": 3.9305222034454346, + "kl": 0.27781638093292715, + "learning_rate": 1.0047354220101518e-07, + "loss": -0.0011, + "reward": 1.630450439453125, + "reward_std": 0.18297318816185, + "rewards/code_format_reward": 0.9887499928474426, + "rewards/code_reward": 0.5680376827716828, + "step": 5130, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.004692732833791524, + "clip_ratio/high_mean": 0.0006299943852354772, + "clip_ratio/low_mean": 0.00031122941145440565, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009412237734068186, + "completion_length": 88.23250122070313, + "epoch": 0.9874171549322832, + "grad_norm": 4.31157112121582, + "kl": 0.2751577727496624, + "learning_rate": 1.0035713967953797e-07, + "loss": -0.0038, + "reward": 1.635274839401245, + "reward_std": 0.29494107216596605, + "rewards/code_format_reward": 0.9849999904632568, + "rewards/code_reward": 0.5713874340057373, + "step": 5140, + "zero_std_ratio": 0.45 + }, + { + "clip_ratio/high_max": 0.012987824180163443, + "clip_ratio/high_mean": 0.0019960356265073644, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019960356265073644, + "completion_length": 86.49750061035157, + "epoch": 0.9893381999807895, + "grad_norm": 7.45393705368042, + "kl": 0.3619408316910267, + "learning_rate": 1.0025712385993115e-07, + "loss": 0.0012, + "reward": 1.687432312965393, + "reward_std": 0.2386924833059311, + "rewards/code_format_reward": 0.9912499904632568, + "rewards/code_reward": 0.5959036707878113, + "step": 5150, + "zero_std_ratio": 0.475 + }, + { + "clip_ratio/high_max": 0.014351918507600203, + "clip_ratio/high_mean": 0.002000216278975131, + "clip_ratio/low_mean": 7.898250914877281e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020791988128621595, + "completion_length": 89.34250030517578, + "epoch": 0.991259245029296, + "grad_norm": 35.51432418823242, + "kl": 0.2617587223649025, + "learning_rate": 1.0017349838715278e-07, + "loss": -0.004, + "reward": 1.2408424496650696, + "reward_std": 0.21315770447254181, + "rewards/code_format_reward": 0.9774999976158142, + "rewards/code_reward": 0.3760462045669556, + "step": 5160, + "zero_std_ratio": 0.525 + }, + { + "clip_ratio/high_max": 0.003973034140653908, + "clip_ratio/high_mean": 0.0004966292675817385, + "clip_ratio/low_mean": 6.530825339723379e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0005031600929214619, + "completion_length": 99.66499938964844, + "epoch": 0.9931802900778023, + "grad_norm": 2.5418131351470947, + "kl": 0.1747375037521124, + "learning_rate": 1.0010626630883432e-07, + "loss": 0.003, + "reward": 1.421428418159485, + "reward_std": 0.09218620862811804, + "rewards/code_format_reward": 0.9612499952316285, + "rewards/code_reward": 0.4704016923904419, + "step": 5170, + "zero_std_ratio": 0.675 + }, + { + "clip_ratio/high_max": 0.02674068254418671, + "clip_ratio/high_mean": 0.003380646219011396, + "clip_ratio/low_mean": 9.682812378741801e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034774743369780483, + "completion_length": 92.72500152587891, + "epoch": 0.9951013351263087, + "grad_norm": 6.10137414932251, + "kl": 0.41192906014621256, + "learning_rate": 1.0005543007516928e-07, + "loss": -0.0051, + "reward": 1.5263760328292846, + "reward_std": 0.28926219046115875, + "rewards/code_format_reward": 0.9899999976158143, + "rewards/code_reward": 0.5156879663467407, + "step": 5180, + "zero_std_ratio": 0.425 + }, + { + "clip_ratio/high_max": 0.1273454572306946, + "clip_ratio/high_mean": 0.016399703072966076, + "clip_ratio/low_mean": 0.0004187120386632159, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016818415274610744, + "completion_length": 81.02250213623047, + "epoch": 0.9970223801748151, + "grad_norm": 7.77527379989624, + "kl": 0.7098278045654297, + "learning_rate": 1.0002099153882402e-07, + "loss": -0.0041, + "reward": 1.6053562879562377, + "reward_std": 0.16601394787430762, + "rewards/code_format_reward": 0.9824999928474426, + "rewards/code_reward": 0.557053166627884, + "step": 5190, + "zero_std_ratio": 0.6 + }, + { + "clip_ratio/high_max": 0.0028753917664289474, + "clip_ratio/high_mean": 0.00045008738234173504, + "clip_ratio/low_mean": 0.00016858125454746186, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000618668642709963, + "completion_length": 95.49250183105468, + "epoch": 0.9989434252233215, + "grad_norm": 6.507387638092041, + "kl": 0.9340068377554417, + "learning_rate": 1.0000295195487024e-07, + "loss": -0.0018, + "reward": 1.4542541027069091, + "reward_std": 0.20283248797059059, + "rewards/code_format_reward": 0.981249988079071, + "rewards/code_reward": 0.4818145722150803, + "step": 5200, + "zero_std_ratio": 0.55 + }, + { + "clip_ratio/high_max": 0.010171899455599487, + "clip_ratio/high_mean": 0.0014317149762064219, + "clip_ratio/low_mean": 0.00016592920292168856, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015976441791281104, + "completion_length": 90.05000305175781, + "epoch": 0.999711843242724, + "kl": 0.5218422394245863, + "reward": 1.0329873859882355, + "reward_std": 0.19616412371397018, + "rewards/code_format_reward": 0.934374988079071, + "rewards/code_reward": 0.28289994597435, + "step": 5204, + "total_flos": 0.0, + "train_loss": 1756184.5472393532, + "train_runtime": 149594.4727, + "train_samples_per_second": 0.139, + "train_steps_per_second": 0.035, + "zero_std_ratio": 0.5625 + } + ], + "logging_steps": 10, + "max_steps": 5205, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}