| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.17561912602044316, | |
| "eval_steps": 500, | |
| "global_step": 8000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00021952390752555395, | |
| "grad_norm": 129536.0, | |
| "learning_rate": 1.99775e-05, | |
| "loss": 11.4186, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0004390478150511079, | |
| "grad_norm": 154.0, | |
| "learning_rate": 1.9952500000000003e-05, | |
| "loss": 6.1068, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0006585717225766619, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.9927500000000002e-05, | |
| "loss": 4.8048, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0008780956301022158, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.99025e-05, | |
| "loss": 0.2014, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0010976195376277698, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.98775e-05, | |
| "loss": 0.2177, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0013171434451533237, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 1.98525e-05, | |
| "loss": 0.2286, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0015366673526788777, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.9827500000000003e-05, | |
| "loss": 0.1803, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0017561912602044316, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.9802500000000002e-05, | |
| "loss": 0.1814, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0019757151677299856, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.97775e-05, | |
| "loss": 0.1881, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0021952390752555395, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.97525e-05, | |
| "loss": 0.1877, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0024147629827810935, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 1.97275e-05, | |
| "loss": 0.1892, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0026342868903066474, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 1.9702500000000003e-05, | |
| "loss": 0.1921, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0028538107978322014, | |
| "grad_norm": 1.25, | |
| "learning_rate": 1.9677500000000003e-05, | |
| "loss": 0.2261, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0030733347053577553, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.9652500000000002e-05, | |
| "loss": 0.2294, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0032928586128833093, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 1.96275e-05, | |
| "loss": 0.1871, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0035123825204088632, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.96025e-05, | |
| "loss": 0.1888, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.003731906427934417, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.9577500000000004e-05, | |
| "loss": 0.1925, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.003951430335459971, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.95525e-05, | |
| "loss": 0.1974, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.004170954242985525, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 1.9527500000000002e-05, | |
| "loss": 0.2109, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.004390478150511079, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.9502500000000002e-05, | |
| "loss": 0.2015, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.004610002058036633, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.94775e-05, | |
| "loss": 0.1631, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.004829525965562187, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 1.94525e-05, | |
| "loss": 0.195, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.005049049873087741, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.94275e-05, | |
| "loss": 0.1882, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.005268573780613295, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.9402500000000003e-05, | |
| "loss": 0.1956, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.005488097688138849, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.9377500000000002e-05, | |
| "loss": 0.2236, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.005707621595664403, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 1.93525e-05, | |
| "loss": 0.1906, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.005927145503189957, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.93275e-05, | |
| "loss": 0.2174, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.006146669410715511, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.93025e-05, | |
| "loss": 0.1898, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.006366193318241065, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.9277500000000003e-05, | |
| "loss": 0.2021, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.0065857172257666186, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.92525e-05, | |
| "loss": 0.192, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0068052411332921725, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 1.9227500000000002e-05, | |
| "loss": 0.2299, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.0070247650408177265, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 1.92025e-05, | |
| "loss": 0.2137, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.00724428894834328, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 1.91775e-05, | |
| "loss": 0.1918, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.007463812855868834, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 1.91525e-05, | |
| "loss": 0.2303, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.007683336763394388, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1.91275e-05, | |
| "loss": 0.1957, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.007902860670919942, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.9102500000000002e-05, | |
| "loss": 0.2029, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.008122384578445496, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.90775e-05, | |
| "loss": 0.219, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.00834190848597105, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.90525e-05, | |
| "loss": 0.2189, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.008561432393496604, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.90275e-05, | |
| "loss": 0.2014, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.008780956301022158, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 1.9002500000000003e-05, | |
| "loss": 0.1757, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.009000480208547712, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.8977500000000003e-05, | |
| "loss": 0.2146, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.009220004116073266, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.8952500000000002e-05, | |
| "loss": 0.1767, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.00943952802359882, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.89275e-05, | |
| "loss": 0.1913, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.009659051931124374, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.89025e-05, | |
| "loss": 0.1938, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.009878575838649928, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.8877500000000003e-05, | |
| "loss": 0.1898, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.010098099746175482, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.8852500000000003e-05, | |
| "loss": 0.214, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.010317623653701036, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.8827500000000002e-05, | |
| "loss": 0.191, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.01053714756122659, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.88025e-05, | |
| "loss": 0.207, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.010756671468752144, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 1.87775e-05, | |
| "loss": 0.2191, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.010976195376277698, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.8752500000000004e-05, | |
| "loss": 0.2271, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.011195719283803252, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 1.87275e-05, | |
| "loss": 0.1823, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.011415243191328805, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.8702500000000003e-05, | |
| "loss": 0.2054, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.01163476709885436, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.8677500000000002e-05, | |
| "loss": 0.2131, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.011854291006379913, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.86525e-05, | |
| "loss": 0.1569, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.012073814913905467, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.86275e-05, | |
| "loss": 0.1877, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.012293338821431021, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 1.86025e-05, | |
| "loss": 0.2228, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.012512862728956575, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 1.8577500000000003e-05, | |
| "loss": 0.2096, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.01273238663648213, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 1.8552500000000002e-05, | |
| "loss": 0.1861, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.012951910544007683, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.8527500000000002e-05, | |
| "loss": 0.1832, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.013171434451533237, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 1.85025e-05, | |
| "loss": 0.2182, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.013390958359058791, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.84775e-05, | |
| "loss": 0.2285, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.013610482266584345, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 1.8452500000000003e-05, | |
| "loss": 0.2024, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.013830006174109899, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 1.8427500000000003e-05, | |
| "loss": 0.2115, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.014049530081635453, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.8402500000000002e-05, | |
| "loss": 0.227, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.014269053989161007, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.83775e-05, | |
| "loss": 0.185, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.01448857789668656, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.83525e-05, | |
| "loss": 0.1754, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.014708101804212115, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.8327500000000004e-05, | |
| "loss": 0.2198, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.014927625711737669, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 1.83025e-05, | |
| "loss": 0.1966, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.015147149619263223, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.8277500000000002e-05, | |
| "loss": 0.187, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.015366673526788777, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.8252500000000002e-05, | |
| "loss": 0.2088, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.01558619743431433, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.82275e-05, | |
| "loss": 0.2132, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.015805721341839885, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.82025e-05, | |
| "loss": 0.2149, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.01602524524936544, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.81775e-05, | |
| "loss": 0.1931, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.016244769156890992, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 1.8152500000000003e-05, | |
| "loss": 0.1721, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.016464293064416546, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.8127500000000002e-05, | |
| "loss": 0.1911, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0166838169719421, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.81025e-05, | |
| "loss": 0.2294, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.016903340879467654, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.80775e-05, | |
| "loss": 0.1867, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.017122864786993208, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.80525e-05, | |
| "loss": 0.1997, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.017342388694518762, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.8027500000000003e-05, | |
| "loss": 0.2095, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.017561912602044316, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.80025e-05, | |
| "loss": 0.1862, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.01778143650956987, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.7977500000000002e-05, | |
| "loss": 0.1971, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.018000960417095424, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.79525e-05, | |
| "loss": 0.193, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.018220484324620978, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.79275e-05, | |
| "loss": 0.2195, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.018440008232146532, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1.79025e-05, | |
| "loss": 0.1954, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.018659532139672086, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.78775e-05, | |
| "loss": 0.1546, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.01887905604719764, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.7852500000000002e-05, | |
| "loss": 0.188, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.019098579954723194, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.78275e-05, | |
| "loss": 0.2128, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.019318103862248748, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.78025e-05, | |
| "loss": 0.1846, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.0195376277697743, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.77775e-05, | |
| "loss": 0.1938, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.019757151677299856, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 1.77525e-05, | |
| "loss": 0.1781, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.01997667558482541, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 1.7727500000000003e-05, | |
| "loss": 0.1848, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.020196199492350964, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.7702500000000002e-05, | |
| "loss": 0.1739, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.020415723399876518, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.76775e-05, | |
| "loss": 0.1878, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.02063524730740207, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.76525e-05, | |
| "loss": 0.2028, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.020854771214927625, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.76275e-05, | |
| "loss": 0.1897, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.02107429512245318, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.7602500000000003e-05, | |
| "loss": 0.1748, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.021293819029978733, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.75775e-05, | |
| "loss": 0.1997, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.021513342937504287, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.7552500000000002e-05, | |
| "loss": 0.1989, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.02173286684502984, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.75275e-05, | |
| "loss": 0.1836, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.021952390752555395, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.7502500000000004e-05, | |
| "loss": 0.2198, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02217191466008095, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.74775e-05, | |
| "loss": 0.1847, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.022391438567606503, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.7452500000000003e-05, | |
| "loss": 0.1954, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.022610962475132057, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.7427500000000002e-05, | |
| "loss": 0.2007, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.02283048638265761, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.74025e-05, | |
| "loss": 0.2068, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.023050010290183165, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.73775e-05, | |
| "loss": 0.1898, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.02326953419770872, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.73525e-05, | |
| "loss": 0.1661, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.023489058105234273, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 1.7327500000000003e-05, | |
| "loss": 0.1745, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.023708582012759827, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.7302500000000002e-05, | |
| "loss": 0.1691, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.02392810592028538, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.7277500000000002e-05, | |
| "loss": 0.204, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.024147629827810935, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 1.72525e-05, | |
| "loss": 0.1775, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.02436715373533649, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.72275e-05, | |
| "loss": 0.1953, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.024586677642862043, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.7202500000000003e-05, | |
| "loss": 0.1916, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.024806201550387597, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.7177500000000003e-05, | |
| "loss": 0.1762, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.02502572545791315, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.7152500000000002e-05, | |
| "loss": 0.2267, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.025245249365438704, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.71275e-05, | |
| "loss": 0.1924, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.02546477327296426, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.71025e-05, | |
| "loss": 0.2081, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.025684297180489812, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.7077500000000004e-05, | |
| "loss": 0.1693, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.025903821088015366, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 1.70525e-05, | |
| "loss": 0.2278, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.02612334499554092, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.7027500000000003e-05, | |
| "loss": 0.1819, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.026342868903066474, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.7002500000000002e-05, | |
| "loss": 0.147, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.026562392810592028, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 1.69775e-05, | |
| "loss": 0.1857, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.026781916718117582, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 1.69525e-05, | |
| "loss": 0.1665, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.027001440625643136, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.69275e-05, | |
| "loss": 0.2032, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.02722096453316869, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.6902500000000003e-05, | |
| "loss": 0.182, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.027440488440694244, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.6877500000000002e-05, | |
| "loss": 0.1936, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.027660012348219798, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.6852500000000002e-05, | |
| "loss": 0.1971, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.027879536255745352, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.68275e-05, | |
| "loss": 0.1824, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.028099060163270906, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.68025e-05, | |
| "loss": 0.1731, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.02831858407079646, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.6777500000000003e-05, | |
| "loss": 0.1818, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.028538107978322014, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.67525e-05, | |
| "loss": 0.1959, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.028757631885847568, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.6727500000000002e-05, | |
| "loss": 0.1777, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.02897715579337312, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.67025e-05, | |
| "loss": 0.1719, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.029196679700898676, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.66775e-05, | |
| "loss": 0.1985, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.02941620360842423, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.66525e-05, | |
| "loss": 0.1841, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.029635727515949783, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.66275e-05, | |
| "loss": 0.2174, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.029855251423475337, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.6602500000000002e-05, | |
| "loss": 0.1925, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.03007477533100089, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.6577500000000002e-05, | |
| "loss": 0.1686, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.030294299238526445, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.65525e-05, | |
| "loss": 0.1877, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.030513823146052, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.65275e-05, | |
| "loss": 0.2048, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.030733347053577553, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.65025e-05, | |
| "loss": 0.2031, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.030952870961103107, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.6477500000000003e-05, | |
| "loss": 0.217, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.03117239486862866, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.6452500000000002e-05, | |
| "loss": 0.1783, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.031391918776154215, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.64275e-05, | |
| "loss": 0.1761, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.03161144268367977, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.64025e-05, | |
| "loss": 0.182, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.03183096659120532, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 1.63775e-05, | |
| "loss": 0.1732, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.03205049049873088, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.6352500000000003e-05, | |
| "loss": 0.1951, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.03227001440625643, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.63275e-05, | |
| "loss": 0.2427, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.032489538313781985, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.6302500000000002e-05, | |
| "loss": 0.1904, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.03270906222130754, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.62775e-05, | |
| "loss": 0.1938, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.03292858612883309, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.62525e-05, | |
| "loss": 0.1673, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.03314811003635865, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.62275e-05, | |
| "loss": 0.2362, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.0333676339438842, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.62025e-05, | |
| "loss": 0.1716, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.033587157851409755, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.6177500000000002e-05, | |
| "loss": 0.1872, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.03380668175893531, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.61525e-05, | |
| "loss": 0.2068, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.03402620566646086, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.61275e-05, | |
| "loss": 0.1902, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.034245729573986416, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.61025e-05, | |
| "loss": 0.1866, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.03446525348151197, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 1.60775e-05, | |
| "loss": 0.1694, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.034684777389037524, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.6052500000000003e-05, | |
| "loss": 0.1915, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.03490430129656308, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.60275e-05, | |
| "loss": 0.2041, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.03512382520408863, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.60025e-05, | |
| "loss": 0.2597, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.035343349111614186, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.59775e-05, | |
| "loss": 0.2025, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.03556287301913974, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 1.5952500000000004e-05, | |
| "loss": 0.178, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.035782396926665294, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.5927500000000003e-05, | |
| "loss": 0.2145, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.03600192083419085, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.5902500000000002e-05, | |
| "loss": 0.2196, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.0362214447417164, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.5877500000000002e-05, | |
| "loss": 0.236, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.036440968649241956, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.58525e-05, | |
| "loss": 0.1927, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.03666049255676751, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.5827500000000004e-05, | |
| "loss": 0.1991, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.036880016464293064, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.58025e-05, | |
| "loss": 0.2282, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.03709954037181862, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.5777500000000003e-05, | |
| "loss": 0.1799, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.03731906427934417, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.5752500000000002e-05, | |
| "loss": 0.1936, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.037538588186869726, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.57275e-05, | |
| "loss": 0.1886, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.03775811209439528, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.57025e-05, | |
| "loss": 0.1771, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.037977636001920834, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.56775e-05, | |
| "loss": 0.1853, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.03819715990944639, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.5652500000000003e-05, | |
| "loss": 0.1987, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.03841668381697194, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.5627500000000002e-05, | |
| "loss": 0.2049, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.038636207724497496, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 1.5602500000000002e-05, | |
| "loss": 0.1691, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.03885573163202305, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.55775e-05, | |
| "loss": 0.1981, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.0390752555395486, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 1.55525e-05, | |
| "loss": 0.1842, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.03929477944707416, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.5527500000000003e-05, | |
| "loss": 0.1696, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.03951430335459971, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.55025e-05, | |
| "loss": 0.2018, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.039733827262125265, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 1.5477500000000002e-05, | |
| "loss": 0.201, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.03995335116965082, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.54525e-05, | |
| "loss": 0.1855, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.04017287507717637, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.54275e-05, | |
| "loss": 0.1679, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.04039239898470193, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.54025e-05, | |
| "loss": 0.1916, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.04061192289222748, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.53775e-05, | |
| "loss": 0.1715, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.040831446799753035, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1.5352500000000003e-05, | |
| "loss": 0.1899, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.04105097070727859, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.5327500000000002e-05, | |
| "loss": 0.193, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.04127049461480414, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.53025e-05, | |
| "loss": 0.1822, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.0414900185223297, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 1.52775e-05, | |
| "loss": 0.1502, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.04170954242985525, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.5252500000000002e-05, | |
| "loss": 0.1909, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.041929066337380805, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.5227500000000001e-05, | |
| "loss": 0.1853, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.04214859024490636, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.5202500000000002e-05, | |
| "loss": 0.1905, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.04236811415243191, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.51775e-05, | |
| "loss": 0.1877, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.04258763805995747, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.5152500000000001e-05, | |
| "loss": 0.1947, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.04280716196748302, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.5127500000000002e-05, | |
| "loss": 0.1985, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.043026685875008575, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.5102500000000002e-05, | |
| "loss": 0.1846, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.04324620978253413, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 1.5077500000000001e-05, | |
| "loss": 0.2175, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.04346573369005968, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.50525e-05, | |
| "loss": 0.203, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.043685257597585236, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 1.5027500000000001e-05, | |
| "loss": 0.1688, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.04390478150511079, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.5002500000000002e-05, | |
| "loss": 0.1951, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.044124305412636344, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.49775e-05, | |
| "loss": 0.1931, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.0443438293201619, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.4952500000000001e-05, | |
| "loss": 0.2017, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.04456335322768745, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.49275e-05, | |
| "loss": 0.2095, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.044782877135213006, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 1.4902500000000002e-05, | |
| "loss": 0.1814, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.04500240104273856, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.48775e-05, | |
| "loss": 0.1912, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.045221924950264114, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.48525e-05, | |
| "loss": 0.1799, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.04544144885778967, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.4827500000000002e-05, | |
| "loss": 0.1754, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.04566097276531522, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.4802500000000003e-05, | |
| "loss": 0.2019, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.045880496672840776, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.47775e-05, | |
| "loss": 0.1767, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.04610002058036633, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.4752500000000001e-05, | |
| "loss": 0.1762, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.046319544487891884, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.4727500000000001e-05, | |
| "loss": 0.2086, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.04653906839541744, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.4702500000000002e-05, | |
| "loss": 0.2083, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.04675859230294299, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.4677500000000003e-05, | |
| "loss": 0.1774, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.046978116210468546, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.46525e-05, | |
| "loss": 0.1908, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.0471976401179941, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.4627500000000002e-05, | |
| "loss": 0.2223, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.047417164025519654, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.4602500000000001e-05, | |
| "loss": 0.2084, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.04763668793304521, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.4577500000000002e-05, | |
| "loss": 0.2029, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.04785621184057076, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.45525e-05, | |
| "loss": 0.1641, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.048075735748096315, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.4527500000000001e-05, | |
| "loss": 0.2053, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.04829525965562187, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.4502500000000002e-05, | |
| "loss": 0.1783, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.04851478356314742, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 1.4477500000000002e-05, | |
| "loss": 0.1733, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.04873430747067298, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.4452500000000001e-05, | |
| "loss": 0.1852, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.04895383137819853, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.44275e-05, | |
| "loss": 0.1626, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.049173355285724085, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.4402500000000001e-05, | |
| "loss": 0.1804, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.04939287919324964, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.4377500000000003e-05, | |
| "loss": 0.1895, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.04961240310077519, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.43525e-05, | |
| "loss": 0.1911, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.04983192700830075, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.4327500000000001e-05, | |
| "loss": 0.1903, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.0500514509158263, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.43025e-05, | |
| "loss": 0.2024, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.050270974823351855, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 1.4277500000000002e-05, | |
| "loss": 0.1669, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.05049049873087741, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.42525e-05, | |
| "loss": 0.1819, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.05071002263840296, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.42275e-05, | |
| "loss": 0.1808, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.05092954654592852, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 1.4202500000000002e-05, | |
| "loss": 0.1809, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.05114907045345407, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.4177500000000001e-05, | |
| "loss": 0.177, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.051368594360979625, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.41525e-05, | |
| "loss": 0.1882, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.05158811826850518, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.41275e-05, | |
| "loss": 0.1705, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.05180764217603073, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.4102500000000001e-05, | |
| "loss": 0.1671, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.05202716608355629, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.4077500000000002e-05, | |
| "loss": 0.1622, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.05224668999108184, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.4052500000000001e-05, | |
| "loss": 0.1847, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.052466213898607394, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.40275e-05, | |
| "loss": 0.1789, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.05268573780613295, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 1.4002500000000002e-05, | |
| "loss": 0.1815, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0529052617136585, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.3977500000000001e-05, | |
| "loss": 0.2352, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.053124785621184056, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 1.3952500000000002e-05, | |
| "loss": 0.1681, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.05334430952870961, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.39275e-05, | |
| "loss": 0.2157, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.053563833436235164, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.3902500000000001e-05, | |
| "loss": 0.1829, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.05378335734376072, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 1.3877500000000002e-05, | |
| "loss": 0.1778, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.05400288125128627, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 1.3852500000000002e-05, | |
| "loss": 0.2, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.054222405158811826, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.3827500000000001e-05, | |
| "loss": 0.1975, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.05444192906633738, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.38025e-05, | |
| "loss": 0.1732, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.054661452973862934, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.3777500000000001e-05, | |
| "loss": 0.2128, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.05488097688138849, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.3752500000000003e-05, | |
| "loss": 0.2146, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.05510050078891404, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.37275e-05, | |
| "loss": 0.1893, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.055320024696439596, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.3702500000000001e-05, | |
| "loss": 0.2053, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.05553954860396515, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.36775e-05, | |
| "loss": 0.1867, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.055759072511490704, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.3652500000000002e-05, | |
| "loss": 0.2008, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.05597859641901626, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 1.36275e-05, | |
| "loss": 0.1968, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.05619812032654181, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.36025e-05, | |
| "loss": 0.199, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.056417644234067366, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.3577500000000002e-05, | |
| "loss": 0.1708, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.05663716814159292, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.3552500000000001e-05, | |
| "loss": 0.1923, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.056856692049118474, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 1.35275e-05, | |
| "loss": 0.2035, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.05707621595664403, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.35025e-05, | |
| "loss": 0.2349, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.05729573986416958, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.3477500000000001e-05, | |
| "loss": 0.1965, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.057515263771695135, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.3452500000000002e-05, | |
| "loss": 0.1683, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.05773478767922069, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.3427500000000001e-05, | |
| "loss": 0.2099, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.05795431158674624, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.3402500000000001e-05, | |
| "loss": 0.1802, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.0581738354942718, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 1.33775e-05, | |
| "loss": 0.1875, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.05839335940179735, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.3352500000000001e-05, | |
| "loss": 0.1849, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.058612883309322905, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.3327500000000002e-05, | |
| "loss": 0.1549, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.05883240721684846, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.33025e-05, | |
| "loss": 0.1936, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.05905193112437401, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 1.3277500000000001e-05, | |
| "loss": 0.1891, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.05927145503189957, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.3252500000000002e-05, | |
| "loss": 0.1879, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.05949097893942512, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.3227500000000002e-05, | |
| "loss": 0.1838, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.059710502846950675, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.3202500000000001e-05, | |
| "loss": 0.1807, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.05993002675447623, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.31775e-05, | |
| "loss": 0.1519, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.06014955066200178, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.3152500000000002e-05, | |
| "loss": 0.1843, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.06036907456952734, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.3127500000000003e-05, | |
| "loss": 0.2063, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.06058859847705289, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 1.31025e-05, | |
| "loss": 0.1785, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.060808122384578445, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.3077500000000001e-05, | |
| "loss": 0.1707, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.061027646292104, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.30525e-05, | |
| "loss": 0.1761, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.06124717019962955, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.3027500000000002e-05, | |
| "loss": 0.1782, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.061466694107155107, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 1.30025e-05, | |
| "loss": 0.1988, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.06168621801468066, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.29775e-05, | |
| "loss": 0.1701, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.061905741922206214, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.2952500000000002e-05, | |
| "loss": 0.2079, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.06212526582973177, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.2927500000000001e-05, | |
| "loss": 0.1982, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.06234478973725732, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1.29025e-05, | |
| "loss": 0.1798, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.06256431364478288, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.28775e-05, | |
| "loss": 0.1582, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.06278383755230843, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.2852500000000001e-05, | |
| "loss": 0.2233, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.06300336145983398, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.2827500000000002e-05, | |
| "loss": 0.1952, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.06322288536735954, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.2802500000000002e-05, | |
| "loss": 0.1854, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.06344240927488509, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.2777500000000001e-05, | |
| "loss": 0.205, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.06366193318241065, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.27525e-05, | |
| "loss": 0.1899, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.0638814570899362, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.2727500000000001e-05, | |
| "loss": 0.1874, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.06410098099746175, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 1.2702500000000002e-05, | |
| "loss": 0.186, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.06432050490498731, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 1.26775e-05, | |
| "loss": 0.1619, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.06454002881251286, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.2652500000000001e-05, | |
| "loss": 0.1959, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.06475955272003842, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.26275e-05, | |
| "loss": 0.1919, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.06497907662756397, | |
| "grad_norm": 1.25, | |
| "learning_rate": 1.2602500000000002e-05, | |
| "loss": 0.1998, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.06519860053508952, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.25775e-05, | |
| "loss": 0.2058, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.06541812444261508, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.25525e-05, | |
| "loss": 0.1861, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.06563764835014063, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.2527500000000002e-05, | |
| "loss": 0.2199, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.06585717225766619, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.2502500000000003e-05, | |
| "loss": 0.1804, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.06607669616519174, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 1.24775e-05, | |
| "loss": 0.1742, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.0662962200727173, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.2452500000000001e-05, | |
| "loss": 0.1782, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.06651574398024285, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.2427500000000001e-05, | |
| "loss": 0.1822, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.0667352678877684, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.2402500000000002e-05, | |
| "loss": 0.1902, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.06695479179529396, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.23775e-05, | |
| "loss": 0.1937, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.06717431570281951, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 1.23525e-05, | |
| "loss": 0.1919, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.06739383961034506, | |
| "grad_norm": 5.625, | |
| "learning_rate": 1.2327500000000002e-05, | |
| "loss": 0.1763, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.06761336351787062, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.2302500000000001e-05, | |
| "loss": 0.1956, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.06783288742539617, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.22775e-05, | |
| "loss": 0.2159, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.06805241133292173, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 1.22525e-05, | |
| "loss": 0.1775, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.06827193524044728, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.2227500000000001e-05, | |
| "loss": 0.1617, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.06849145914797283, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.2202500000000002e-05, | |
| "loss": 0.1725, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.06871098305549839, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.2177500000000002e-05, | |
| "loss": 0.2037, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.06893050696302394, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.2152500000000001e-05, | |
| "loss": 0.21, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.0691500308705495, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.21275e-05, | |
| "loss": 0.1816, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.06936955477807505, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.2102500000000001e-05, | |
| "loss": 0.1811, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.0695890786856006, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.2077500000000003e-05, | |
| "loss": 0.1659, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.06980860259312616, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.20525e-05, | |
| "loss": 0.1924, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.07002812650065171, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 1.2027500000000001e-05, | |
| "loss": 0.1891, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.07024765040817726, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.20025e-05, | |
| "loss": 0.1836, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.07046717431570282, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.1977500000000002e-05, | |
| "loss": 0.2071, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.07068669822322837, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.19525e-05, | |
| "loss": 0.2144, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.07090622213075393, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.19275e-05, | |
| "loss": 0.1881, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.07112574603827948, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 1.1902500000000002e-05, | |
| "loss": 0.1751, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.07134526994580503, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.1877500000000001e-05, | |
| "loss": 0.1844, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.07156479385333059, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.18525e-05, | |
| "loss": 0.177, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.07178431776085614, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.18275e-05, | |
| "loss": 0.1831, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.0720038416683817, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.1802500000000001e-05, | |
| "loss": 0.1793, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.07222336557590725, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.1777500000000002e-05, | |
| "loss": 0.1824, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.0724428894834328, | |
| "grad_norm": 0.875, | |
| "learning_rate": 1.17525e-05, | |
| "loss": 0.2064, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.07266241339095836, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 1.17275e-05, | |
| "loss": 0.1824, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.07288193729848391, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.1702500000000002e-05, | |
| "loss": 0.1792, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.07310146120600947, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 1.1677500000000001e-05, | |
| "loss": 0.1882, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.07332098511353502, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.16525e-05, | |
| "loss": 0.1945, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.07354050902106057, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.16275e-05, | |
| "loss": 0.2059, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.07376003292858613, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.1602500000000001e-05, | |
| "loss": 0.2108, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.07397955683611168, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 1.1577500000000002e-05, | |
| "loss": 0.1833, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.07419908074363724, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.1552500000000002e-05, | |
| "loss": 0.1719, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.07441860465116279, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.1527500000000001e-05, | |
| "loss": 0.191, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.07463812855868834, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.15025e-05, | |
| "loss": 0.1933, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.0748576524662139, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 1.1477500000000001e-05, | |
| "loss": 0.2048, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.07507717637373945, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.1452500000000003e-05, | |
| "loss": 0.1669, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.075296700281265, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 1.14275e-05, | |
| "loss": 0.1684, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.07551622418879056, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.1402500000000001e-05, | |
| "loss": 0.1744, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.07573574809631611, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.13775e-05, | |
| "loss": 0.1822, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.07595527200384167, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 1.1352500000000002e-05, | |
| "loss": 0.1808, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.07617479591136722, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.13275e-05, | |
| "loss": 0.1872, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.07639431981889278, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.13025e-05, | |
| "loss": 0.192, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.07661384372641833, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 1.1277500000000002e-05, | |
| "loss": 0.219, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.07683336763394388, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 1.1252500000000001e-05, | |
| "loss": 0.2029, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.07705289154146944, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.12275e-05, | |
| "loss": 0.1787, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.07727241544899499, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.12025e-05, | |
| "loss": 0.1613, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.07749193935652054, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.1177500000000001e-05, | |
| "loss": 0.1823, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.0777114632640461, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 1.1152500000000002e-05, | |
| "loss": 0.1823, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.07793098717157165, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 1.11275e-05, | |
| "loss": 0.194, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.0781505110790972, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.1102500000000001e-05, | |
| "loss": 0.202, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.07837003498662276, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 1.10775e-05, | |
| "loss": 0.1599, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.07858955889414831, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.1052500000000001e-05, | |
| "loss": 0.183, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.07880908280167387, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 1.1027499999999999e-05, | |
| "loss": 0.1697, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.07902860670919942, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.10025e-05, | |
| "loss": 0.1872, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.07924813061672498, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 1.0977500000000001e-05, | |
| "loss": 0.2008, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.07946765452425053, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.0952500000000002e-05, | |
| "loss": 0.1857, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.07968717843177608, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.0927500000000002e-05, | |
| "loss": 0.2013, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.07990670233930164, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.0902500000000001e-05, | |
| "loss": 0.1996, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.08012622624682719, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.08775e-05, | |
| "loss": 0.1866, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.08034575015435275, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.0852500000000002e-05, | |
| "loss": 0.1854, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.0805652740618783, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.0827500000000003e-05, | |
| "loss": 0.2041, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.08078479796940385, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.08025e-05, | |
| "loss": 0.2103, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.08100432187692941, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 1.0777500000000001e-05, | |
| "loss": 0.1689, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.08122384578445496, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.07525e-05, | |
| "loss": 0.2075, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.08144336969198052, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.0727500000000002e-05, | |
| "loss": 0.1958, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.08166289359950607, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.07025e-05, | |
| "loss": 0.2133, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.08188241750703162, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 1.06775e-05, | |
| "loss": 0.2199, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.08210194141455718, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.0652500000000002e-05, | |
| "loss": 0.1816, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.08232146532208273, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.0627500000000001e-05, | |
| "loss": 0.1618, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.08254098922960829, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.06025e-05, | |
| "loss": 0.2193, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.08276051313713384, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 1.05775e-05, | |
| "loss": 0.1626, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.0829800370446594, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.0552500000000001e-05, | |
| "loss": 0.1873, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.08319956095218495, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.0527500000000002e-05, | |
| "loss": 0.2129, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.0834190848597105, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.05025e-05, | |
| "loss": 0.1906, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.08363860876723606, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 1.0477500000000001e-05, | |
| "loss": 0.1711, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.08385813267476161, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 1.04525e-05, | |
| "loss": 0.1813, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.08407765658228716, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 1.0427500000000001e-05, | |
| "loss": 0.1996, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.08429718048981272, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1.0402499999999999e-05, | |
| "loss": 0.1536, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.08451670439733827, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 1.03775e-05, | |
| "loss": 0.1617, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.08473622830486383, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 1.0352500000000001e-05, | |
| "loss": 0.1874, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.08495575221238938, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.03275e-05, | |
| "loss": 0.2133, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.08517527611991493, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 1.0302500000000002e-05, | |
| "loss": 0.1796, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.08539480002744049, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.02775e-05, | |
| "loss": 0.1698, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.08561432393496604, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.02525e-05, | |
| "loss": 0.1938, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.0858338478424916, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.0227500000000002e-05, | |
| "loss": 0.1816, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.08605337175001715, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 1.0202500000000003e-05, | |
| "loss": 0.1677, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.0862728956575427, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 1.01775e-05, | |
| "loss": 0.1956, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.08649241956506826, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.0152500000000001e-05, | |
| "loss": 0.1769, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.08671194347259381, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.0127500000000001e-05, | |
| "loss": 0.2014, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.08693146738011936, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 1.0102500000000002e-05, | |
| "loss": 0.2199, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.08715099128764492, | |
| "grad_norm": 0.75, | |
| "learning_rate": 1.00775e-05, | |
| "loss": 0.1815, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.08737051519517047, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.00525e-05, | |
| "loss": 0.1797, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.08759003910269603, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.0027500000000002e-05, | |
| "loss": 0.2142, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.08780956301022158, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 1.0002500000000001e-05, | |
| "loss": 0.2015, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.08802908691774713, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 9.9775e-06, | |
| "loss": 0.1653, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.08824861082527269, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 9.9525e-06, | |
| "loss": 0.1784, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.08846813473279824, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.927500000000001e-06, | |
| "loss": 0.1915, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.0886876586403238, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 9.9025e-06, | |
| "loss": 0.1997, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.08890718254784935, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 9.877500000000002e-06, | |
| "loss": 0.1816, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.0891267064553749, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 9.852500000000001e-06, | |
| "loss": 0.1954, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.08934623036290046, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 9.8275e-06, | |
| "loss": 0.1532, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.08956575427042601, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 9.8025e-06, | |
| "loss": 0.2117, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.08978527817795157, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 9.7775e-06, | |
| "loss": 0.1957, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.09000480208547712, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.7525e-06, | |
| "loss": 0.1683, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.09022432599300267, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 9.727500000000001e-06, | |
| "loss": 0.202, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.09044384990052823, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 9.7025e-06, | |
| "loss": 0.1806, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.09066337380805378, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 9.6775e-06, | |
| "loss": 0.1941, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.09088289771557934, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 9.652500000000001e-06, | |
| "loss": 0.1824, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.09110242162310489, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 9.6275e-06, | |
| "loss": 0.1489, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.09132194553063044, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 9.602500000000002e-06, | |
| "loss": 0.1578, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.091541469438156, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.577500000000001e-06, | |
| "loss": 0.1837, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.09176099334568155, | |
| "grad_norm": 0.75, | |
| "learning_rate": 9.5525e-06, | |
| "loss": 0.1943, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.0919805172532071, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 9.5275e-06, | |
| "loss": 0.1938, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.09220004116073266, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 9.502500000000001e-06, | |
| "loss": 0.1722, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.09241956506825821, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 9.4775e-06, | |
| "loss": 0.1893, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.09263908897578377, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 9.452500000000001e-06, | |
| "loss": 0.1843, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.09285861288330932, | |
| "grad_norm": 0.75, | |
| "learning_rate": 9.4275e-06, | |
| "loss": 0.2091, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.09307813679083488, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 9.402500000000002e-06, | |
| "loss": 0.1744, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.09329766069836043, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 9.377500000000001e-06, | |
| "loss": 0.1934, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.09351718460588598, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 9.3525e-06, | |
| "loss": 0.1982, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.09373670851341154, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 9.3275e-06, | |
| "loss": 0.1705, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.09395623242093709, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 9.302500000000001e-06, | |
| "loss": 0.1701, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.09417575632846265, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 9.2775e-06, | |
| "loss": 0.1956, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.0943952802359882, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 9.252500000000002e-06, | |
| "loss": 0.1792, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.09461480414351375, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 9.227500000000001e-06, | |
| "loss": 0.1744, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.09483432805103931, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 9.2025e-06, | |
| "loss": 0.2185, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.09505385195856486, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 9.1775e-06, | |
| "loss": 0.1655, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.09527337586609042, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 9.152500000000001e-06, | |
| "loss": 0.2083, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.09549289977361597, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.1275e-06, | |
| "loss": 0.2039, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.09571242368114152, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 9.102500000000001e-06, | |
| "loss": 0.1923, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.09593194758866708, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 9.0775e-06, | |
| "loss": 0.2013, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.09615147149619263, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 9.0525e-06, | |
| "loss": 0.1856, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.09637099540371818, | |
| "grad_norm": 0.75, | |
| "learning_rate": 9.027500000000001e-06, | |
| "loss": 0.2029, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.09659051931124374, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 9.0025e-06, | |
| "loss": 0.1753, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.09681004321876929, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 8.977500000000002e-06, | |
| "loss": 0.1767, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.09702956712629485, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 8.952500000000001e-06, | |
| "loss": 0.1727, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.0972490910338204, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 8.9275e-06, | |
| "loss": 0.1802, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.09746861494134595, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.9025e-06, | |
| "loss": 0.217, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.09768813884887151, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 8.877500000000001e-06, | |
| "loss": 0.1644, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.09790766275639706, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 8.8525e-06, | |
| "loss": 0.195, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.09812718666392262, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 8.827500000000001e-06, | |
| "loss": 0.1872, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.09834671057144817, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 8.802500000000001e-06, | |
| "loss": 0.196, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.09856623447897372, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 8.7775e-06, | |
| "loss": 0.186, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.09878575838649928, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 8.7525e-06, | |
| "loss": 0.1871, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.09900528229402483, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 8.7275e-06, | |
| "loss": 0.189, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.09922480620155039, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 8.7025e-06, | |
| "loss": 0.1596, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.09944433010907594, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 8.677500000000001e-06, | |
| "loss": 0.1781, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.0996638540166015, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 8.6525e-06, | |
| "loss": 0.1879, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.09988337792412705, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 8.627500000000002e-06, | |
| "loss": 0.2002, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.1001029018316526, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8.602500000000001e-06, | |
| "loss": 0.2004, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.10032242573917816, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 8.5775e-06, | |
| "loss": 0.1844, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.10054194964670371, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 8.5525e-06, | |
| "loss": 0.1701, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.10076147355422926, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 8.527500000000001e-06, | |
| "loss": 0.2176, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.10098099746175482, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 8.5025e-06, | |
| "loss": 0.1736, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.10120052136928037, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 8.477500000000001e-06, | |
| "loss": 0.1813, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.10142004527680593, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 8.4525e-06, | |
| "loss": 0.1878, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.10163956918433148, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 8.4275e-06, | |
| "loss": 0.1796, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.10185909309185703, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 8.402500000000001e-06, | |
| "loss": 0.1702, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.10207861699938259, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 8.3775e-06, | |
| "loss": 0.1689, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.10229814090690814, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 8.352500000000002e-06, | |
| "loss": 0.1922, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.1025176648144337, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 8.327500000000001e-06, | |
| "loss": 0.1813, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.10273718872195925, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 8.3025e-06, | |
| "loss": 0.1486, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.1029567126294848, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 8.2775e-06, | |
| "loss": 0.1606, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.10317623653701036, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 8.252500000000001e-06, | |
| "loss": 0.2014, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.10339576044453591, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 8.2275e-06, | |
| "loss": 0.1757, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.10361528435206147, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 8.202500000000002e-06, | |
| "loss": 0.1895, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.10383480825958702, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 8.177500000000001e-06, | |
| "loss": 0.1753, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.10405433216711257, | |
| "grad_norm": 0.75, | |
| "learning_rate": 8.1525e-06, | |
| "loss": 0.1705, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.10427385607463813, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 8.1275e-06, | |
| "loss": 0.217, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.10449337998216368, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 8.1025e-06, | |
| "loss": 0.2155, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.10471290388968924, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 8.0775e-06, | |
| "loss": 0.1972, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.10493242779721479, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 8.052500000000001e-06, | |
| "loss": 0.1927, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.10515195170474034, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8.0275e-06, | |
| "loss": 0.2067, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.1053714756122659, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 8.0025e-06, | |
| "loss": 0.1802, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.10559099951979145, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 7.9775e-06, | |
| "loss": 0.1808, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.105810523427317, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 7.9525e-06, | |
| "loss": 0.1641, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.10603004733484256, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 7.9275e-06, | |
| "loss": 0.175, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.10624957124236811, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 7.902500000000001e-06, | |
| "loss": 0.1984, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.10646909514989367, | |
| "grad_norm": 1.0, | |
| "learning_rate": 7.8775e-06, | |
| "loss": 0.182, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.10668861905741922, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 7.852500000000001e-06, | |
| "loss": 0.1785, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.10690814296494477, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 7.827500000000001e-06, | |
| "loss": 0.1917, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.10712766687247033, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 7.8025e-06, | |
| "loss": 0.1747, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.10734719077999588, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 7.777500000000001e-06, | |
| "loss": 0.1806, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.10756671468752144, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 7.7525e-06, | |
| "loss": 0.1672, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.10778623859504699, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 7.727500000000002e-06, | |
| "loss": 0.2223, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.10800576250257254, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 7.702500000000001e-06, | |
| "loss": 0.1946, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.1082252864100981, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 7.6775e-06, | |
| "loss": 0.1682, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.10844481031762365, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 7.6525e-06, | |
| "loss": 0.1494, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.1086643342251492, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 7.627500000000001e-06, | |
| "loss": 0.1856, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.10888385813267476, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 7.6025000000000005e-06, | |
| "loss": 0.1922, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.10910338204020031, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 7.577500000000001e-06, | |
| "loss": 0.1919, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.10932290594772587, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 7.5525e-06, | |
| "loss": 0.1758, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.10954242985525142, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 7.527500000000001e-06, | |
| "loss": 0.2053, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.10976195376277698, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 7.502500000000001e-06, | |
| "loss": 0.1742, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.10998147767030253, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 7.477500000000001e-06, | |
| "loss": 0.2089, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.11020100157782808, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 7.4525e-06, | |
| "loss": 0.1788, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.11042052548535364, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 7.4275000000000005e-06, | |
| "loss": 0.1981, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.11064004939287919, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 7.4025e-06, | |
| "loss": 0.1708, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.11085957330040475, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 7.377500000000001e-06, | |
| "loss": 0.1621, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.1110790972079303, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 7.3525e-06, | |
| "loss": 0.1805, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.11129862111545585, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 7.3275000000000006e-06, | |
| "loss": 0.201, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.11151814502298141, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 7.3025e-06, | |
| "loss": 0.2034, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.11173766893050696, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 7.277500000000001e-06, | |
| "loss": 0.1858, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.11195719283803252, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 7.2525000000000004e-06, | |
| "loss": 0.2008, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.11217671674555807, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 7.227500000000001e-06, | |
| "loss": 0.1954, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.11239624065308362, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 7.2025e-06, | |
| "loss": 0.1638, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.11261576456060918, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 7.1775e-06, | |
| "loss": 0.1845, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.11283528846813473, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 7.152500000000001e-06, | |
| "loss": 0.1953, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.11305481237566029, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 7.127500000000001e-06, | |
| "loss": 0.1937, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.11327433628318584, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 7.102500000000001e-06, | |
| "loss": 0.1736, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.1134938601907114, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 7.0775000000000004e-06, | |
| "loss": 0.2043, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.11371338409823695, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 7.052500000000001e-06, | |
| "loss": 0.1758, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.1139329080057625, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 7.0275e-06, | |
| "loss": 0.1895, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.11415243191328805, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 7.002500000000001e-06, | |
| "loss": 0.2119, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.11437195582081361, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 6.9775000000000005e-06, | |
| "loss": 0.1839, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.11459147972833916, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 6.952500000000001e-06, | |
| "loss": 0.186, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.11481100363586472, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 6.9275e-06, | |
| "loss": 0.1887, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.11503052754339027, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 6.902500000000001e-06, | |
| "loss": 0.1916, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.11525005145091582, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 6.877500000000001e-06, | |
| "loss": 0.1755, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.11546957535844138, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 6.852500000000001e-06, | |
| "loss": 0.2005, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.11568909926596693, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 6.8275e-06, | |
| "loss": 0.229, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.11590862317349249, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 6.8025000000000005e-06, | |
| "loss": 0.1747, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.11612814708101804, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 6.7775e-06, | |
| "loss": 0.1735, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.1163476709885436, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 6.752500000000001e-06, | |
| "loss": 0.1971, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.11656719489606915, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 6.7275e-06, | |
| "loss": 0.1885, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.1167867188035947, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 6.702500000000001e-06, | |
| "loss": 0.1801, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.11700624271112026, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 6.6775e-06, | |
| "loss": 0.2179, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.11722576661864581, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 6.6525e-06, | |
| "loss": 0.1607, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.11744529052617136, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 6.6275e-06, | |
| "loss": 0.1931, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.11766481443369692, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 6.602500000000001e-06, | |
| "loss": 0.2009, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.11788433834122247, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 6.5775e-06, | |
| "loss": 0.1551, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.11810386224874803, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 6.5525e-06, | |
| "loss": 0.1775, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.11832338615627358, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 6.5275000000000015e-06, | |
| "loss": 0.1673, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.11854291006379913, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 6.502500000000001e-06, | |
| "loss": 0.1785, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.11876243397132469, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 6.477500000000001e-06, | |
| "loss": 0.1902, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.11898195787885024, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 6.4525000000000005e-06, | |
| "loss": 0.1794, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.1192014817863758, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 6.427500000000001e-06, | |
| "loss": 0.1954, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.11942100569390135, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 6.4025e-06, | |
| "loss": 0.21, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.1196405296014269, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 6.377500000000001e-06, | |
| "loss": 0.1887, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.11986005350895246, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 6.352500000000001e-06, | |
| "loss": 0.1928, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.12007957741647801, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 6.327500000000001e-06, | |
| "loss": 0.2024, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.12029910132400357, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 6.3025e-06, | |
| "loss": 0.1855, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.12051862523152912, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 6.2775000000000005e-06, | |
| "loss": 0.1693, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.12073814913905467, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 6.2525e-06, | |
| "loss": 0.1777, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.12095767304658023, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 6.227500000000001e-06, | |
| "loss": 0.221, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.12117719695410578, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 6.2025e-06, | |
| "loss": 0.1692, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.12139672086163134, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 6.1775000000000006e-06, | |
| "loss": 0.192, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.12161624476915689, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 6.1525e-06, | |
| "loss": 0.1882, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.12183576867668244, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 6.127500000000001e-06, | |
| "loss": 0.2051, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.122055292584208, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 6.1025000000000004e-06, | |
| "loss": 0.2132, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.12227481649173355, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 6.077500000000001e-06, | |
| "loss": 0.1776, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.1224943403992591, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 6.0525e-06, | |
| "loss": 0.2029, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.12271386430678466, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 6.0275e-06, | |
| "loss": 0.209, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.12293338821431021, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 6.0025e-06, | |
| "loss": 0.1967, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.12315291212183577, | |
| "grad_norm": 1.0, | |
| "learning_rate": 5.977500000000001e-06, | |
| "loss": 0.2383, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.12337243602936132, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 5.9525e-06, | |
| "loss": 0.163, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.12359195993688687, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 5.9275e-06, | |
| "loss": 0.196, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.12381148384441243, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 5.902500000000001e-06, | |
| "loss": 0.2017, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.12403100775193798, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 5.8775e-06, | |
| "loss": 0.1859, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.12425053165946354, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 5.852500000000001e-06, | |
| "loss": 0.2002, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.12447005556698909, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 5.8275000000000005e-06, | |
| "loss": 0.1784, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.12468957947451464, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 5.802500000000001e-06, | |
| "loss": 0.1809, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.1249091033820402, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 5.7775e-06, | |
| "loss": 0.1663, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.12512862728956575, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 5.752500000000001e-06, | |
| "loss": 0.173, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.1253481511970913, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 5.727500000000001e-06, | |
| "loss": 0.1743, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.12556767510461686, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 5.702500000000001e-06, | |
| "loss": 0.2364, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.12578719901214241, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 5.6775e-06, | |
| "loss": 0.212, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.12600672291966797, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 5.6525000000000005e-06, | |
| "loss": 0.1757, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.12622624682719352, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 5.6275e-06, | |
| "loss": 0.1868, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.12644577073471908, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 5.602500000000001e-06, | |
| "loss": 0.1901, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.12666529464224463, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 5.5775e-06, | |
| "loss": 0.2302, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.12688481854977018, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 5.552500000000001e-06, | |
| "loss": 0.1639, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.12710434245729574, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 5.5275e-06, | |
| "loss": 0.1778, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.1273238663648213, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 5.5025e-06, | |
| "loss": 0.1727, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.12754339027234685, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 5.4775e-06, | |
| "loss": 0.1731, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.1277629141798724, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 5.452500000000001e-06, | |
| "loss": 0.1762, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.12798243808739795, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 5.4275e-06, | |
| "loss": 0.1874, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.1282019619949235, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 5.4025e-06, | |
| "loss": 0.199, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.12842148590244906, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 5.3775e-06, | |
| "loss": 0.1935, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.12864100980997462, | |
| "grad_norm": 0.875, | |
| "learning_rate": 5.352500000000001e-06, | |
| "loss": 0.1862, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.12886053371750017, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 5.3275e-06, | |
| "loss": 0.1693, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.12908005762502572, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 5.3025000000000005e-06, | |
| "loss": 0.1972, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.12929958153255128, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 5.277500000000001e-06, | |
| "loss": 0.1742, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.12951910544007683, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 5.2525e-06, | |
| "loss": 0.1765, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.12973862934760239, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 5.227500000000001e-06, | |
| "loss": 0.1713, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.12995815325512794, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 5.202500000000001e-06, | |
| "loss": 0.1647, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.1301776771626535, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 5.177500000000001e-06, | |
| "loss": 0.1795, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.13039720107017905, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 5.1525e-06, | |
| "loss": 0.2107, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.1306167249777046, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 5.1275000000000005e-06, | |
| "loss": 0.1919, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.13083624888523016, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 5.1025e-06, | |
| "loss": 0.1755, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.1310557727927557, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 5.077500000000001e-06, | |
| "loss": 0.1673, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.13127529670028126, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 5.0525e-06, | |
| "loss": 0.2152, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.13149482060780682, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 5.0275000000000006e-06, | |
| "loss": 0.2161, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.13171434451533237, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 5.0025e-06, | |
| "loss": 0.1849, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.13193386842285793, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 4.977500000000001e-06, | |
| "loss": 0.1786, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.13215339233038348, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 4.9525000000000004e-06, | |
| "loss": 0.1818, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.13237291623790903, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 4.927500000000001e-06, | |
| "loss": 0.182, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.1325924401454346, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 4.902500000000001e-06, | |
| "loss": 0.1778, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.13281196405296014, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 4.8775e-06, | |
| "loss": 0.165, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.1330314879604857, | |
| "grad_norm": 0.875, | |
| "learning_rate": 4.8525000000000006e-06, | |
| "loss": 0.2036, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.13325101186801125, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 4.827500000000001e-06, | |
| "loss": 0.1749, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.1334705357755368, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 4.8025e-06, | |
| "loss": 0.1979, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.13369005968306236, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 4.7775e-06, | |
| "loss": 0.1883, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.1339095835905879, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 4.752500000000001e-06, | |
| "loss": 0.1742, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.13412910749811346, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 4.7275e-06, | |
| "loss": 0.1704, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.13434863140563902, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 4.7025e-06, | |
| "loss": 0.1963, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.13456815531316457, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 4.6775000000000005e-06, | |
| "loss": 0.1935, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.13478767922069013, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 4.652500000000001e-06, | |
| "loss": 0.1756, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.13500720312821568, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 4.6275e-06, | |
| "loss": 0.1922, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.13522672703574123, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 4.6025e-06, | |
| "loss": 0.2005, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.1354462509432668, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 4.577500000000001e-06, | |
| "loss": 0.184, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.13566577485079234, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 4.5525e-06, | |
| "loss": 0.1865, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.1358852987583179, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 4.5275e-06, | |
| "loss": 0.201, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.13610482266584345, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 4.5025000000000005e-06, | |
| "loss": 0.2129, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.136324346573369, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 4.4775e-06, | |
| "loss": 0.2103, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.13654387048089456, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 4.4525e-06, | |
| "loss": 0.1933, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.1367633943884201, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 4.4275e-06, | |
| "loss": 0.1837, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.13698291829594567, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 4.4025e-06, | |
| "loss": 0.1696, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.13720244220347122, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 4.3775e-06, | |
| "loss": 0.1885, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.13742196611099677, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 4.3525e-06, | |
| "loss": 0.175, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.13764149001852233, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 4.3275000000000005e-06, | |
| "loss": 0.1905, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.13786101392604788, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 4.302500000000001e-06, | |
| "loss": 0.1854, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.13808053783357344, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 4.2775e-06, | |
| "loss": 0.2116, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.138300061741099, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 4.2525e-06, | |
| "loss": 0.184, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.13851958564862454, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 4.227500000000001e-06, | |
| "loss": 0.1831, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.1387391095561501, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 4.202500000000001e-06, | |
| "loss": 0.1765, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.13895863346367565, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 4.1775e-06, | |
| "loss": 0.1742, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.1391781573712012, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 4.1525000000000005e-06, | |
| "loss": 0.2031, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.13939768127872676, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 4.127500000000001e-06, | |
| "loss": 0.1976, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.1396172051862523, | |
| "grad_norm": 0.625, | |
| "learning_rate": 4.1025e-06, | |
| "loss": 0.1701, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.13983672909377787, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 4.0775e-06, | |
| "loss": 0.1827, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.14005625300130342, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 4.052500000000001e-06, | |
| "loss": 0.1767, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.14027577690882898, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 4.0275e-06, | |
| "loss": 0.1869, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.14049530081635453, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 4.0025e-06, | |
| "loss": 0.1954, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.14071482472388008, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 3.9775000000000005e-06, | |
| "loss": 0.1762, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.14093434863140564, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 3.9525e-06, | |
| "loss": 0.1865, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.1411538725389312, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 3.9275e-06, | |
| "loss": 0.1816, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.14137339644645674, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 3.9025e-06, | |
| "loss": 0.1757, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.1415929203539823, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 3.8775000000000006e-06, | |
| "loss": 0.1836, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.14181244426150785, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 3.8525e-06, | |
| "loss": 0.1953, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.1420319681690334, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 3.8275e-06, | |
| "loss": 0.1605, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.14225149207655896, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 3.8025e-06, | |
| "loss": 0.1865, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.14247101598408451, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 3.7775000000000003e-06, | |
| "loss": 0.1746, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.14269053989161007, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 3.7525e-06, | |
| "loss": 0.1572, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.14291006379913562, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 3.7275000000000007e-06, | |
| "loss": 0.1942, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.14312958770666118, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 3.7025000000000005e-06, | |
| "loss": 0.1841, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.14334911161418673, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 3.6775000000000004e-06, | |
| "loss": 0.1964, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.14356863552171228, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 3.6525000000000006e-06, | |
| "loss": 0.198, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.14378815942923784, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 3.6275000000000004e-06, | |
| "loss": 0.1773, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.1440076833367634, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 3.6025000000000002e-06, | |
| "loss": 0.1699, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.14422720724428895, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 3.5775000000000005e-06, | |
| "loss": 0.2117, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 0.1444467311518145, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 3.5525000000000003e-06, | |
| "loss": 0.1783, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.14466625505934005, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.5275000000000005e-06, | |
| "loss": 0.1608, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 0.1448857789668656, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 3.5025000000000003e-06, | |
| "loss": 0.1933, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.14510530287439116, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 3.4775e-06, | |
| "loss": 0.2031, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 0.14532482678191672, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 3.4525000000000004e-06, | |
| "loss": 0.188, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.14554435068944227, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 3.4275000000000002e-06, | |
| "loss": 0.1767, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 0.14576387459696782, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 3.4025000000000005e-06, | |
| "loss": 0.1888, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.14598339850449338, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 3.3775000000000003e-06, | |
| "loss": 0.1918, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.14620292241201893, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 3.3525e-06, | |
| "loss": 0.167, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 0.14642244631954449, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 3.3275000000000003e-06, | |
| "loss": 0.1635, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 0.14664197022707004, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 3.3025e-06, | |
| "loss": 0.2107, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 0.1468614941345956, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 3.2775e-06, | |
| "loss": 0.1872, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 0.14708101804212115, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 3.2525e-06, | |
| "loss": 0.1627, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.1473005419496467, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 3.2275e-06, | |
| "loss": 0.1499, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 0.14752006585717226, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 3.2025000000000003e-06, | |
| "loss": 0.1921, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 0.1477395897646978, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 3.1775e-06, | |
| "loss": 0.1811, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 0.14795911367222336, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 3.1525e-06, | |
| "loss": 0.176, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 0.14817863757974892, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 3.1275e-06, | |
| "loss": 0.2066, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.14839816148727447, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 3.1025000000000004e-06, | |
| "loss": 0.1424, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 0.14861768539480003, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 3.0775000000000006e-06, | |
| "loss": 0.1997, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 0.14883720930232558, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 3.0525000000000004e-06, | |
| "loss": 0.1976, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 0.14905673320985113, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 3.0275000000000002e-06, | |
| "loss": 0.1596, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 0.1492762571173767, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 3.0025000000000005e-06, | |
| "loss": 0.1694, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.14949578102490224, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 2.9775000000000003e-06, | |
| "loss": 0.1774, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 0.1497153049324278, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 2.9525000000000005e-06, | |
| "loss": 0.1849, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 0.14993482883995335, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 2.9275000000000003e-06, | |
| "loss": 0.2215, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 0.1501543527474789, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 2.9025e-06, | |
| "loss": 0.1916, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 0.15037387665500446, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 2.8775000000000004e-06, | |
| "loss": 0.185, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.15059340056253, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 2.8525000000000002e-06, | |
| "loss": 0.1826, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 0.15081292447005556, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 2.8275e-06, | |
| "loss": 0.1935, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 0.15103244837758112, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 2.8025000000000003e-06, | |
| "loss": 0.1683, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 0.15125197228510667, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 2.7775e-06, | |
| "loss": 0.2083, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 0.15147149619263223, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 2.7525000000000003e-06, | |
| "loss": 0.1656, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.15169102010015778, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 2.7275e-06, | |
| "loss": 0.1748, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 0.15191054400768333, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 2.7025e-06, | |
| "loss": 0.2087, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 0.1521300679152089, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 2.6775e-06, | |
| "loss": 0.1721, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 0.15234959182273444, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 2.6525e-06, | |
| "loss": 0.2098, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 0.15256911573026, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 2.6275000000000003e-06, | |
| "loss": 0.1765, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.15278863963778555, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 2.6025e-06, | |
| "loss": 0.1839, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 0.1530081635453111, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 2.5775e-06, | |
| "loss": 0.1648, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 0.15322768745283666, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 2.5525e-06, | |
| "loss": 0.1808, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 0.1534472113603622, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 2.5275e-06, | |
| "loss": 0.1903, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 0.15366673526788777, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 2.5024999999999998e-06, | |
| "loss": 0.1867, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.15388625917541332, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 2.4775e-06, | |
| "loss": 0.1942, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 0.15410578308293887, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 2.4525000000000002e-06, | |
| "loss": 0.1753, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 0.15432530699046443, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 2.4275e-06, | |
| "loss": 0.1916, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 0.15454483089798998, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 2.4025000000000003e-06, | |
| "loss": 0.1735, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 0.15476435480551554, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 2.3775e-06, | |
| "loss": 0.1675, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.1549838787130411, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 2.3525e-06, | |
| "loss": 0.176, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 0.15520340262056664, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 2.3275e-06, | |
| "loss": 0.1785, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 0.1554229265280922, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 2.3025000000000004e-06, | |
| "loss": 0.1981, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 0.15564245043561775, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 2.2775000000000002e-06, | |
| "loss": 0.2026, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 0.1558619743431433, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 2.2525e-06, | |
| "loss": 0.1676, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.15608149825066886, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 2.2275000000000003e-06, | |
| "loss": 0.1657, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 0.1563010221581944, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 2.2025e-06, | |
| "loss": 0.1702, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 0.15652054606571997, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 2.1775000000000003e-06, | |
| "loss": 0.1788, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 0.15674006997324552, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 2.1525e-06, | |
| "loss": 0.1713, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 0.15695959388077108, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 2.1275e-06, | |
| "loss": 0.1754, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.15717911778829663, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 2.1025e-06, | |
| "loss": 0.1924, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 0.15739864169582218, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 2.0775e-06, | |
| "loss": 0.1997, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 0.15761816560334774, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 2.0525000000000003e-06, | |
| "loss": 0.1917, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 0.1578376895108733, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 2.0275000000000005e-06, | |
| "loss": 0.2014, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 0.15805721341839885, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 2.0025000000000003e-06, | |
| "loss": 0.1756, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.1582767373259244, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 1.9775e-06, | |
| "loss": 0.1767, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 0.15849626123344995, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.9525000000000004e-06, | |
| "loss": 0.1863, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 0.1587157851409755, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.9275e-06, | |
| "loss": 0.2036, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 0.15893530904850106, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 1.9025000000000002e-06, | |
| "loss": 0.1922, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 0.15915483295602662, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.8775000000000002e-06, | |
| "loss": 0.1937, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.15937435686355217, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.8525e-06, | |
| "loss": 0.2084, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 0.15959388077107772, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 1.8275e-06, | |
| "loss": 0.2239, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 0.15981340467860328, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 1.8025000000000001e-06, | |
| "loss": 0.1826, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 0.16003292858612883, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.7775000000000001e-06, | |
| "loss": 0.1847, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 0.16025245249365438, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 1.7525e-06, | |
| "loss": 0.2061, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.16047197640117994, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 1.7275e-06, | |
| "loss": 0.1872, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 0.1606915003087055, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 1.7025000000000002e-06, | |
| "loss": 0.1826, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 0.16091102421623105, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.6775000000000002e-06, | |
| "loss": 0.1821, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 0.1611305481237566, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.6525000000000003e-06, | |
| "loss": 0.1842, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 0.16135007203128215, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 1.6275e-06, | |
| "loss": 0.1754, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.1615695959388077, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 1.6025000000000001e-06, | |
| "loss": 0.1928, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.16178911984633326, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.5775000000000001e-06, | |
| "loss": 0.1871, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 0.16200864375385882, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.5525000000000002e-06, | |
| "loss": 0.2064, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.16222816766138437, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.5275000000000002e-06, | |
| "loss": 0.2008, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 0.16244769156890992, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 1.5025e-06, | |
| "loss": 0.1788, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.16266721547643548, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.4775e-06, | |
| "loss": 0.1762, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 0.16288673938396103, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 1.4525e-06, | |
| "loss": 0.1807, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.1631062632914866, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.4275e-06, | |
| "loss": 0.2052, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 0.16332578719901214, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.4025000000000003e-06, | |
| "loss": 0.1669, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.1635453111065377, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 1.3775000000000002e-06, | |
| "loss": 0.1858, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.16376483501406325, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 1.3525000000000002e-06, | |
| "loss": 0.1636, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.1639843589215888, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.3275000000000002e-06, | |
| "loss": 0.1912, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 0.16420388282911436, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.3025000000000002e-06, | |
| "loss": 0.2127, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.1644234067366399, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 1.2775e-06, | |
| "loss": 0.1856, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 0.16464293064416546, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.2525e-06, | |
| "loss": 0.1888, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.16486245455169102, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 1.2275000000000001e-06, | |
| "loss": 0.2093, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 0.16508197845921657, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.2025000000000001e-06, | |
| "loss": 0.1947, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.16530150236674213, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.1775e-06, | |
| "loss": 0.2203, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 0.16552102627426768, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 1.1525000000000002e-06, | |
| "loss": 0.1957, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.16574055018179323, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 1.1275000000000002e-06, | |
| "loss": 0.1815, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.1659600740893188, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.1025e-06, | |
| "loss": 0.163, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.16617959799684434, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.0775e-06, | |
| "loss": 0.167, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 0.1663991219043699, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 1.0525e-06, | |
| "loss": 0.1934, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.16661864581189545, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.0275000000000001e-06, | |
| "loss": 0.1982, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 0.166838169719421, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 1.0025000000000001e-06, | |
| "loss": 0.202, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.16705769362694656, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 9.775000000000002e-07, | |
| "loss": 0.1615, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 0.1672772175344721, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 9.525000000000001e-07, | |
| "loss": 0.2037, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.16749674144199767, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 9.275000000000001e-07, | |
| "loss": 0.2211, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 0.16771626534952322, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 9.025e-07, | |
| "loss": 0.1871, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.16793578925704877, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 8.775000000000001e-07, | |
| "loss": 0.2264, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.16815531316457433, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 8.525000000000001e-07, | |
| "loss": 0.1951, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.16837483707209988, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 8.275000000000001e-07, | |
| "loss": 0.1819, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 0.16859436097962543, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 8.025e-07, | |
| "loss": 0.1665, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.168813884887151, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 7.775000000000001e-07, | |
| "loss": 0.1548, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 0.16903340879467654, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 7.525e-07, | |
| "loss": 0.1992, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.1692529327022021, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 7.275e-07, | |
| "loss": 0.1725, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 0.16947245660972765, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 7.025000000000002e-07, | |
| "loss": 0.1665, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 0.1696919805172532, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 6.775000000000001e-07, | |
| "loss": 0.1567, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 0.16991150442477876, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 6.525000000000001e-07, | |
| "loss": 0.1834, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 0.1701310283323043, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 6.275e-07, | |
| "loss": 0.1979, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.17035055223982987, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 6.025000000000001e-07, | |
| "loss": 0.2028, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 0.17057007614735542, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 5.775000000000001e-07, | |
| "loss": 0.181, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 0.17078960005488097, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 5.525e-07, | |
| "loss": 0.1798, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 0.17100912396240653, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 5.275e-07, | |
| "loss": 0.1906, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 0.17122864786993208, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 5.025000000000001e-07, | |
| "loss": 0.1689, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.17144817177745764, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 4.775000000000001e-07, | |
| "loss": 0.1893, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 0.1716676956849832, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 4.525e-07, | |
| "loss": 0.1728, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 0.17188721959250874, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 4.275000000000001e-07, | |
| "loss": 0.1853, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 0.1721067435000343, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 4.0250000000000006e-07, | |
| "loss": 0.1792, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 0.17232626740755985, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 3.7750000000000004e-07, | |
| "loss": 0.1823, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.1725457913150854, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 3.525e-07, | |
| "loss": 0.1791, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 0.17276531522261096, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 3.2750000000000004e-07, | |
| "loss": 0.192, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 0.17298483913013651, | |
| "grad_norm": 0.875, | |
| "learning_rate": 3.025e-07, | |
| "loss": 0.1608, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 0.17320436303766207, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 2.7750000000000004e-07, | |
| "loss": 0.1995, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 0.17342388694518762, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 2.525e-07, | |
| "loss": 0.186, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.17364341085271318, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 2.2750000000000002e-07, | |
| "loss": 0.1908, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 0.17386293476023873, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 2.0250000000000002e-07, | |
| "loss": 0.1637, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 0.17408245866776428, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 1.775e-07, | |
| "loss": 0.1692, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 0.17430198257528984, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 1.5250000000000002e-07, | |
| "loss": 0.1555, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 0.1745215064828154, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.275e-07, | |
| "loss": 0.1798, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.17474103039034095, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 1.0250000000000001e-07, | |
| "loss": 0.2014, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 0.1749605542978665, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 7.750000000000001e-08, | |
| "loss": 0.1988, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 0.17518007820539205, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 5.250000000000001e-08, | |
| "loss": 0.2301, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 0.1753996021129176, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 2.75e-08, | |
| "loss": 0.1639, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 0.17561912602044316, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 2.5e-09, | |
| "loss": 0.1793, | |
| "step": 8000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5267428972077793e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |