|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.980891719745223,
  "eval_steps": 500,
  "global_step": 156,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01910828025477707,
      "grad_norm": 7.708484172821045,
      "learning_rate": 6.25e-07,
      "loss": 1.2073,
      "step": 1
    },
    {
      "epoch": 0.03821656050955414,
      "grad_norm": 7.611099720001221,
      "learning_rate": 1.25e-06,
      "loss": 1.2027,
      "step": 2
    },
    {
      "epoch": 0.05732484076433121,
      "grad_norm": 7.214547157287598,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.1863,
      "step": 3
    },
    {
      "epoch": 0.07643312101910828,
      "grad_norm": 7.0485100746154785,
      "learning_rate": 2.5e-06,
      "loss": 1.1672,
      "step": 4
    },
    {
      "epoch": 0.09554140127388536,
      "grad_norm": 6.79018497467041,
      "learning_rate": 3.125e-06,
      "loss": 1.1641,
      "step": 5
    },
    {
      "epoch": 0.11464968152866242,
      "grad_norm": 5.033609390258789,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.1304,
      "step": 6
    },
    {
      "epoch": 0.1337579617834395,
      "grad_norm": 2.866844892501831,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.0871,
      "step": 7
    },
    {
      "epoch": 0.15286624203821655,
      "grad_norm": 2.427063226699829,
      "learning_rate": 5e-06,
      "loss": 1.0407,
      "step": 8
    },
    {
      "epoch": 0.17197452229299362,
      "grad_norm": 5.020963191986084,
      "learning_rate": 5.625e-06,
      "loss": 1.053,
      "step": 9
    },
    {
      "epoch": 0.1910828025477707,
      "grad_norm": 5.467742443084717,
      "learning_rate": 6.25e-06,
      "loss": 1.0493,
      "step": 10
    },
    {
      "epoch": 0.21019108280254778,
      "grad_norm": 5.2093048095703125,
      "learning_rate": 6.875e-06,
      "loss": 1.0052,
      "step": 11
    },
    {
      "epoch": 0.22929936305732485,
      "grad_norm": 3.968076467514038,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.0001,
      "step": 12
    },
    {
      "epoch": 0.2484076433121019,
      "grad_norm": 4.139124870300293,
      "learning_rate": 8.125000000000001e-06,
      "loss": 1.0206,
      "step": 13
    },
    {
      "epoch": 0.267515923566879,
      "grad_norm": 3.207618236541748,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.97,
      "step": 14
    },
    {
      "epoch": 0.28662420382165604,
      "grad_norm": 2.585268259048462,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.9518,
      "step": 15
    },
    {
      "epoch": 0.3057324840764331,
      "grad_norm": 2.2902705669403076,
      "learning_rate": 1e-05,
      "loss": 0.8687,
      "step": 16
    },
    {
      "epoch": 0.3248407643312102,
      "grad_norm": 2.4253506660461426,
      "learning_rate": 9.998741174712534e-06,
      "loss": 0.9033,
      "step": 17
    },
    {
      "epoch": 0.34394904458598724,
      "grad_norm": 2.1728992462158203,
      "learning_rate": 9.994965332706574e-06,
      "loss": 0.9332,
      "step": 18
    },
    {
      "epoch": 0.3630573248407643,
      "grad_norm": 1.7330336570739746,
      "learning_rate": 9.98867437523228e-06,
      "loss": 0.8931,
      "step": 19
    },
    {
      "epoch": 0.3821656050955414,
      "grad_norm": 1.7003523111343384,
      "learning_rate": 9.979871469976197e-06,
      "loss": 0.8825,
      "step": 20
    },
    {
      "epoch": 0.4012738853503185,
      "grad_norm": 1.7072489261627197,
      "learning_rate": 9.968561049466214e-06,
      "loss": 0.8748,
      "step": 21
    },
    {
      "epoch": 0.42038216560509556,
      "grad_norm": 1.5173128843307495,
      "learning_rate": 9.954748808839675e-06,
      "loss": 0.9187,
      "step": 22
    },
    {
      "epoch": 0.4394904458598726,
      "grad_norm": 1.184166431427002,
      "learning_rate": 9.938441702975689e-06,
      "loss": 0.8753,
      "step": 23
    },
    {
      "epoch": 0.4585987261146497,
      "grad_norm": 1.209220290184021,
      "learning_rate": 9.91964794299315e-06,
      "loss": 0.8688,
      "step": 24
    },
    {
      "epoch": 0.47770700636942676,
      "grad_norm": 1.1812776327133179,
      "learning_rate": 9.898376992116179e-06,
      "loss": 0.8692,
      "step": 25
    },
    {
      "epoch": 0.4968152866242038,
      "grad_norm": 1.200333595275879,
      "learning_rate": 9.874639560909118e-06,
      "loss": 0.9032,
      "step": 26
    },
    {
      "epoch": 0.5159235668789809,
      "grad_norm": 1.0679785013198853,
      "learning_rate": 9.848447601883436e-06,
      "loss": 0.843,
      "step": 27
    },
    {
      "epoch": 0.535031847133758,
      "grad_norm": 0.872157633304596,
      "learning_rate": 9.819814303479268e-06,
      "loss": 0.8743,
      "step": 28
    },
    {
      "epoch": 0.554140127388535,
      "grad_norm": 0.9527489542961121,
      "learning_rate": 9.788754083424654e-06,
      "loss": 0.9002,
      "step": 29
    },
    {
      "epoch": 0.5732484076433121,
      "grad_norm": 0.9146716594696045,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.8339,
      "step": 30
    },
    {
      "epoch": 0.5923566878980892,
      "grad_norm": 0.8611162900924683,
      "learning_rate": 9.719416651541839e-06,
      "loss": 0.8442,
      "step": 31
    },
    {
      "epoch": 0.6114649681528662,
      "grad_norm": 0.7984014749526978,
      "learning_rate": 9.681174353198687e-06,
      "loss": 0.8327,
      "step": 32
    },
    {
      "epoch": 0.6305732484076433,
      "grad_norm": 1.1210668087005615,
      "learning_rate": 9.640574942595195e-06,
      "loss": 0.8135,
      "step": 33
    },
    {
      "epoch": 0.6496815286624203,
      "grad_norm": 0.8967783451080322,
      "learning_rate": 9.597638862757255e-06,
      "loss": 0.8697,
      "step": 34
    },
    {
      "epoch": 0.6687898089171974,
      "grad_norm": 0.8040908575057983,
      "learning_rate": 9.552387733294081e-06,
      "loss": 0.8529,
      "step": 35
    },
    {
      "epoch": 0.6878980891719745,
      "grad_norm": 1.0382592678070068,
      "learning_rate": 9.504844339512096e-06,
      "loss": 0.8301,
      "step": 36
    },
    {
      "epoch": 0.7070063694267515,
      "grad_norm": 0.951885461807251,
      "learning_rate": 9.45503262094184e-06,
      "loss": 0.8276,
      "step": 37
    },
    {
      "epoch": 0.7261146496815286,
      "grad_norm": 0.8443611264228821,
      "learning_rate": 9.40297765928369e-06,
      "loss": 0.8255,
      "step": 38
    },
    {
      "epoch": 0.7452229299363057,
      "grad_norm": 0.8591488003730774,
      "learning_rate": 9.348705665778479e-06,
      "loss": 0.8254,
      "step": 39
    },
    {
      "epoch": 0.7643312101910829,
      "grad_norm": 0.7851564288139343,
      "learning_rate": 9.292243968009332e-06,
      "loss": 0.8579,
      "step": 40
    },
    {
      "epoch": 0.7834394904458599,
      "grad_norm": 0.8532727956771851,
      "learning_rate": 9.233620996141421e-06,
      "loss": 0.8215,
      "step": 41
    },
    {
      "epoch": 0.802547770700637,
      "grad_norm": 0.7524774074554443,
      "learning_rate": 9.172866268606514e-06,
      "loss": 0.8329,
      "step": 42
    },
    {
      "epoch": 0.821656050955414,
      "grad_norm": 0.8203862309455872,
      "learning_rate": 9.110010377239552e-06,
      "loss": 0.8479,
      "step": 43
    },
    {
      "epoch": 0.8407643312101911,
      "grad_norm": 0.7861639261245728,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.8653,
      "step": 44
    },
    {
      "epoch": 0.8598726114649682,
      "grad_norm": 0.7861957550048828,
      "learning_rate": 8.978122744408905e-06,
      "loss": 0.8374,
      "step": 45
    },
    {
      "epoch": 0.8789808917197452,
      "grad_norm": 0.7732595205307007,
      "learning_rate": 8.90915741234015e-06,
      "loss": 0.8295,
      "step": 46
    },
    {
      "epoch": 0.8980891719745223,
      "grad_norm": 0.7838217616081238,
      "learning_rate": 8.838223701790057e-06,
      "loss": 0.8284,
      "step": 47
    },
    {
      "epoch": 0.9171974522292994,
      "grad_norm": 0.757358193397522,
      "learning_rate": 8.765357330018056e-06,
      "loss": 0.8344,
      "step": 48
    },
    {
      "epoch": 0.9363057324840764,
      "grad_norm": 0.6782490611076355,
      "learning_rate": 8.690594987436705e-06,
      "loss": 0.8132,
      "step": 49
    },
    {
      "epoch": 0.9554140127388535,
      "grad_norm": 0.7422513365745544,
      "learning_rate": 8.613974319136959e-06,
      "loss": 0.8021,
      "step": 50
    },
    {
      "epoch": 0.9745222929936306,
      "grad_norm": 0.8458060622215271,
      "learning_rate": 8.535533905932739e-06,
      "loss": 0.797,
      "step": 51
    },
    {
      "epoch": 0.9936305732484076,
      "grad_norm": 0.7343083024024963,
      "learning_rate": 8.455313244934324e-06,
      "loss": 0.819,
      "step": 52
    },
    {
      "epoch": 1.0127388535031847,
      "grad_norm": 0.8479852080345154,
      "learning_rate": 8.373352729660373e-06,
      "loss": 0.8041,
      "step": 53
    },
    {
      "epoch": 1.0318471337579618,
      "grad_norm": 0.7780753970146179,
      "learning_rate": 8.289693629698564e-06,
      "loss": 0.7353,
      "step": 54
    },
    {
      "epoch": 1.0509554140127388,
      "grad_norm": 0.7262213826179504,
      "learning_rate": 8.204378069925121e-06,
      "loss": 0.7098,
      "step": 55
    },
    {
      "epoch": 1.070063694267516,
      "grad_norm": 0.7861199378967285,
      "learning_rate": 8.117449009293668e-06,
      "loss": 0.7316,
      "step": 56
    },
    {
      "epoch": 1.089171974522293,
      "grad_norm": 0.7583940625190735,
      "learning_rate": 8.0289502192041e-06,
      "loss": 0.7164,
      "step": 57
    },
    {
      "epoch": 1.10828025477707,
      "grad_norm": 0.8067767024040222,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.7395,
      "step": 58
    },
    {
      "epoch": 1.127388535031847,
      "grad_norm": 0.8200020790100098,
      "learning_rate": 7.84742246584226e-06,
      "loss": 0.7217,
      "step": 59
    },
    {
      "epoch": 1.1464968152866242,
      "grad_norm": 0.7458541989326477,
      "learning_rate": 7.754484907260513e-06,
      "loss": 0.7164,
      "step": 60
    },
    {
      "epoch": 1.1656050955414012,
      "grad_norm": 0.7313222289085388,
      "learning_rate": 7.660160382576683e-06,
      "loss": 0.7025,
      "step": 61
    },
    {
      "epoch": 1.1847133757961783,
      "grad_norm": 0.8323162198066711,
      "learning_rate": 7.564496387029532e-06,
      "loss": 0.708,
      "step": 62
    },
    {
      "epoch": 1.2038216560509554,
      "grad_norm": 0.7040552496910095,
      "learning_rate": 7.467541090321735e-06,
      "loss": 0.716,
      "step": 63
    },
    {
      "epoch": 1.2229299363057324,
      "grad_norm": 0.6890729665756226,
      "learning_rate": 7.369343312364994e-06,
      "loss": 0.7143,
      "step": 64
    },
    {
      "epoch": 1.2420382165605095,
      "grad_norm": 0.7383683323860168,
      "learning_rate": 7.269952498697734e-06,
      "loss": 0.6923,
      "step": 65
    },
    {
      "epoch": 1.2611464968152866,
      "grad_norm": 0.7806842923164368,
      "learning_rate": 7.169418695587791e-06,
      "loss": 0.7027,
      "step": 66
    },
    {
      "epoch": 1.2802547770700636,
      "grad_norm": 0.7316763997077942,
      "learning_rate": 7.067792524832604e-06,
      "loss": 0.7037,
      "step": 67
    },
    {
      "epoch": 1.2993630573248407,
      "grad_norm": 0.7495648264884949,
      "learning_rate": 6.965125158269619e-06,
      "loss": 0.7166,
      "step": 68
    },
    {
      "epoch": 1.3184713375796178,
      "grad_norm": 0.6818484663963318,
      "learning_rate": 6.8614682920097265e-06,
      "loss": 0.7045,
      "step": 69
    },
    {
      "epoch": 1.3375796178343948,
      "grad_norm": 0.6459638476371765,
      "learning_rate": 6.7568741204067145e-06,
      "loss": 0.6958,
      "step": 70
    },
    {
      "epoch": 1.356687898089172,
      "grad_norm": 0.759671151638031,
      "learning_rate": 6.651395309775837e-06,
      "loss": 0.6958,
      "step": 71
    },
    {
      "epoch": 1.3757961783439492,
      "grad_norm": 0.8332314491271973,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.7271,
      "step": 72
    },
    {
      "epoch": 1.394904458598726,
      "grad_norm": 0.7383757829666138,
      "learning_rate": 6.437996637160086e-06,
      "loss": 0.7338,
      "step": 73
    },
    {
      "epoch": 1.4140127388535033,
      "grad_norm": 0.7254230380058289,
      "learning_rate": 6.330184227833376e-06,
      "loss": 0.7093,
      "step": 74
    },
    {
      "epoch": 1.4331210191082802,
      "grad_norm": 0.8567808866500854,
      "learning_rate": 6.2217020306894705e-06,
      "loss": 0.7191,
      "step": 75
    },
    {
      "epoch": 1.4522292993630574,
      "grad_norm": 0.6672337055206299,
      "learning_rate": 6.112604669781572e-06,
      "loss": 0.6588,
      "step": 76
    },
    {
      "epoch": 1.4713375796178343,
      "grad_norm": 0.6806163191795349,
      "learning_rate": 6.002947078916365e-06,
      "loss": 0.695,
      "step": 77
    },
    {
      "epoch": 1.4904458598726116,
      "grad_norm": 0.7049647569656372,
      "learning_rate": 5.892784473993184e-06,
      "loss": 0.6863,
      "step": 78
    },
    {
      "epoch": 1.5095541401273884,
      "grad_norm": 0.7396883368492126,
      "learning_rate": 5.782172325201155e-06,
      "loss": 0.7092,
      "step": 79
    },
    {
      "epoch": 1.5286624203821657,
      "grad_norm": 0.6506292223930359,
      "learning_rate": 5.671166329088278e-06,
      "loss": 0.6712,
      "step": 80
    },
    {
      "epoch": 1.5477707006369426,
      "grad_norm": 0.6761842370033264,
      "learning_rate": 5.559822380516539e-06,
      "loss": 0.7179,
      "step": 81
    },
    {
      "epoch": 1.5668789808917198,
      "grad_norm": 0.7145752310752869,
      "learning_rate": 5.448196544517168e-06,
      "loss": 0.7166,
      "step": 82
    },
    {
      "epoch": 1.5859872611464967,
      "grad_norm": 0.763866126537323,
      "learning_rate": 5.336345028060199e-06,
      "loss": 0.7323,
      "step": 83
    },
    {
      "epoch": 1.605095541401274,
      "grad_norm": 0.6711301207542419,
      "learning_rate": 5.224324151752575e-06,
      "loss": 0.7167,
      "step": 84
    },
    {
      "epoch": 1.6242038216560508,
      "grad_norm": 0.6363449692726135,
      "learning_rate": 5.112190321479026e-06,
      "loss": 0.7016,
      "step": 85
    },
    {
      "epoch": 1.643312101910828,
      "grad_norm": 0.7524890303611755,
      "learning_rate": 5e-06,
      "loss": 0.7403,
      "step": 86
    },
    {
      "epoch": 1.662420382165605,
      "grad_norm": 0.6629307270050049,
      "learning_rate": 4.887809678520976e-06,
      "loss": 0.6969,
      "step": 87
    },
    {
      "epoch": 1.6815286624203822,
      "grad_norm": 0.6388441920280457,
      "learning_rate": 4.775675848247427e-06,
      "loss": 0.6936,
      "step": 88
    },
    {
      "epoch": 1.700636942675159,
      "grad_norm": 0.7082536816596985,
      "learning_rate": 4.663654971939802e-06,
      "loss": 0.7065,
      "step": 89
    },
    {
      "epoch": 1.7197452229299364,
      "grad_norm": 0.5878234505653381,
      "learning_rate": 4.551803455482833e-06,
      "loss": 0.6884,
      "step": 90
    },
    {
      "epoch": 1.7388535031847132,
      "grad_norm": 0.6400618553161621,
      "learning_rate": 4.4401776194834615e-06,
      "loss": 0.6928,
      "step": 91
    },
    {
      "epoch": 1.7579617834394905,
      "grad_norm": 0.7262267470359802,
      "learning_rate": 4.3288336709117246e-06,
      "loss": 0.6991,
      "step": 92
    },
    {
      "epoch": 1.7770700636942676,
      "grad_norm": 0.6692636013031006,
      "learning_rate": 4.217827674798845e-06,
      "loss": 0.6845,
      "step": 93
    },
    {
      "epoch": 1.7961783439490446,
      "grad_norm": 0.6550084352493286,
      "learning_rate": 4.107215526006818e-06,
      "loss": 0.706,
      "step": 94
    },
    {
      "epoch": 1.8152866242038217,
      "grad_norm": 0.7849605679512024,
      "learning_rate": 3.997052921083637e-06,
      "loss": 0.7135,
      "step": 95
    },
    {
      "epoch": 1.8343949044585988,
      "grad_norm": 0.7244142889976501,
      "learning_rate": 3.887395330218429e-06,
      "loss": 0.7136,
      "step": 96
    },
    {
      "epoch": 1.8535031847133758,
      "grad_norm": 0.6480051279067993,
      "learning_rate": 3.778297969310529e-06,
      "loss": 0.7117,
      "step": 97
    },
    {
      "epoch": 1.872611464968153,
      "grad_norm": 0.6031900644302368,
      "learning_rate": 3.669815772166625e-06,
      "loss": 0.6943,
      "step": 98
    },
    {
      "epoch": 1.89171974522293,
      "grad_norm": 0.6481626629829407,
      "learning_rate": 3.562003362839914e-06,
      "loss": 0.7036,
      "step": 99
    },
    {
      "epoch": 1.910828025477707,
      "grad_norm": 0.6028628945350647,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.691,
      "step": 100
    },
    {
      "epoch": 1.929936305732484,
      "grad_norm": 0.6366122364997864,
      "learning_rate": 3.3486046902241663e-06,
      "loss": 0.715,
      "step": 101
    },
    {
      "epoch": 1.9490445859872612,
      "grad_norm": 0.7005493640899658,
      "learning_rate": 3.2431258795932863e-06,
      "loss": 0.6892,
      "step": 102
    },
    {
      "epoch": 1.9681528662420382,
      "grad_norm": 0.645294725894928,
      "learning_rate": 3.1385317079902743e-06,
      "loss": 0.6585,
      "step": 103
    },
    {
      "epoch": 1.9872611464968153,
      "grad_norm": 0.6466977596282959,
      "learning_rate": 3.0348748417303826e-06,
      "loss": 0.7003,
      "step": 104
    },
    {
      "epoch": 2.0063694267515926,
      "grad_norm": 0.6410728096961975,
      "learning_rate": 2.932207475167398e-06,
      "loss": 0.6645,
      "step": 105
    },
    {
      "epoch": 2.0254777070063694,
      "grad_norm": 0.6502147912979126,
      "learning_rate": 2.83058130441221e-06,
      "loss": 0.6352,
      "step": 106
    },
    {
      "epoch": 2.0445859872611467,
      "grad_norm": 0.690471351146698,
      "learning_rate": 2.7300475013022666e-06,
      "loss": 0.6252,
      "step": 107
    },
    {
      "epoch": 2.0636942675159236,
      "grad_norm": 0.6230637431144714,
      "learning_rate": 2.6306566876350072e-06,
      "loss": 0.6184,
      "step": 108
    },
    {
      "epoch": 2.082802547770701,
      "grad_norm": 0.6241819262504578,
      "learning_rate": 2.532458909678266e-06,
      "loss": 0.5941,
      "step": 109
    },
    {
      "epoch": 2.1019108280254777,
      "grad_norm": 0.6172868013381958,
      "learning_rate": 2.43550361297047e-06,
      "loss": 0.6202,
      "step": 110
    },
    {
      "epoch": 2.121019108280255,
      "grad_norm": 0.6882143616676331,
      "learning_rate": 2.339839617423318e-06,
      "loss": 0.6181,
      "step": 111
    },
    {
      "epoch": 2.140127388535032,
      "grad_norm": 0.678433358669281,
      "learning_rate": 2.245515092739488e-06,
      "loss": 0.6061,
      "step": 112
    },
    {
      "epoch": 2.159235668789809,
      "grad_norm": 0.6652299165725708,
      "learning_rate": 2.1525775341577404e-06,
      "loss": 0.6266,
      "step": 113
    },
    {
      "epoch": 2.178343949044586,
      "grad_norm": 0.6305775046348572,
      "learning_rate": 2.061073738537635e-06,
      "loss": 0.6136,
      "step": 114
    },
    {
      "epoch": 2.1974522292993632,
      "grad_norm": 0.6421285271644592,
      "learning_rate": 1.971049780795901e-06,
      "loss": 0.5996,
      "step": 115
    },
    {
      "epoch": 2.21656050955414,
      "grad_norm": 0.5938356518745422,
      "learning_rate": 1.8825509907063328e-06,
      "loss": 0.6067,
      "step": 116
    },
    {
      "epoch": 2.2356687898089174,
      "grad_norm": 0.6172759532928467,
      "learning_rate": 1.7956219300748796e-06,
      "loss": 0.6304,
      "step": 117
    },
    {
      "epoch": 2.254777070063694,
      "grad_norm": 0.6095340251922607,
      "learning_rate": 1.7103063703014372e-06,
      "loss": 0.6251,
      "step": 118
    },
    {
      "epoch": 2.2738853503184715,
      "grad_norm": 0.6015990376472473,
      "learning_rate": 1.6266472703396286e-06,
      "loss": 0.6078,
      "step": 119
    },
    {
      "epoch": 2.2929936305732483,
      "grad_norm": 0.562563419342041,
      "learning_rate": 1.544686755065677e-06,
      "loss": 0.6215,
      "step": 120
    },
    {
      "epoch": 2.3121019108280256,
      "grad_norm": 0.6117820739746094,
      "learning_rate": 1.4644660940672628e-06,
      "loss": 0.6161,
      "step": 121
    },
    {
      "epoch": 2.3312101910828025,
      "grad_norm": 0.6286083459854126,
      "learning_rate": 1.3860256808630429e-06,
      "loss": 0.6061,
      "step": 122
    },
    {
      "epoch": 2.3503184713375798,
      "grad_norm": 0.6039571166038513,
      "learning_rate": 1.3094050125632973e-06,
      "loss": 0.6138,
      "step": 123
    },
    {
      "epoch": 2.3694267515923566,
      "grad_norm": 0.5991165637969971,
      "learning_rate": 1.234642669981946e-06,
      "loss": 0.609,
      "step": 124
    },
    {
      "epoch": 2.388535031847134,
      "grad_norm": 0.5799904465675354,
      "learning_rate": 1.1617762982099446e-06,
      "loss": 0.5998,
      "step": 125
    },
    {
      "epoch": 2.4076433121019107,
      "grad_norm": 0.5716231465339661,
      "learning_rate": 1.0908425876598512e-06,
      "loss": 0.6081,
      "step": 126
    },
    {
      "epoch": 2.426751592356688,
      "grad_norm": 0.6321515440940857,
      "learning_rate": 1.0218772555910955e-06,
      "loss": 0.6126,
      "step": 127
    },
    {
      "epoch": 2.445859872611465,
      "grad_norm": 0.6207079887390137,
      "learning_rate": 9.549150281252633e-07,
      "loss": 0.6433,
      "step": 128
    },
    {
      "epoch": 2.464968152866242,
      "grad_norm": 0.5693238973617554,
      "learning_rate": 8.899896227604509e-07,
      "loss": 0.6239,
      "step": 129
    },
    {
      "epoch": 2.484076433121019,
      "grad_norm": 0.6268942952156067,
      "learning_rate": 8.271337313934869e-07,
      "loss": 0.6018,
      "step": 130
    },
    {
      "epoch": 2.5031847133757963,
      "grad_norm": 0.5692458152770996,
      "learning_rate": 7.663790038585794e-07,
      "loss": 0.6089,
      "step": 131
    },
    {
      "epoch": 2.522292993630573,
      "grad_norm": 0.5981783270835876,
      "learning_rate": 7.077560319906696e-07,
      "loss": 0.6138,
      "step": 132
    },
    {
      "epoch": 2.5414012738853504,
      "grad_norm": 0.5806254148483276,
      "learning_rate": 6.512943342215234e-07,
      "loss": 0.6205,
      "step": 133
    },
    {
      "epoch": 2.5605095541401273,
      "grad_norm": 0.6364081501960754,
      "learning_rate": 5.9702234071631e-07,
      "loss": 0.6085,
      "step": 134
    },
    {
      "epoch": 2.5796178343949046,
      "grad_norm": 0.5379471182823181,
      "learning_rate": 5.449673790581611e-07,
      "loss": 0.6078,
      "step": 135
    },
    {
      "epoch": 2.5987261146496814,
      "grad_norm": 0.5562213063240051,
      "learning_rate": 4.951556604879049e-07,
      "loss": 0.6079,
      "step": 136
    },
    {
      "epoch": 2.6178343949044587,
      "grad_norm": 0.5902286171913147,
      "learning_rate": 4.4761226670592074e-07,
      "loss": 0.6007,
      "step": 137
    },
    {
      "epoch": 2.6369426751592355,
      "grad_norm": 0.6162657141685486,
      "learning_rate": 4.0236113724274716e-07,
      "loss": 0.5939,
      "step": 138
    },
    {
      "epoch": 2.656050955414013,
      "grad_norm": 0.5354500412940979,
      "learning_rate": 3.5942505740480583e-07,
      "loss": 0.6016,
      "step": 139
    },
    {
      "epoch": 2.6751592356687897,
      "grad_norm": 0.5618153214454651,
      "learning_rate": 3.18825646801314e-07,
      "loss": 0.5971,
      "step": 140
    },
    {
      "epoch": 2.694267515923567,
      "grad_norm": 0.5426648259162903,
      "learning_rate": 2.8058334845816214e-07,
      "loss": 0.5973,
      "step": 141
    },
    {
      "epoch": 2.713375796178344,
      "grad_norm": 0.5665692687034607,
      "learning_rate": 2.447174185242324e-07,
      "loss": 0.6194,
      "step": 142
    },
    {
      "epoch": 2.732484076433121,
      "grad_norm": 0.556145191192627,
      "learning_rate": 2.1124591657534776e-07,
      "loss": 0.6287,
      "step": 143
    },
    {
      "epoch": 2.7515923566878984,
      "grad_norm": 0.5904255509376526,
      "learning_rate": 1.801856965207338e-07,
      "loss": 0.6378,
      "step": 144
    },
    {
      "epoch": 2.770700636942675,
      "grad_norm": 0.5670586824417114,
      "learning_rate": 1.5155239811656562e-07,
      "loss": 0.6061,
      "step": 145
    },
    {
      "epoch": 2.789808917197452,
      "grad_norm": 0.6461930274963379,
      "learning_rate": 1.253604390908819e-07,
      "loss": 0.5864,
      "step": 146
    },
    {
      "epoch": 2.8089171974522293,
      "grad_norm": 0.5549110174179077,
      "learning_rate": 1.0162300788382263e-07,
      "loss": 0.614,
      "step": 147
    },
    {
      "epoch": 2.8280254777070066,
      "grad_norm": 0.5617705583572388,
      "learning_rate": 8.035205700685167e-08,
      "loss": 0.6184,
      "step": 148
    },
    {
      "epoch": 2.8471337579617835,
      "grad_norm": 0.5576688051223755,
      "learning_rate": 6.15582970243117e-08,
      "loss": 0.6153,
      "step": 149
    },
    {
      "epoch": 2.8662420382165603,
      "grad_norm": 0.5535495281219482,
      "learning_rate": 4.52511911603265e-08,
      "loss": 0.6011,
      "step": 150
    },
    {
      "epoch": 2.8853503184713376,
      "grad_norm": 0.5694997906684875,
      "learning_rate": 3.143895053378698e-08,
      "loss": 0.6115,
      "step": 151
    },
    {
      "epoch": 2.904458598726115,
      "grad_norm": 0.5671418309211731,
      "learning_rate": 2.012853002380466e-08,
      "loss": 0.5982,
      "step": 152
    },
    {
      "epoch": 2.9235668789808917,
      "grad_norm": 0.582301676273346,
      "learning_rate": 1.132562476771959e-08,
      "loss": 0.6132,
      "step": 153
    },
    {
      "epoch": 2.9426751592356686,
      "grad_norm": 0.5656945705413818,
      "learning_rate": 5.034667293427053e-09,
      "loss": 0.5871,
      "step": 154
    },
    {
      "epoch": 2.961783439490446,
      "grad_norm": 0.570439875125885,
      "learning_rate": 1.2588252874673469e-09,
      "loss": 0.6125,
      "step": 155
    },
    {
      "epoch": 2.980891719745223,
      "grad_norm": 0.5605185627937317,
      "learning_rate": 0.0,
      "loss": 0.5954,
      "step": 156
    },
    {
      "epoch": 2.980891719745223,
      "step": 156,
      "total_flos": 1.266291131972649e+17,
      "train_loss": 0.7465221996490772,
      "train_runtime": 3657.1071,
      "train_samples_per_second": 4.102,
      "train_steps_per_second": 0.043
    }
  ],
  "logging_steps": 1,
  "max_steps": 156,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.266291131972649e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|