ryanmarten committed on
Commit
71ccd97
·
verified ·
1 Parent(s): 2ca9258

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -1
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +439 -439
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: am_100k
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # am_100k
17
 
18
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
19
 
20
  ## Model description
21
 
 
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: am_100k
 
16
 
17
  # am_100k
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/am_100k dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.9655172413793105,
3
- "total_flos": 4.786232862271603e+18,
4
- "train_loss": 0.0,
5
- "train_runtime": 15.6924,
6
- "train_samples_per_second": 5912.423,
7
- "train_steps_per_second": 11.471
8
  }
 
1
  {
2
+ "epoch": 4.9862068965517246,
3
+ "total_flos": 4.652519618982707e+18,
4
+ "train_loss": 0.20232847813102933,
5
+ "train_runtime": 7864.7522,
6
+ "train_samples_per_second": 11.797,
7
+ "train_steps_per_second": 0.023
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.9655172413793105,
3
- "total_flos": 4.786232862271603e+18,
4
- "train_loss": 0.0,
5
- "train_runtime": 15.6924,
6
- "train_samples_per_second": 5912.423,
7
- "train_steps_per_second": 11.471
8
  }
 
1
  {
2
+ "epoch": 4.9862068965517246,
3
+ "total_flos": 4.652519618982707e+18,
4
+ "train_loss": 0.20232847813102933,
5
+ "train_runtime": 7864.7522,
6
+ "train_samples_per_second": 11.797,
7
+ "train_steps_per_second": 0.023
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.9655172413793105,
5
  "eval_steps": 500,
6
  "global_step": 180,
7
  "is_hyper_param_search": false,
@@ -10,1272 +10,1272 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.027586206896551724,
13
- "grad_norm": 6.048244184358018,
14
  "learning_rate": 4.444444444444444e-06,
15
- "loss": 1.1114,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.05517241379310345,
20
- "grad_norm": 6.128523961574793,
21
  "learning_rate": 8.888888888888888e-06,
22
- "loss": 1.1182,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.08275862068965517,
27
- "grad_norm": 4.585031907245013,
28
  "learning_rate": 1.3333333333333333e-05,
29
- "loss": 1.0608,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.1103448275862069,
34
- "grad_norm": 2.0818255535512282,
35
  "learning_rate": 1.7777777777777777e-05,
36
- "loss": 0.9821,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.13793103448275862,
41
- "grad_norm": 5.670082204508566,
42
  "learning_rate": 2.2222222222222227e-05,
43
- "loss": 1.0184,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.16551724137931034,
48
- "grad_norm": 9.860654060648336,
49
  "learning_rate": 2.6666666666666667e-05,
50
- "loss": 1.0618,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.19310344827586207,
55
- "grad_norm": 5.84868479150573,
56
  "learning_rate": 3.111111111111112e-05,
57
- "loss": 0.994,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.2206896551724138,
62
- "grad_norm": 4.241883231288793,
63
  "learning_rate": 3.555555555555555e-05,
64
- "loss": 0.9391,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.2482758620689655,
69
- "grad_norm": 2.9377843777009893,
70
  "learning_rate": 4e-05,
71
- "loss": 0.9055,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.27586206896551724,
76
- "grad_norm": 2.4287532619102503,
77
  "learning_rate": 4.444444444444445e-05,
78
- "loss": 0.8671,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.30344827586206896,
83
- "grad_norm": 1.7709039212383275,
84
  "learning_rate": 4.88888888888889e-05,
85
- "loss": 0.849,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.3310344827586207,
90
- "grad_norm": 2.1766387830586047,
91
  "learning_rate": 5.333333333333333e-05,
92
- "loss": 0.8309,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.3586206896551724,
97
- "grad_norm": 1.4949626741459447,
98
  "learning_rate": 5.777777777777778e-05,
99
- "loss": 0.8201,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.38620689655172413,
104
- "grad_norm": 1.4905453601642287,
105
  "learning_rate": 6.222222222222223e-05,
106
- "loss": 0.8169,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.41379310344827586,
111
- "grad_norm": 2.259033985306724,
112
  "learning_rate": 6.666666666666667e-05,
113
- "loss": 0.8024,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.4413793103448276,
118
- "grad_norm": 1.4336986320826892,
119
  "learning_rate": 7.11111111111111e-05,
120
- "loss": 0.7912,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.4689655172413793,
125
- "grad_norm": 2.041826959920012,
126
  "learning_rate": 7.555555555555556e-05,
127
- "loss": 0.8067,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.496551724137931,
132
- "grad_norm": 1.0666780280290322,
133
  "learning_rate": 8e-05,
134
- "loss": 0.7786,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.5241379310344828,
139
- "grad_norm": 1.848278725508881,
140
  "learning_rate": 7.999247881794007e-05,
141
- "loss": 0.7714,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.5517241379310345,
146
- "grad_norm": 1.6888595634675432,
147
  "learning_rate": 7.996991810016922e-05,
148
- "loss": 0.7813,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.5793103448275863,
153
- "grad_norm": 1.3711841885480445,
154
  "learning_rate": 7.993232633085074e-05,
155
- "loss": 0.7705,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.6068965517241379,
160
- "grad_norm": 1.2852118365537728,
161
  "learning_rate": 7.987971764671168e-05,
162
- "loss": 0.7692,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.6344827586206897,
167
- "grad_norm": 0.958091141602523,
168
  "learning_rate": 7.981211183172663e-05,
169
- "loss": 0.7438,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.6620689655172414,
174
- "grad_norm": 2.1312896971048456,
175
  "learning_rate": 7.972953430967773e-05,
176
- "loss": 0.7752,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.6896551724137931,
181
- "grad_norm": 1.2512484954013028,
182
  "learning_rate": 7.963201613459381e-05,
183
- "loss": 0.7602,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.7172413793103448,
188
- "grad_norm": 1.965820846747513,
189
  "learning_rate": 7.951959397907237e-05,
190
- "loss": 0.763,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.7448275862068966,
195
- "grad_norm": 1.2536305988752512,
196
  "learning_rate": 7.939231012048833e-05,
197
- "loss": 0.7582,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.7724137931034483,
202
- "grad_norm": 1.4877796163409278,
203
  "learning_rate": 7.925021242509539e-05,
204
- "loss": 0.7556,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.8,
209
- "grad_norm": 1.2280174897561162,
210
  "learning_rate": 7.909335433002543e-05,
211
- "loss": 0.7392,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.8275862068965517,
216
- "grad_norm": 1.262213496725281,
217
  "learning_rate": 7.892179482319297e-05,
218
- "loss": 0.7275,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.8551724137931035,
223
- "grad_norm": 1.2461285461495126,
224
  "learning_rate": 7.873559842111225e-05,
225
- "loss": 0.7332,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.8827586206896552,
230
- "grad_norm": 1.0437544355721975,
231
  "learning_rate": 7.853483514463521e-05,
232
- "loss": 0.7367,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.9103448275862069,
237
- "grad_norm": 1.284142636022308,
238
  "learning_rate": 7.831958049261956e-05,
239
- "loss": 0.7192,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.9379310344827586,
244
- "grad_norm": 1.0978222371570605,
245
  "learning_rate": 7.808991541353662e-05,
246
- "loss": 0.7199,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.9655172413793104,
251
- "grad_norm": 1.228062924424424,
252
  "learning_rate": 7.784592627503004e-05,
253
- "loss": 0.7144,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.993103448275862,
258
- "grad_norm": 1.1498514242934654,
259
  "learning_rate": 7.758770483143634e-05,
260
- "loss": 0.705,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 1.0206896551724138,
265
- "grad_norm": 1.8774846941468446,
266
  "learning_rate": 7.731534818928004e-05,
267
- "loss": 1.235,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 1.0482758620689656,
272
- "grad_norm": 0.9987161382234897,
273
  "learning_rate": 7.702895877075563e-05,
274
- "loss": 0.6918,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 1.0758620689655172,
279
- "grad_norm": 1.4270636334035107,
280
  "learning_rate": 7.672864427521097e-05,
281
- "loss": 0.6933,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 1.103448275862069,
286
- "grad_norm": 0.9251969227415783,
287
  "learning_rate": 7.641451763864587e-05,
288
- "loss": 0.6738,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 1.1310344827586207,
293
- "grad_norm": 1.1160409717405146,
294
  "learning_rate": 7.608669699124153e-05,
295
- "loss": 0.6853,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 1.1586206896551725,
300
- "grad_norm": 1.0949147674177735,
301
  "learning_rate": 7.57453056129365e-05,
302
- "loss": 0.6681,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 1.186206896551724,
307
- "grad_norm": 0.9886798320746403,
308
  "learning_rate": 7.539047188706631e-05,
309
- "loss": 0.6729,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 1.2137931034482758,
314
- "grad_norm": 1.0095530760015006,
315
  "learning_rate": 7.502232925208365e-05,
316
- "loss": 0.6566,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 1.2413793103448276,
321
- "grad_norm": 0.9176565519096451,
322
  "learning_rate": 7.464101615137756e-05,
323
- "loss": 0.6682,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 1.2689655172413792,
328
- "grad_norm": 0.8515083463293015,
329
  "learning_rate": 7.424667598121067e-05,
330
- "loss": 0.6642,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 1.296551724137931,
335
- "grad_norm": 0.7912812847060907,
336
  "learning_rate": 7.383945703679365e-05,
337
- "loss": 0.6745,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 1.3241379310344827,
342
- "grad_norm": 0.5034884537363618,
343
  "learning_rate": 7.341951245651747e-05,
344
- "loss": 0.6474,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 1.3517241379310345,
349
- "grad_norm": 0.7432028876919811,
350
  "learning_rate": 7.298700016436427e-05,
351
- "loss": 0.6652,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 1.3793103448275863,
356
- "grad_norm": 0.6046442942870782,
357
  "learning_rate": 7.254208281051871e-05,
358
- "loss": 0.6583,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 1.4068965517241379,
363
- "grad_norm": 0.6714883350584058,
364
  "learning_rate": 7.208492771020176e-05,
365
- "loss": 0.6547,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 1.4344827586206896,
370
- "grad_norm": 0.769077486908158,
371
  "learning_rate": 7.161570678075038e-05,
372
- "loss": 0.6403,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 1.4620689655172414,
377
- "grad_norm": 0.5063852054295792,
378
  "learning_rate": 7.113459647696641e-05,
379
- "loss": 0.6454,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 1.489655172413793,
384
- "grad_norm": 0.3258263703520632,
385
  "learning_rate": 7.064177772475912e-05,
386
- "loss": 0.6482,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 1.5172413793103448,
391
- "grad_norm": 0.40259270386783697,
392
  "learning_rate": 7.013743585310642e-05,
393
- "loss": 0.6469,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 1.5448275862068965,
398
- "grad_norm": 0.5331097141521068,
399
  "learning_rate": 6.96217605243602e-05,
400
- "loss": 0.6524,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 1.5724137931034483,
405
- "grad_norm": 0.5121831997934545,
406
  "learning_rate": 6.909494566292195e-05,
407
- "loss": 0.651,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 1.6,
412
- "grad_norm": 0.31535600252855955,
413
  "learning_rate": 6.855718938231597e-05,
414
- "loss": 0.6374,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 1.6275862068965519,
419
- "grad_norm": 0.30169608577322565,
420
  "learning_rate": 6.800869391068674e-05,
421
- "loss": 0.635,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 1.6551724137931034,
426
- "grad_norm": 0.30049400766976586,
427
  "learning_rate": 6.744966551474936e-05,
428
- "loss": 0.6415,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 1.6827586206896552,
433
- "grad_norm": 0.4209614683699927,
434
  "learning_rate": 6.688031442222091e-05,
435
- "loss": 0.6386,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 1.7103448275862068,
440
- "grad_norm": 0.46464000543744954,
441
  "learning_rate": 6.630085474276256e-05,
442
- "loss": 0.6428,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 1.7379310344827585,
447
- "grad_norm": 0.41360907325359436,
448
  "learning_rate": 6.571150438746157e-05,
449
- "loss": 0.6443,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 1.7655172413793103,
454
- "grad_norm": 0.3825819326128904,
455
  "learning_rate": 6.511248498688396e-05,
456
- "loss": 0.634,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 1.793103448275862,
461
- "grad_norm": 0.38488753974210554,
462
  "learning_rate": 6.450402180772811e-05,
463
- "loss": 0.6351,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 1.8206896551724139,
468
- "grad_norm": 0.3884436448875785,
469
  "learning_rate": 6.388634366811146e-05,
470
- "loss": 0.64,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 1.8482758620689657,
475
- "grad_norm": 0.32102606962069763,
476
  "learning_rate": 6.325968285152107e-05,
477
- "loss": 0.6348,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 1.8758620689655172,
482
- "grad_norm": 0.3685097552173206,
483
  "learning_rate": 6.262427501946155e-05,
484
- "loss": 0.6379,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 1.903448275862069,
489
- "grad_norm": 0.5030548934456391,
490
  "learning_rate": 6.198035912283225e-05,
491
- "loss": 0.6403,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 1.9310344827586206,
496
- "grad_norm": 0.7115229105133056,
497
  "learning_rate": 6.132817731206766e-05,
498
- "loss": 0.6361,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 1.9586206896551723,
503
- "grad_norm": 0.9085995602583713,
504
  "learning_rate": 6.0667974846074524e-05,
505
- "loss": 0.6439,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 1.986206896551724,
510
- "grad_norm": 1.221676208452332,
511
  "learning_rate": 6.000000000000001e-05,
512
- "loss": 0.6501,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 2.013793103448276,
517
- "grad_norm": 1.098962915073864,
518
  "learning_rate": 5.9324503971865545e-05,
519
- "loss": 1.0766,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 2.0413793103448277,
524
- "grad_norm": 1.1139862963993739,
525
  "learning_rate": 5.8641740788101566e-05,
526
- "loss": 0.5821,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 2.0689655172413794,
531
- "grad_norm": 0.7926504196483102,
532
  "learning_rate": 5.79519672080185e-05,
533
- "loss": 0.5814,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 2.0965517241379312,
538
- "grad_norm": 0.4836752190121402,
539
  "learning_rate": 5.7255442627250146e-05,
540
- "loss": 0.5691,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 2.1241379310344826,
545
- "grad_norm": 0.6231872682864155,
546
  "learning_rate": 5.6552428980205575e-05,
547
- "loss": 0.5777,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 2.1517241379310343,
552
- "grad_norm": 0.8504947761681395,
553
  "learning_rate": 5.584319064156628e-05,
554
- "loss": 0.5808,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 2.179310344827586,
559
- "grad_norm": 0.7664865985172806,
560
  "learning_rate": 5.5127994326865706e-05,
561
- "loss": 0.5721,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 2.206896551724138,
566
- "grad_norm": 0.45009821427217384,
567
  "learning_rate": 5.440710899218842e-05,
568
- "loss": 0.5885,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 2.2344827586206897,
573
- "grad_norm": 0.5295669544206002,
574
  "learning_rate": 5.368080573302676e-05,
575
- "loss": 0.5784,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 2.2620689655172415,
580
- "grad_norm": 0.6069545788118131,
581
  "learning_rate": 5.294935768233285e-05,
582
- "loss": 0.5848,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 2.2896551724137932,
587
- "grad_norm": 0.3723173407781202,
588
  "learning_rate": 5.2213039907804535e-05,
589
- "loss": 0.5706,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 2.317241379310345,
594
- "grad_norm": 0.36467131449654966,
595
  "learning_rate": 5.1472129308443616e-05,
596
- "loss": 0.5788,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 2.344827586206897,
601
- "grad_norm": 0.43918512507379004,
602
  "learning_rate": 5.07269045104255e-05,
603
- "loss": 0.5779,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 2.372413793103448,
608
- "grad_norm": 0.29804139927079065,
609
  "learning_rate": 4.9977645762319255e-05,
610
- "loss": 0.5826,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 2.4,
615
- "grad_norm": 0.3617685923192836,
616
  "learning_rate": 4.922463482969761e-05,
617
- "loss": 0.5743,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 2.4275862068965517,
622
- "grad_norm": 0.3958418246469877,
623
  "learning_rate": 4.846815488917644e-05,
624
- "loss": 0.5667,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 2.4551724137931035,
629
- "grad_norm": 0.3481445885537368,
630
  "learning_rate": 4.7708490421923596e-05,
631
- "loss": 0.5708,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 2.4827586206896552,
636
- "grad_norm": 0.3425604141403272,
637
  "learning_rate": 4.694592710667723e-05,
638
- "loss": 0.5688,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 2.510344827586207,
643
- "grad_norm": 0.40177734001098314,
644
  "learning_rate": 4.618075171231363e-05,
645
- "loss": 0.5669,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 2.5379310344827584,
650
- "grad_norm": 0.29099701833530267,
651
  "learning_rate": 4.541325199000525e-05,
652
- "loss": 0.5754,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 2.56551724137931,
657
- "grad_norm": 0.26856519536575985,
658
  "learning_rate": 4.464371656500921e-05,
659
- "loss": 0.5722,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 2.593103448275862,
664
- "grad_norm": 0.3672905231776436,
665
  "learning_rate": 4.387243482812717e-05,
666
- "loss": 0.5739,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 2.6206896551724137,
671
- "grad_norm": 0.21317736512877272,
672
  "learning_rate": 4.309969682687724e-05,
673
- "loss": 0.5613,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 2.6482758620689655,
678
- "grad_norm": 0.2621968123513734,
679
  "learning_rate": 4.2325793156419035e-05,
680
- "loss": 0.5722,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 2.6758620689655173,
685
- "grad_norm": 0.3220289394960516,
686
  "learning_rate": 4.155101485027268e-05,
687
- "loss": 0.564,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 2.703448275862069,
692
- "grad_norm": 0.20667724871975712,
693
  "learning_rate": 4.077565327087298e-05,
694
- "loss": 0.5733,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 2.731034482758621,
699
- "grad_norm": 0.270131170253454,
700
  "learning_rate": 4e-05,
701
- "loss": 0.5749,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 2.7586206896551726,
706
- "grad_norm": 0.24605123035244134,
707
  "learning_rate": 3.9224346729127034e-05,
708
- "loss": 0.5659,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 2.7862068965517244,
713
- "grad_norm": 0.1952924137800316,
714
  "learning_rate": 3.844898514972733e-05,
715
- "loss": 0.5668,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 2.8137931034482757,
720
- "grad_norm": 0.2361123059496943,
721
  "learning_rate": 3.767420684358097e-05,
722
- "loss": 0.5702,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 2.8413793103448275,
727
- "grad_norm": 0.18467799589256295,
728
  "learning_rate": 3.690030317312277e-05,
729
- "loss": 0.568,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 2.8689655172413793,
734
- "grad_norm": 0.242288989184374,
735
  "learning_rate": 3.612756517187284e-05,
736
- "loss": 0.5724,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 2.896551724137931,
741
- "grad_norm": 0.21175011904492963,
742
  "learning_rate": 3.535628343499079e-05,
743
- "loss": 0.5778,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 2.924137931034483,
748
- "grad_norm": 0.20379198813949495,
749
  "learning_rate": 3.458674800999477e-05,
750
- "loss": 0.5775,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 2.9517241379310346,
755
- "grad_norm": 0.20271740174715297,
756
  "learning_rate": 3.3819248287686386e-05,
757
- "loss": 0.5726,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 2.979310344827586,
762
- "grad_norm": 0.20729414058393245,
763
  "learning_rate": 3.305407289332279e-05,
764
- "loss": 0.5604,
765
  "step": 108
766
  },
767
  {
768
- "epoch": 3.0068965517241377,
769
- "grad_norm": 0.3076386520316924,
770
  "learning_rate": 3.229150957807641e-05,
771
- "loss": 1.0015,
772
  "step": 109
773
  },
774
  {
775
- "epoch": 3.0344827586206895,
776
- "grad_norm": 0.3249950011947247,
777
  "learning_rate": 3.153184511082359e-05,
778
- "loss": 0.5213,
779
  "step": 110
780
  },
781
  {
782
- "epoch": 3.0620689655172413,
783
- "grad_norm": 0.19776597344993513,
784
  "learning_rate": 3.07753651703024e-05,
785
- "loss": 0.52,
786
  "step": 111
787
  },
788
  {
789
- "epoch": 3.089655172413793,
790
- "grad_norm": 0.23949243914948978,
791
  "learning_rate": 3.0022354237680752e-05,
792
- "loss": 0.5251,
793
  "step": 112
794
  },
795
  {
796
- "epoch": 3.117241379310345,
797
- "grad_norm": 0.24078394703769124,
798
  "learning_rate": 2.9273095489574502e-05,
799
- "loss": 0.5295,
800
  "step": 113
801
  },
802
  {
803
- "epoch": 3.1448275862068966,
804
- "grad_norm": 0.18480805575851064,
805
  "learning_rate": 2.8527870691556404e-05,
806
- "loss": 0.5166,
807
  "step": 114
808
  },
809
  {
810
- "epoch": 3.1724137931034484,
811
- "grad_norm": 0.23503902506452476,
812
  "learning_rate": 2.778696009219548e-05,
813
- "loss": 0.5156,
814
  "step": 115
815
  },
816
  {
817
- "epoch": 3.2,
818
- "grad_norm": 0.18487291082471574,
819
  "learning_rate": 2.7050642317667164e-05,
820
- "loss": 0.5164,
821
  "step": 116
822
  },
823
  {
824
- "epoch": 3.227586206896552,
825
- "grad_norm": 0.2422568087782523,
826
  "learning_rate": 2.6319194266973256e-05,
827
- "loss": 0.5168,
828
  "step": 117
829
  },
830
  {
831
- "epoch": 3.2551724137931033,
832
- "grad_norm": 0.1902265878253806,
833
  "learning_rate": 2.5592891007811594e-05,
834
- "loss": 0.5299,
835
  "step": 118
836
  },
837
  {
838
- "epoch": 3.282758620689655,
839
- "grad_norm": 0.2214456953218186,
840
  "learning_rate": 2.4872005673134307e-05,
841
- "loss": 0.5182,
842
  "step": 119
843
  },
844
  {
845
- "epoch": 3.310344827586207,
846
- "grad_norm": 0.17984634469297897,
847
  "learning_rate": 2.4156809358433728e-05,
848
- "loss": 0.5113,
849
  "step": 120
850
  },
851
  {
852
- "epoch": 3.3379310344827586,
853
- "grad_norm": 0.21875998472911068,
854
  "learning_rate": 2.3447571019794438e-05,
855
- "loss": 0.5169,
856
  "step": 121
857
  },
858
  {
859
- "epoch": 3.3655172413793104,
860
- "grad_norm": 0.16326758193343743,
861
  "learning_rate": 2.274455737274987e-05,
862
- "loss": 0.522,
863
  "step": 122
864
  },
865
  {
866
- "epoch": 3.393103448275862,
867
- "grad_norm": 0.2429670769160995,
868
  "learning_rate": 2.2048032791981515e-05,
869
- "loss": 0.5344,
870
  "step": 123
871
  },
872
  {
873
- "epoch": 3.420689655172414,
874
- "grad_norm": 0.14437853786734225,
875
  "learning_rate": 2.135825921189846e-05,
876
- "loss": 0.5163,
877
  "step": 124
878
  },
879
  {
880
- "epoch": 3.4482758620689653,
881
- "grad_norm": 0.21797343034175723,
882
  "learning_rate": 2.067549602813446e-05,
883
  "loss": 0.5227,
884
  "step": 125
885
  },
886
  {
887
- "epoch": 3.475862068965517,
888
- "grad_norm": 0.16171061969635037,
889
  "learning_rate": 2.0000000000000012e-05,
890
- "loss": 0.5235,
891
  "step": 126
892
  },
893
  {
894
- "epoch": 3.503448275862069,
895
- "grad_norm": 0.17879521605019197,
896
  "learning_rate": 1.9332025153925486e-05,
897
- "loss": 0.5239,
898
  "step": 127
899
  },
900
  {
901
- "epoch": 3.5310344827586206,
902
- "grad_norm": 0.17694265501733974,
903
  "learning_rate": 1.867182268793236e-05,
904
- "loss": 0.5179,
905
  "step": 128
906
  },
907
  {
908
- "epoch": 3.5586206896551724,
909
- "grad_norm": 0.15579594604420435,
910
  "learning_rate": 1.8019640877167763e-05,
911
- "loss": 0.5246,
912
  "step": 129
913
  },
914
  {
915
- "epoch": 3.586206896551724,
916
- "grad_norm": 0.17610048435328357,
917
  "learning_rate": 1.7375724980538465e-05,
918
- "loss": 0.5136,
919
  "step": 130
920
  },
921
  {
922
- "epoch": 3.613793103448276,
923
- "grad_norm": 0.11425556810511402,
924
  "learning_rate": 1.6740317148478932e-05,
925
- "loss": 0.5164,
926
  "step": 131
927
  },
928
  {
929
- "epoch": 3.6413793103448278,
930
- "grad_norm": 0.1635419637304657,
931
  "learning_rate": 1.6113656331888563e-05,
932
- "loss": 0.512,
933
  "step": 132
934
  },
935
  {
936
- "epoch": 3.6689655172413795,
937
- "grad_norm": 0.11963505370102022,
938
  "learning_rate": 1.5495978192271887e-05,
939
- "loss": 0.5183,
940
  "step": 133
941
  },
942
  {
943
- "epoch": 3.696551724137931,
944
- "grad_norm": 0.12051650165728112,
945
  "learning_rate": 1.4887515013116067e-05,
946
- "loss": 0.5269,
947
  "step": 134
948
  },
949
  {
950
- "epoch": 3.7241379310344827,
951
- "grad_norm": 0.13753843204851934,
952
  "learning_rate": 1.4288495612538427e-05,
953
- "loss": 0.5248,
954
  "step": 135
955
  },
956
  {
957
- "epoch": 3.7517241379310344,
958
- "grad_norm": 0.11829012922751402,
959
  "learning_rate": 1.369914525723746e-05,
960
- "loss": 0.5252,
961
  "step": 136
962
  },
963
  {
964
- "epoch": 3.779310344827586,
965
- "grad_norm": 0.13693979706276124,
966
  "learning_rate": 1.3119685577779105e-05,
967
- "loss": 0.5172,
968
  "step": 137
969
  },
970
  {
971
- "epoch": 3.806896551724138,
972
- "grad_norm": 0.11476776048951762,
973
  "learning_rate": 1.2550334485250661e-05,
974
- "loss": 0.5273,
975
  "step": 138
976
  },
977
  {
978
- "epoch": 3.8344827586206898,
979
- "grad_norm": 0.10282974047324488,
980
  "learning_rate": 1.1991306089313261e-05,
981
- "loss": 0.5131,
982
  "step": 139
983
  },
984
  {
985
- "epoch": 3.862068965517241,
986
- "grad_norm": 0.12768590859383086,
987
  "learning_rate": 1.1442810617684046e-05,
988
- "loss": 0.5165,
989
  "step": 140
990
  },
991
  {
992
- "epoch": 3.889655172413793,
993
- "grad_norm": 0.10849675393238085,
994
  "learning_rate": 1.0905054337078051e-05,
995
- "loss": 0.5183,
996
  "step": 141
997
  },
998
  {
999
- "epoch": 3.9172413793103447,
1000
- "grad_norm": 0.10656182230683149,
1001
  "learning_rate": 1.0378239475639823e-05,
1002
- "loss": 0.5242,
1003
  "step": 142
1004
  },
1005
  {
1006
- "epoch": 3.9448275862068964,
1007
- "grad_norm": 0.11589471940857303,
1008
  "learning_rate": 9.862564146893571e-06,
1009
- "loss": 0.5241,
1010
  "step": 143
1011
  },
1012
  {
1013
- "epoch": 3.972413793103448,
1014
- "grad_norm": 0.09357600237822784,
1015
  "learning_rate": 9.358222275240884e-06,
1016
- "loss": 0.5187,
1017
  "step": 144
1018
  },
1019
  {
1020
- "epoch": 4.0,
1021
- "grad_norm": 0.2005801183502208,
1022
  "learning_rate": 8.8654035230336e-06,
1023
- "loss": 0.8967,
1024
  "step": 145
1025
  },
1026
  {
1027
- "epoch": 4.027586206896552,
1028
- "grad_norm": 0.14696406689886707,
1029
  "learning_rate": 8.384293219249633e-06,
1030
- "loss": 0.4983,
1031
  "step": 146
1032
  },
1033
  {
1034
- "epoch": 4.055172413793104,
1035
- "grad_norm": 0.11996489163720403,
1036
  "learning_rate": 7.915072289798247e-06,
1037
- "loss": 0.4848,
1038
  "step": 147
1039
  },
1040
  {
1041
- "epoch": 4.082758620689655,
1042
- "grad_norm": 0.11358461685367047,
1043
  "learning_rate": 7.457917189481301e-06,
1044
- "loss": 0.4872,
1045
  "step": 148
1046
  },
1047
  {
1048
- "epoch": 4.110344827586207,
1049
- "grad_norm": 0.11365275396741593,
1050
  "learning_rate": 7.0129998356357295e-06,
1051
- "loss": 0.4868,
1052
  "step": 149
1053
  },
1054
  {
1055
- "epoch": 4.137931034482759,
1056
- "grad_norm": 0.10882228461433846,
1057
  "learning_rate": 6.58048754348255e-06,
1058
- "loss": 0.4939,
1059
  "step": 150
1060
  },
1061
  {
1062
- "epoch": 4.165517241379311,
1063
- "grad_norm": 0.11095520891254505,
1064
  "learning_rate": 6.160542963206357e-06,
1065
- "loss": 0.488,
1066
  "step": 151
1067
  },
1068
  {
1069
- "epoch": 4.1931034482758625,
1070
- "grad_norm": 0.11999116105780049,
1071
  "learning_rate": 5.753324018789346e-06,
1072
- "loss": 0.5033,
1073
  "step": 152
1074
  },
1075
  {
1076
- "epoch": 4.220689655172414,
1077
- "grad_norm": 0.11645518370728632,
1078
  "learning_rate": 5.358983848622452e-06,
1079
- "loss": 0.4901,
1080
  "step": 153
1081
  },
1082
  {
1083
- "epoch": 4.248275862068965,
1084
- "grad_norm": 0.10736136122526953,
1085
  "learning_rate": 4.97767074791637e-06,
1086
- "loss": 0.4982,
1087
  "step": 154
1088
  },
1089
  {
1090
- "epoch": 4.275862068965517,
1091
- "grad_norm": 0.09879345526449772,
1092
  "learning_rate": 4.609528112933688e-06,
1093
- "loss": 0.4903,
1094
  "step": 155
1095
  },
1096
  {
1097
- "epoch": 4.303448275862069,
1098
- "grad_norm": 0.10011026403485264,
1099
  "learning_rate": 4.254694387063514e-06,
1100
- "loss": 0.4869,
1101
  "step": 156
1102
  },
1103
  {
1104
- "epoch": 4.3310344827586205,
1105
- "grad_norm": 0.10184449212963452,
1106
  "learning_rate": 3.913303008758491e-06,
1107
- "loss": 0.4925,
1108
  "step": 157
1109
  },
1110
  {
1111
- "epoch": 4.358620689655172,
1112
- "grad_norm": 0.10450583586781954,
1113
  "learning_rate": 3.585482361354138e-06,
1114
- "loss": 0.4989,
1115
  "step": 158
1116
  },
1117
  {
1118
- "epoch": 4.386206896551724,
1119
- "grad_norm": 0.0968156701618347,
1120
  "learning_rate": 3.2713557247890447e-06,
1121
- "loss": 0.499,
1122
  "step": 159
1123
  },
1124
  {
1125
- "epoch": 4.413793103448276,
1126
- "grad_norm": 0.09141538856548265,
1127
  "learning_rate": 2.9710412292443868e-06,
1128
- "loss": 0.4854,
1129
  "step": 160
1130
  },
1131
  {
1132
- "epoch": 4.441379310344828,
1133
- "grad_norm": 0.09480481626345198,
1134
  "learning_rate": 2.6846518107199782e-06,
1135
- "loss": 0.4937,
1136
  "step": 161
1137
  },
1138
  {
1139
- "epoch": 4.468965517241379,
1140
- "grad_norm": 0.09244475605472917,
1141
  "learning_rate": 2.4122951685636674e-06,
1142
- "loss": 0.4846,
1143
  "step": 162
1144
  },
1145
  {
1146
- "epoch": 4.496551724137931,
1147
- "grad_norm": 0.0955378295114542,
1148
  "learning_rate": 2.1540737249699893e-06,
1149
- "loss": 0.5049,
1150
  "step": 163
1151
  },
1152
  {
1153
- "epoch": 4.524137931034483,
1154
- "grad_norm": 0.09415683648753735,
1155
  "learning_rate": 1.9100845864633875e-06,
1156
- "loss": 0.4995,
1157
  "step": 164
1158
  },
1159
  {
1160
- "epoch": 4.551724137931035,
1161
- "grad_norm": 0.09256086502058866,
1162
  "learning_rate": 1.6804195073804442e-06,
1163
- "loss": 0.4887,
1164
  "step": 165
1165
  },
1166
  {
1167
- "epoch": 4.5793103448275865,
1168
- "grad_norm": 0.08984083528669612,
1169
  "learning_rate": 1.4651648553647869e-06,
1170
- "loss": 0.4971,
1171
  "step": 166
1172
  },
1173
  {
1174
- "epoch": 4.606896551724138,
1175
- "grad_norm": 0.08794201397224419,
1176
  "learning_rate": 1.2644015788877684e-06,
1177
- "loss": 0.4872,
1178
  "step": 167
1179
  },
1180
  {
1181
- "epoch": 4.63448275862069,
1182
- "grad_norm": 0.08527818313124648,
1183
  "learning_rate": 1.0782051768070477e-06,
1184
- "loss": 0.4921,
1185
  "step": 168
1186
  },
1187
  {
1188
- "epoch": 4.662068965517241,
1189
- "grad_norm": 0.08619600676422086,
1190
  "learning_rate": 9.066456699745774e-07,
1191
- "loss": 0.4999,
1192
  "step": 169
1193
  },
1194
  {
1195
- "epoch": 4.689655172413794,
1196
- "grad_norm": 0.08400292356812496,
1197
  "learning_rate": 7.497875749046124e-07,
1198
- "loss": 0.498,
1199
  "step": 170
1200
  },
1201
  {
1202
- "epoch": 4.7172413793103445,
1203
- "grad_norm": 0.08429763197590825,
1204
  "learning_rate": 6.076898795116792e-07,
1205
- "loss": 0.4935,
1206
  "step": 171
1207
  },
1208
  {
1209
- "epoch": 4.744827586206896,
1210
- "grad_norm": 0.08549163461863384,
1211
  "learning_rate": 4.804060209276396e-07,
1212
- "loss": 0.4944,
1213
  "step": 172
1214
  },
1215
  {
1216
- "epoch": 4.772413793103448,
1217
- "grad_norm": 0.08401106378330236,
1218
  "learning_rate": 3.679838654061874e-07,
1219
- "loss": 0.491,
1220
  "step": 173
1221
  },
1222
  {
1223
- "epoch": 4.8,
1224
- "grad_norm": 0.0841142588132846,
1225
  "learning_rate": 2.704656903222791e-07,
1226
- "loss": 0.4906,
1227
  "step": 174
1228
  },
1229
  {
1230
- "epoch": 4.827586206896552,
1231
- "grad_norm": 0.08502878964043316,
1232
  "learning_rate": 1.8788816827336686e-07,
1233
- "loss": 0.4972,
1234
  "step": 175
1235
  },
1236
  {
1237
- "epoch": 4.855172413793103,
1238
- "grad_norm": 0.08487729047940562,
1239
  "learning_rate": 1.2028235328831906e-07,
1240
- "loss": 0.4991,
1241
  "step": 176
1242
  },
1243
  {
1244
- "epoch": 4.882758620689655,
1245
- "grad_norm": 0.08075987729602521,
1246
  "learning_rate": 6.767366914927298e-08,
1247
- "loss": 0.4877,
1248
  "step": 177
1249
  },
1250
  {
1251
- "epoch": 4.910344827586207,
1252
- "grad_norm": 0.08252425353088089,
1253
  "learning_rate": 3.0081899830798345e-08,
1254
- "loss": 0.5049,
1255
  "step": 178
1256
  },
1257
  {
1258
- "epoch": 4.937931034482759,
1259
- "grad_norm": 0.0815920458724209,
1260
  "learning_rate": 7.521182059946342e-09,
1261
- "loss": 0.4965,
1262
  "step": 179
1263
  },
1264
  {
1265
- "epoch": 4.9655172413793105,
1266
- "grad_norm": 0.08271877616161065,
1267
  "learning_rate": 0.0,
1268
- "loss": 0.4967,
1269
  "step": 180
1270
  },
1271
  {
1272
- "epoch": 4.9655172413793105,
1273
  "step": 180,
1274
- "total_flos": 4.786232862271603e+18,
1275
- "train_loss": 0.0,
1276
- "train_runtime": 15.6924,
1277
- "train_samples_per_second": 5912.423,
1278
- "train_steps_per_second": 11.471
1279
  }
1280
  ],
1281
  "logging_steps": 1,
@@ -1295,7 +1295,7 @@
1295
  "attributes": {}
1296
  }
1297
  },
1298
- "total_flos": 4.786232862271603e+18,
1299
  "train_batch_size": 1,
1300
  "trial_name": null,
1301
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.9862068965517246,
5
  "eval_steps": 500,
6
  "global_step": 180,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.027586206896551724,
13
+ "grad_norm": 6.057142949758365,
14
  "learning_rate": 4.444444444444444e-06,
15
+ "loss": 1.1152,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.05517241379310345,
20
+ "grad_norm": 6.145910541936948,
21
  "learning_rate": 8.888888888888888e-06,
22
+ "loss": 1.1226,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.08275862068965517,
27
+ "grad_norm": 4.6156584836388985,
28
  "learning_rate": 1.3333333333333333e-05,
29
+ "loss": 1.0668,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.1103448275862069,
34
+ "grad_norm": 2.0844917792361026,
35
  "learning_rate": 1.7777777777777777e-05,
36
+ "loss": 0.9858,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.13793103448275862,
41
+ "grad_norm": 5.639314445722297,
42
  "learning_rate": 2.2222222222222227e-05,
43
+ "loss": 1.0206,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.16551724137931034,
48
+ "grad_norm": 9.814320245486522,
49
  "learning_rate": 2.6666666666666667e-05,
50
+ "loss": 1.0636,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.19310344827586207,
55
+ "grad_norm": 5.826021228513547,
56
  "learning_rate": 3.111111111111112e-05,
57
+ "loss": 0.9962,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.2206896551724138,
62
+ "grad_norm": 4.240105877151636,
63
  "learning_rate": 3.555555555555555e-05,
64
+ "loss": 0.9422,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.2482758620689655,
69
+ "grad_norm": 2.948003996753527,
70
  "learning_rate": 4e-05,
71
+ "loss": 0.908,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.27586206896551724,
76
+ "grad_norm": 2.442813041969753,
77
  "learning_rate": 4.444444444444445e-05,
78
+ "loss": 0.8726,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.30344827586206896,
83
+ "grad_norm": 1.78751318355134,
84
  "learning_rate": 4.88888888888889e-05,
85
+ "loss": 0.8519,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.3310344827586207,
90
+ "grad_norm": 2.145676514809894,
91
  "learning_rate": 5.333333333333333e-05,
92
+ "loss": 0.8338,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.3586206896551724,
97
+ "grad_norm": 1.4973609466110933,
98
  "learning_rate": 5.777777777777778e-05,
99
+ "loss": 0.8234,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.38620689655172413,
104
+ "grad_norm": 1.4701216908020476,
105
  "learning_rate": 6.222222222222223e-05,
106
+ "loss": 0.8208,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.41379310344827586,
111
+ "grad_norm": 2.2690605841110383,
112
  "learning_rate": 6.666666666666667e-05,
113
+ "loss": 0.8051,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.4413793103448276,
118
+ "grad_norm": 1.459759924944365,
119
  "learning_rate": 7.11111111111111e-05,
120
+ "loss": 0.7941,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.4689655172413793,
125
+ "grad_norm": 1.9924380747391626,
126
  "learning_rate": 7.555555555555556e-05,
127
+ "loss": 0.808,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.496551724137931,
132
+ "grad_norm": 1.012667067650111,
133
  "learning_rate": 8e-05,
134
+ "loss": 0.7795,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.5241379310344828,
139
+ "grad_norm": 1.8264998941678103,
140
  "learning_rate": 7.999247881794007e-05,
141
+ "loss": 0.7755,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.5517241379310345,
146
+ "grad_norm": 1.7591806028810983,
147
  "learning_rate": 7.996991810016922e-05,
148
+ "loss": 0.7841,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.5793103448275863,
153
+ "grad_norm": 1.8035602837249733,
154
  "learning_rate": 7.993232633085074e-05,
155
+ "loss": 0.7739,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.6068965517241379,
160
+ "grad_norm": 1.1926701865629463,
161
  "learning_rate": 7.987971764671168e-05,
162
+ "loss": 0.7766,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.6344827586206897,
167
+ "grad_norm": 1.5260088465114232,
168
  "learning_rate": 7.981211183172663e-05,
169
+ "loss": 0.7519,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.6620689655172414,
174
+ "grad_norm": 1.2196288042610952,
175
  "learning_rate": 7.972953430967773e-05,
176
+ "loss": 0.7535,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.6896551724137931,
181
+ "grad_norm": 1.3358221205077483,
182
  "learning_rate": 7.963201613459381e-05,
183
+ "loss": 0.7539,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.7172413793103448,
188
+ "grad_norm": 1.4743232308059264,
189
  "learning_rate": 7.951959397907237e-05,
190
+ "loss": 0.7489,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.7448275862068966,
195
+ "grad_norm": 1.0482717244241522,
196
  "learning_rate": 7.939231012048833e-05,
197
+ "loss": 0.7499,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.7724137931034483,
202
+ "grad_norm": 1.607141576026994,
203
  "learning_rate": 7.925021242509539e-05,
204
+ "loss": 0.7517,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.8,
209
+ "grad_norm": 1.0444369512935325,
210
  "learning_rate": 7.909335433002543e-05,
211
+ "loss": 0.7343,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.8275862068965517,
216
+ "grad_norm": 1.3715967605776578,
217
  "learning_rate": 7.892179482319297e-05,
218
+ "loss": 0.7327,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.8551724137931035,
223
+ "grad_norm": 1.4794587118534477,
224
  "learning_rate": 7.873559842111225e-05,
225
+ "loss": 0.7311,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.8827586206896552,
230
+ "grad_norm": 1.0276679782161036,
231
  "learning_rate": 7.853483514463521e-05,
232
+ "loss": 0.7412,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.9103448275862069,
237
+ "grad_norm": 1.562602474322965,
238
  "learning_rate": 7.831958049261956e-05,
239
+ "loss": 0.7206,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.9379310344827586,
244
+ "grad_norm": 1.1589015896595778,
245
  "learning_rate": 7.808991541353662e-05,
246
+ "loss": 0.7253,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.9655172413793104,
251
+ "grad_norm": 1.2870157718913948,
252
  "learning_rate": 7.784592627503004e-05,
253
+ "loss": 0.7171,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.993103448275862,
258
+ "grad_norm": 1.157478869559292,
259
  "learning_rate": 7.758770483143634e-05,
260
+ "loss": 0.7076,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 1.0206896551724138,
265
+ "grad_norm": 0.835008338168959,
266
  "learning_rate": 7.731534818928004e-05,
267
+ "loss": 0.6999,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 1.0482758620689656,
272
+ "grad_norm": 1.184986109553545,
273
  "learning_rate": 7.702895877075563e-05,
274
+ "loss": 0.6842,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 1.0758620689655172,
279
+ "grad_norm": 0.7037458788649661,
280
  "learning_rate": 7.672864427521097e-05,
281
+ "loss": 0.6737,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 1.103448275862069,
286
+ "grad_norm": 0.8909082172104459,
287
  "learning_rate": 7.641451763864587e-05,
288
+ "loss": 0.6711,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 1.1310344827586207,
293
+ "grad_norm": 0.7897531063470068,
294
  "learning_rate": 7.608669699124153e-05,
295
+ "loss": 0.6758,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 1.1586206896551725,
300
+ "grad_norm": 0.865820595929729,
301
  "learning_rate": 7.57453056129365e-05,
302
+ "loss": 0.6642,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 1.186206896551724,
307
+ "grad_norm": 0.9066372016353027,
308
  "learning_rate": 7.539047188706631e-05,
309
+ "loss": 0.6673,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 1.2137931034482758,
314
+ "grad_norm": 0.9578735934793379,
315
  "learning_rate": 7.502232925208365e-05,
316
+ "loss": 0.6573,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 1.2413793103448276,
321
+ "grad_norm": 1.228545635851328,
322
  "learning_rate": 7.464101615137756e-05,
323
+ "loss": 0.6691,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 1.2689655172413792,
328
+ "grad_norm": 0.784258383433901,
329
  "learning_rate": 7.424667598121067e-05,
330
+ "loss": 0.6645,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 1.296551724137931,
335
+ "grad_norm": 0.5488455071631521,
336
  "learning_rate": 7.383945703679365e-05,
337
+ "loss": 0.6701,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 1.3241379310344827,
342
+ "grad_norm": 0.6919336141598718,
343
  "learning_rate": 7.341951245651747e-05,
344
+ "loss": 0.6479,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 1.3517241379310345,
349
+ "grad_norm": 0.7253772457368683,
350
  "learning_rate": 7.298700016436427e-05,
351
+ "loss": 0.6645,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 1.3793103448275863,
356
+ "grad_norm": 0.6959501973757715,
357
  "learning_rate": 7.254208281051871e-05,
358
+ "loss": 0.6587,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 1.4068965517241379,
363
+ "grad_norm": 0.514312927167637,
364
  "learning_rate": 7.208492771020176e-05,
365
+ "loss": 0.6524,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 1.4344827586206896,
370
+ "grad_norm": 0.7823603739568513,
371
  "learning_rate": 7.161570678075038e-05,
372
+ "loss": 0.6406,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 1.4620689655172414,
377
+ "grad_norm": 0.7568072322952989,
378
  "learning_rate": 7.113459647696641e-05,
379
+ "loss": 0.6443,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 1.489655172413793,
384
+ "grad_norm": 0.5845095753668036,
385
  "learning_rate": 7.064177772475912e-05,
386
+ "loss": 0.6509,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 1.5172413793103448,
391
+ "grad_norm": 0.549538090384788,
392
  "learning_rate": 7.013743585310642e-05,
393
+ "loss": 0.6455,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 1.5448275862068965,
398
+ "grad_norm": 0.42853146855834945,
399
  "learning_rate": 6.96217605243602e-05,
400
+ "loss": 0.6515,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 1.5724137931034483,
405
+ "grad_norm": 0.5919464795735628,
406
  "learning_rate": 6.909494566292195e-05,
407
+ "loss": 0.6521,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 1.6,
412
+ "grad_norm": 0.5612335032451046,
413
  "learning_rate": 6.855718938231597e-05,
414
+ "loss": 0.6377,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 1.6275862068965519,
419
+ "grad_norm": 0.4559541732365318,
420
  "learning_rate": 6.800869391068674e-05,
421
+ "loss": 0.636,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 1.6551724137931034,
426
+ "grad_norm": 0.5658087527174088,
427
  "learning_rate": 6.744966551474936e-05,
428
+ "loss": 0.6438,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 1.6827586206896552,
433
+ "grad_norm": 0.6190951960162644,
434
  "learning_rate": 6.688031442222091e-05,
435
+ "loss": 0.6411,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 1.7103448275862068,
440
+ "grad_norm": 0.7195796324328294,
441
  "learning_rate": 6.630085474276256e-05,
442
+ "loss": 0.6427,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 1.7379310344827585,
447
+ "grad_norm": 0.8359117093224574,
448
  "learning_rate": 6.571150438746157e-05,
449
+ "loss": 0.6468,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 1.7655172413793103,
454
+ "grad_norm": 0.8054507281657433,
455
  "learning_rate": 6.511248498688396e-05,
456
+ "loss": 0.6363,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 1.793103448275862,
461
+ "grad_norm": 0.4186180019625103,
462
  "learning_rate": 6.450402180772811e-05,
463
+ "loss": 0.6358,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 1.8206896551724139,
468
+ "grad_norm": 0.5158849627943599,
469
  "learning_rate": 6.388634366811146e-05,
470
+ "loss": 0.6426,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 1.8482758620689657,
475
+ "grad_norm": 0.7115229805464107,
476
  "learning_rate": 6.325968285152107e-05,
477
+ "loss": 0.6371,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 1.8758620689655172,
482
+ "grad_norm": 0.40684116131056497,
483
  "learning_rate": 6.262427501946155e-05,
484
+ "loss": 0.6388,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 1.903448275862069,
489
+ "grad_norm": 0.4645874829456271,
490
  "learning_rate": 6.198035912283225e-05,
491
+ "loss": 0.6421,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 1.9310344827586206,
496
+ "grad_norm": 0.5276269421219535,
497
  "learning_rate": 6.132817731206766e-05,
498
+ "loss": 0.6349,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 1.9586206896551723,
503
+ "grad_norm": 0.2637377892392192,
504
  "learning_rate": 6.0667974846074524e-05,
505
+ "loss": 0.6422,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 1.986206896551724,
510
+ "grad_norm": 0.4710066472248594,
511
  "learning_rate": 6.000000000000001e-05,
512
+ "loss": 0.6443,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 2.013793103448276,
517
+ "grad_norm": 0.49340518112375503,
518
  "learning_rate": 5.9324503971865545e-05,
519
+ "loss": 0.6066,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 2.0413793103448277,
524
+ "grad_norm": 0.3207843106148629,
525
  "learning_rate": 5.8641740788101566e-05,
526
+ "loss": 0.5757,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 2.0689655172413794,
531
+ "grad_norm": 0.5170184075766957,
532
  "learning_rate": 5.79519672080185e-05,
533
+ "loss": 0.5782,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 2.0965517241379312,
538
+ "grad_norm": 0.4440866091720341,
539
  "learning_rate": 5.7255442627250146e-05,
540
+ "loss": 0.5668,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 2.1241379310344826,
545
+ "grad_norm": 0.43592016113519555,
546
  "learning_rate": 5.6552428980205575e-05,
547
+ "loss": 0.5724,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 2.1517241379310343,
552
+ "grad_norm": 0.6321290886680759,
553
  "learning_rate": 5.584319064156628e-05,
554
+ "loss": 0.5788,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 2.179310344827586,
559
+ "grad_norm": 0.7349080038701521,
560
  "learning_rate": 5.5127994326865706e-05,
561
+ "loss": 0.5722,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 2.206896551724138,
566
+ "grad_norm": 0.9939418365212196,
567
  "learning_rate": 5.440710899218842e-05,
568
+ "loss": 0.5913,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 2.2344827586206897,
573
+ "grad_norm": 1.053283529090608,
574
  "learning_rate": 5.368080573302676e-05,
575
+ "loss": 0.5813,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 2.2620689655172415,
580
+ "grad_norm": 0.5335709817584277,
581
  "learning_rate": 5.294935768233285e-05,
582
+ "loss": 0.5819,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 2.2896551724137932,
587
+ "grad_norm": 0.5556730125717949,
588
  "learning_rate": 5.2213039907804535e-05,
589
+ "loss": 0.5711,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 2.317241379310345,
594
+ "grad_norm": 0.7237898281118994,
595
  "learning_rate": 5.1472129308443616e-05,
596
+ "loss": 0.5808,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 2.344827586206897,
601
+ "grad_norm": 0.4483667877084763,
602
  "learning_rate": 5.07269045104255e-05,
603
+ "loss": 0.5751,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 2.372413793103448,
608
+ "grad_norm": 0.45516961470100403,
609
  "learning_rate": 4.9977645762319255e-05,
610
+ "loss": 0.5825,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 2.4,
615
+ "grad_norm": 0.5326361342187356,
616
  "learning_rate": 4.922463482969761e-05,
617
+ "loss": 0.5745,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 2.4275862068965517,
622
+ "grad_norm": 0.33794202259450734,
623
  "learning_rate": 4.846815488917644e-05,
624
+ "loss": 0.567,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 2.4551724137931035,
629
+ "grad_norm": 0.4717206710356316,
630
  "learning_rate": 4.7708490421923596e-05,
631
+ "loss": 0.5701,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 2.4827586206896552,
636
+ "grad_norm": 0.35160862153917766,
637
  "learning_rate": 4.694592710667723e-05,
638
+ "loss": 0.5683,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 2.510344827586207,
643
+ "grad_norm": 0.41235441403045187,
644
  "learning_rate": 4.618075171231363e-05,
645
+ "loss": 0.5663,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 2.5379310344827584,
650
+ "grad_norm": 0.3834318800021635,
651
  "learning_rate": 4.541325199000525e-05,
652
+ "loss": 0.5738,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 2.56551724137931,
657
+ "grad_norm": 0.3359652319248755,
658
  "learning_rate": 4.464371656500921e-05,
659
+ "loss": 0.5708,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 2.593103448275862,
664
+ "grad_norm": 0.4058892708987728,
665
  "learning_rate": 4.387243482812717e-05,
666
+ "loss": 0.5735,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 2.6206896551724137,
671
+ "grad_norm": 0.24135648366203907,
672
  "learning_rate": 4.309969682687724e-05,
673
+ "loss": 0.5599,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 2.6482758620689655,
678
+ "grad_norm": 0.455207820387286,
679
  "learning_rate": 4.2325793156419035e-05,
680
+ "loss": 0.5696,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 2.6758620689655173,
685
+ "grad_norm": 0.2481077148761728,
686
  "learning_rate": 4.155101485027268e-05,
687
+ "loss": 0.5646,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 2.703448275862069,
692
+ "grad_norm": 0.41544801681517823,
693
  "learning_rate": 4.077565327087298e-05,
694
+ "loss": 0.5725,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 2.731034482758621,
699
+ "grad_norm": 0.2763049947727795,
700
  "learning_rate": 4e-05,
701
+ "loss": 0.5742,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 2.7586206896551726,
706
+ "grad_norm": 0.35790420887723046,
707
  "learning_rate": 3.9224346729127034e-05,
708
+ "loss": 0.5641,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 2.7862068965517244,
713
+ "grad_norm": 0.2733644332961225,
714
  "learning_rate": 3.844898514972733e-05,
715
+ "loss": 0.5659,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 2.8137931034482757,
720
+ "grad_norm": 0.2845568055283348,
721
  "learning_rate": 3.767420684358097e-05,
722
+ "loss": 0.5672,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 2.8413793103448275,
727
+ "grad_norm": 0.285482339765262,
728
  "learning_rate": 3.690030317312277e-05,
729
+ "loss": 0.567,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 2.8689655172413793,
734
+ "grad_norm": 0.20475680115724987,
735
  "learning_rate": 3.612756517187284e-05,
736
+ "loss": 0.5718,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 2.896551724137931,
741
+ "grad_norm": 0.2326541772198023,
742
  "learning_rate": 3.535628343499079e-05,
743
+ "loss": 0.5772,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 2.924137931034483,
748
+ "grad_norm": 0.18921206967423485,
749
  "learning_rate": 3.458674800999477e-05,
750
+ "loss": 0.5783,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 2.9517241379310346,
755
+ "grad_norm": 0.2038750201327149,
756
  "learning_rate": 3.3819248287686386e-05,
757
+ "loss": 0.5703,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 2.979310344827586,
762
+ "grad_norm": 0.14379516032972245,
763
  "learning_rate": 3.305407289332279e-05,
764
+ "loss": 0.5601,
765
  "step": 108
766
  },
767
  {
768
+ "epoch": 3.027586206896552,
769
+ "grad_norm": 0.2768205747671116,
770
  "learning_rate": 3.229150957807641e-05,
771
+ "loss": 0.523,
772
  "step": 109
773
  },
774
  {
775
+ "epoch": 3.0551724137931036,
776
+ "grad_norm": 0.21016853224005227,
777
  "learning_rate": 3.153184511082359e-05,
778
+ "loss": 0.5191,
779
  "step": 110
780
  },
781
  {
782
+ "epoch": 3.0827586206896553,
783
+ "grad_norm": 0.20830139751443208,
784
  "learning_rate": 3.07753651703024e-05,
785
+ "loss": 0.5249,
786
  "step": 111
787
  },
788
  {
789
+ "epoch": 3.110344827586207,
790
+ "grad_norm": 0.2136209425276454,
791
  "learning_rate": 3.0022354237680752e-05,
792
+ "loss": 0.5229,
793
  "step": 112
794
  },
795
  {
796
+ "epoch": 3.1379310344827585,
797
+ "grad_norm": 0.21097547894166163,
798
  "learning_rate": 2.9273095489574502e-05,
799
+ "loss": 0.5201,
800
  "step": 113
801
  },
802
  {
803
+ "epoch": 3.1655172413793102,
804
+ "grad_norm": 0.1896812969267311,
805
  "learning_rate": 2.8527870691556404e-05,
806
+ "loss": 0.5152,
807
  "step": 114
808
  },
809
  {
810
+ "epoch": 3.193103448275862,
811
+ "grad_norm": 0.19182005710613134,
812
  "learning_rate": 2.778696009219548e-05,
813
+ "loss": 0.5163,
814
  "step": 115
815
  },
816
  {
817
+ "epoch": 3.220689655172414,
818
+ "grad_norm": 0.20717320480790488,
819
  "learning_rate": 2.7050642317667164e-05,
820
+ "loss": 0.5152,
821
  "step": 116
822
  },
823
  {
824
+ "epoch": 3.2482758620689656,
825
+ "grad_norm": 0.1934750793364237,
826
  "learning_rate": 2.6319194266973256e-05,
827
+ "loss": 0.5256,
828
  "step": 117
829
  },
830
  {
831
+ "epoch": 3.2758620689655173,
832
+ "grad_norm": 0.1908018204961606,
833
  "learning_rate": 2.5592891007811594e-05,
834
+ "loss": 0.5144,
835
  "step": 118
836
  },
837
  {
838
+ "epoch": 3.303448275862069,
839
+ "grad_norm": 0.1931425320727,
840
  "learning_rate": 2.4872005673134307e-05,
841
+ "loss": 0.5127,
842
  "step": 119
843
  },
844
  {
845
+ "epoch": 3.3310344827586205,
846
+ "grad_norm": 0.1873332364850258,
847
  "learning_rate": 2.4156809358433728e-05,
848
+ "loss": 0.5213,
849
  "step": 120
850
  },
851
  {
852
+ "epoch": 3.3586206896551722,
853
+ "grad_norm": 0.1931646495691283,
854
  "learning_rate": 2.3447571019794438e-05,
855
+ "loss": 0.52,
856
  "step": 121
857
  },
858
  {
859
+ "epoch": 3.386206896551724,
860
+ "grad_norm": 0.17748716266389702,
861
  "learning_rate": 2.274455737274987e-05,
862
+ "loss": 0.5292,
863
  "step": 122
864
  },
865
  {
866
+ "epoch": 3.413793103448276,
867
+ "grad_norm": 0.17696869636060245,
868
  "learning_rate": 2.2048032791981515e-05,
869
+ "loss": 0.5186,
870
  "step": 123
871
  },
872
  {
873
+ "epoch": 3.4413793103448276,
874
+ "grad_norm": 0.17353697184306396,
875
  "learning_rate": 2.135825921189846e-05,
876
+ "loss": 0.52,
877
  "step": 124
878
  },
879
  {
880
+ "epoch": 3.4689655172413794,
881
+ "grad_norm": 0.18510424589309554,
882
  "learning_rate": 2.067549602813446e-05,
883
  "loss": 0.5227,
884
  "step": 125
885
  },
886
  {
887
+ "epoch": 3.496551724137931,
888
+ "grad_norm": 0.15226535060354468,
889
  "learning_rate": 2.0000000000000012e-05,
890
+ "loss": 0.5216,
891
  "step": 126
892
  },
893
  {
894
+ "epoch": 3.524137931034483,
895
+ "grad_norm": 0.170175313841858,
896
  "learning_rate": 1.9332025153925486e-05,
897
+ "loss": 0.5188,
898
  "step": 127
899
  },
900
  {
901
+ "epoch": 3.5517241379310347,
902
+ "grad_norm": 0.15637491216188124,
903
  "learning_rate": 1.867182268793236e-05,
904
+ "loss": 0.5214,
905
  "step": 128
906
  },
907
  {
908
+ "epoch": 3.5793103448275865,
909
+ "grad_norm": 0.14907015306747484,
910
  "learning_rate": 1.8019640877167763e-05,
911
+ "loss": 0.5169,
912
  "step": 129
913
  },
914
  {
915
+ "epoch": 3.606896551724138,
916
+ "grad_norm": 0.13930060635910274,
917
  "learning_rate": 1.7375724980538465e-05,
918
+ "loss": 0.5178,
919
  "step": 130
920
  },
921
  {
922
+ "epoch": 3.6344827586206896,
923
+ "grad_norm": 0.14366484263376328,
924
  "learning_rate": 1.6740317148478932e-05,
925
+ "loss": 0.5107,
926
  "step": 131
927
  },
928
  {
929
+ "epoch": 3.6620689655172414,
930
+ "grad_norm": 0.1259014819623676,
931
  "learning_rate": 1.6113656331888563e-05,
932
+ "loss": 0.5131,
933
  "step": 132
934
  },
935
  {
936
+ "epoch": 3.689655172413793,
937
+ "grad_norm": 0.14855964720690903,
938
  "learning_rate": 1.5495978192271887e-05,
939
+ "loss": 0.5217,
940
  "step": 133
941
  },
942
  {
943
+ "epoch": 3.717241379310345,
944
+ "grad_norm": 0.1479696755604304,
945
  "learning_rate": 1.4887515013116067e-05,
946
+ "loss": 0.5293,
947
  "step": 134
948
  },
949
  {
950
+ "epoch": 3.7448275862068967,
951
+ "grad_norm": 0.1271428368738868,
952
  "learning_rate": 1.4288495612538427e-05,
953
+ "loss": 0.5252,
954
  "step": 135
955
  },
956
  {
957
+ "epoch": 3.772413793103448,
958
+ "grad_norm": 0.12856403412888792,
959
  "learning_rate": 1.369914525723746e-05,
960
+ "loss": 0.5192,
961
  "step": 136
962
  },
963
  {
964
+ "epoch": 3.8,
965
+ "grad_norm": 0.118525496035851,
966
  "learning_rate": 1.3119685577779105e-05,
967
+ "loss": 0.5231,
968
  "step": 137
969
  },
970
  {
971
+ "epoch": 3.8275862068965516,
972
+ "grad_norm": 0.11984692729474616,
973
  "learning_rate": 1.2550334485250661e-05,
974
+ "loss": 0.5147,
975
  "step": 138
976
  },
977
  {
978
+ "epoch": 3.8551724137931034,
979
+ "grad_norm": 0.12228015600802276,
980
  "learning_rate": 1.1991306089313261e-05,
981
+ "loss": 0.5138,
982
  "step": 139
983
  },
984
  {
985
+ "epoch": 3.882758620689655,
986
+ "grad_norm": 0.11172721477526992,
987
  "learning_rate": 1.1442810617684046e-05,
988
+ "loss": 0.5162,
989
  "step": 140
990
  },
991
  {
992
+ "epoch": 3.910344827586207,
993
+ "grad_norm": 0.10750828265688152,
994
  "learning_rate": 1.0905054337078051e-05,
995
+ "loss": 0.5234,
996
  "step": 141
997
  },
998
  {
999
+ "epoch": 3.9379310344827587,
1000
+ "grad_norm": 0.11418030466140167,
1001
  "learning_rate": 1.0378239475639823e-05,
1002
+ "loss": 0.5258,
1003
  "step": 142
1004
  },
1005
  {
1006
+ "epoch": 3.9655172413793105,
1007
+ "grad_norm": 0.10340131054796338,
1008
  "learning_rate": 9.862564146893571e-06,
1009
+ "loss": 0.5255,
1010
  "step": 143
1011
  },
1012
  {
1013
+ "epoch": 3.9931034482758623,
1014
+ "grad_norm": 0.11264648441495413,
1015
  "learning_rate": 9.358222275240884e-06,
1016
+ "loss": 0.5047,
1017
  "step": 144
1018
  },
1019
  {
1020
+ "epoch": 4.020689655172414,
1021
+ "grad_norm": 0.13999590558231068,
1022
  "learning_rate": 8.8654035230336e-06,
1023
+ "loss": 0.5049,
1024
  "step": 145
1025
  },
1026
  {
1027
+ "epoch": 4.048275862068966,
1028
+ "grad_norm": 0.12964283490610493,
1029
  "learning_rate": 8.384293219249633e-06,
1030
+ "loss": 0.4855,
1031
  "step": 146
1032
  },
1033
  {
1034
+ "epoch": 4.075862068965518,
1035
+ "grad_norm": 0.11608137909176354,
1036
  "learning_rate": 7.915072289798247e-06,
1037
+ "loss": 0.4838,
1038
  "step": 147
1039
  },
1040
  {
1041
+ "epoch": 4.103448275862069,
1042
+ "grad_norm": 0.11905983657930706,
1043
  "learning_rate": 7.457917189481301e-06,
1044
+ "loss": 0.4859,
1045
  "step": 148
1046
  },
1047
  {
1048
+ "epoch": 4.13103448275862,
1049
+ "grad_norm": 0.11427541811021874,
1050
  "learning_rate": 7.0129998356357295e-06,
1051
+ "loss": 0.492,
1052
  "step": 149
1053
  },
1054
  {
1055
+ "epoch": 4.158620689655172,
1056
+ "grad_norm": 0.11070784808304583,
1057
  "learning_rate": 6.58048754348255e-06,
1058
+ "loss": 0.4879,
1059
  "step": 150
1060
  },
1061
  {
1062
+ "epoch": 4.186206896551724,
1063
+ "grad_norm": 0.12188307993915756,
1064
  "learning_rate": 6.160542963206357e-06,
1065
+ "loss": 0.4934,
1066
  "step": 151
1067
  },
1068
  {
1069
+ "epoch": 4.213793103448276,
1070
+ "grad_norm": 0.16152583695371422,
1071
  "learning_rate": 5.753324018789346e-06,
1072
+ "loss": 0.4947,
1073
  "step": 152
1074
  },
1075
  {
1076
+ "epoch": 4.241379310344827,
1077
+ "grad_norm": 0.11998316007847164,
1078
  "learning_rate": 5.358983848622452e-06,
1079
+ "loss": 0.4898,
1080
  "step": 153
1081
  },
1082
  {
1083
+ "epoch": 4.268965517241379,
1084
+ "grad_norm": 0.11862957258816627,
1085
  "learning_rate": 4.97767074791637e-06,
1086
+ "loss": 0.4971,
1087
  "step": 154
1088
  },
1089
  {
1090
+ "epoch": 4.296551724137931,
1091
+ "grad_norm": 0.1012872945696045,
1092
  "learning_rate": 4.609528112933688e-06,
1093
+ "loss": 0.4827,
1094
  "step": 155
1095
  },
1096
  {
1097
+ "epoch": 4.324137931034483,
1098
+ "grad_norm": 0.10220903393664306,
1099
  "learning_rate": 4.254694387063514e-06,
1100
+ "loss": 0.4881,
1101
  "step": 156
1102
  },
1103
  {
1104
+ "epoch": 4.3517241379310345,
1105
+ "grad_norm": 0.10939237995735153,
1106
  "learning_rate": 3.913303008758491e-06,
1107
+ "loss": 0.5004,
1108
  "step": 157
1109
  },
1110
  {
1111
+ "epoch": 4.379310344827586,
1112
+ "grad_norm": 0.10312457830805638,
1113
  "learning_rate": 3.585482361354138e-06,
1114
+ "loss": 0.4957,
1115
  "step": 158
1116
  },
1117
  {
1118
+ "epoch": 4.406896551724138,
1119
+ "grad_norm": 0.09947763462895909,
1120
  "learning_rate": 3.2713557247890447e-06,
1121
+ "loss": 0.4872,
1122
  "step": 159
1123
  },
1124
  {
1125
+ "epoch": 4.43448275862069,
1126
+ "grad_norm": 0.09774274637727455,
1127
  "learning_rate": 2.9710412292443868e-06,
1128
+ "loss": 0.4883,
1129
  "step": 160
1130
  },
1131
  {
1132
+ "epoch": 4.462068965517242,
1133
+ "grad_norm": 0.09376017366721634,
1134
  "learning_rate": 2.6846518107199782e-06,
1135
+ "loss": 0.4875,
1136
  "step": 161
1137
  },
1138
  {
1139
+ "epoch": 4.489655172413793,
1140
+ "grad_norm": 0.09560458098591364,
1141
  "learning_rate": 2.4122951685636674e-06,
1142
+ "loss": 0.4958,
1143
  "step": 162
1144
  },
1145
  {
1146
+ "epoch": 4.517241379310345,
1147
+ "grad_norm": 0.09249329390203823,
1148
  "learning_rate": 2.1540737249699893e-06,
1149
+ "loss": 0.5004,
1150
  "step": 163
1151
  },
1152
  {
1153
+ "epoch": 4.544827586206896,
1154
+ "grad_norm": 0.09283868317617097,
1155
  "learning_rate": 1.9100845864633875e-06,
1156
+ "loss": 0.4908,
1157
  "step": 164
1158
  },
1159
  {
1160
+ "epoch": 4.572413793103449,
1161
+ "grad_norm": 0.08806202611058704,
1162
  "learning_rate": 1.6804195073804442e-06,
1163
+ "loss": 0.4966,
1164
  "step": 165
1165
  },
1166
  {
1167
+ "epoch": 4.6,
1168
+ "grad_norm": 0.08616358519747361,
1169
  "learning_rate": 1.4651648553647869e-06,
1170
+ "loss": 0.4848,
1171
  "step": 166
1172
  },
1173
  {
1174
+ "epoch": 4.627586206896551,
1175
+ "grad_norm": 0.08709433695781935,
1176
  "learning_rate": 1.2644015788877684e-06,
1177
+ "loss": 0.4854,
1178
  "step": 167
1179
  },
1180
  {
1181
+ "epoch": 4.655172413793103,
1182
+ "grad_norm": 0.08509054532560552,
1183
  "learning_rate": 1.0782051768070477e-06,
1184
+ "loss": 0.4979,
1185
  "step": 168
1186
  },
1187
  {
1188
+ "epoch": 4.682758620689655,
1189
+ "grad_norm": 0.0876465160396204,
1190
  "learning_rate": 9.066456699745774e-07,
1191
+ "loss": 0.4942,
1192
  "step": 169
1193
  },
1194
  {
1195
+ "epoch": 4.710344827586207,
1196
+ "grad_norm": 0.08495623263419229,
1197
  "learning_rate": 7.497875749046124e-07,
1198
+ "loss": 0.4942,
1199
  "step": 170
1200
  },
1201
  {
1202
+ "epoch": 4.7379310344827585,
1203
+ "grad_norm": 0.08566068053991785,
1204
  "learning_rate": 6.076898795116792e-07,
1205
+ "loss": 0.4924,
1206
  "step": 171
1207
  },
1208
  {
1209
+ "epoch": 4.76551724137931,
1210
+ "grad_norm": 0.08613139456520355,
1211
  "learning_rate": 4.804060209276396e-07,
1212
+ "loss": 0.4908,
1213
  "step": 172
1214
  },
1215
  {
1216
+ "epoch": 4.793103448275862,
1217
+ "grad_norm": 0.08723765769552372,
1218
  "learning_rate": 3.679838654061874e-07,
1219
+ "loss": 0.4942,
1220
  "step": 173
1221
  },
1222
  {
1223
+ "epoch": 4.820689655172414,
1224
+ "grad_norm": 0.08376548526232536,
1225
  "learning_rate": 2.704656903222791e-07,
1226
+ "loss": 0.488,
1227
  "step": 174
1228
  },
1229
  {
1230
+ "epoch": 4.848275862068966,
1231
+ "grad_norm": 0.08618546331155713,
1232
  "learning_rate": 1.8788816827336686e-07,
1233
+ "loss": 0.4953,
1234
  "step": 175
1235
  },
1236
  {
1237
+ "epoch": 4.875862068965517,
1238
+ "grad_norm": 0.08260421466524005,
1239
  "learning_rate": 1.2028235328831906e-07,
1240
+ "loss": 0.4922,
1241
  "step": 176
1242
  },
1243
  {
1244
+ "epoch": 4.903448275862069,
1245
+ "grad_norm": 0.0828537576684744,
1246
  "learning_rate": 6.767366914927298e-08,
1247
+ "loss": 0.5023,
1248
  "step": 177
1249
  },
1250
  {
1251
+ "epoch": 4.931034482758621,
1252
+ "grad_norm": 0.08601398618423486,
1253
  "learning_rate": 3.0081899830798345e-08,
1254
+ "loss": 0.494,
1255
  "step": 178
1256
  },
1257
  {
1258
+ "epoch": 4.958620689655173,
1259
+ "grad_norm": 0.08241186214721642,
1260
  "learning_rate": 7.521182059946342e-09,
1261
+ "loss": 0.4894,
1262
  "step": 179
1263
  },
1264
  {
1265
+ "epoch": 4.9862068965517246,
1266
+ "grad_norm": 0.0840600437175742,
1267
  "learning_rate": 0.0,
1268
+ "loss": 0.4915,
1269
  "step": 180
1270
  },
1271
  {
1272
+ "epoch": 4.9862068965517246,
1273
  "step": 180,
1274
+ "total_flos": 4.652519618982707e+18,
1275
+ "train_loss": 0.20232847813102933,
1276
+ "train_runtime": 7864.7522,
1277
+ "train_samples_per_second": 11.797,
1278
+ "train_steps_per_second": 0.023
1279
  }
1280
  ],
1281
  "logging_steps": 1,
 
1295
  "attributes": {}
1296
  }
1297
  },
1298
+ "total_flos": 4.652519618982707e+18,
1299
  "train_batch_size": 1,
1300
  "trial_name": null,
1301
  "trial_params": null
training_loss.png CHANGED