yalhessi commited on
Commit
97c8e02
·
verified ·
1 Parent(s): 18479a7

Training in progress, epoch 11, checkpoint

Browse files
checkpoint-39589/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e0e896e5b9aa7a8070f513d699b0fade916fc69a30e71b2ded22fa2cc5ff7e8
3
  size 541459256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba8ec9437404b2d8a5db200fc9cc3c2c3d7537e3f24d28aaad63eadb68e9336b
3
  size 541459256
checkpoint-39589/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ae385dc74df08c5418afe1aee1056d37d3026477db7c565f95117e5aed36725
3
  size 33662074
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d663b77754909ee65a553cc25dac214d3d233363e4deb6567e6194bcd88c3be3
3
  size 33662074
checkpoint-39589/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3dbac0a94b21cbd53b317c27ef150f76fd41a2dc5f8b2243d4e90ec3d5968fa
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90fc1fd0351f7fc1588603543e24f0c373712a77a187f7e99806ccaac72874b8
3
  size 15024
checkpoint-39589/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d144213e04fc250373f8810fa00b1c35a95468b03cdba13a1d3d30173520c57
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e59fea5df44e0be462caf0fb7bd297c644410317897f18927b1b2f6d1a9573
3
  size 15024
checkpoint-39589/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a58068e9a3b9efa3ab61ba45b1549e1f01d87a8282059f80d6e6447b982912e
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b1912b5fd588236455f3a968c3056f5b4acb61c7f9fe8042d1962d81992a61
3
  size 15024
checkpoint-39589/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e216f335f45f5ebac3ec088ce10c48e557a4d12ea156c48a5b133f4f482fe1b8
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ddab00274d728ff1748039a30a5f2f729ea7704e5ec12f5e5f024b0200da99
3
  size 15024
checkpoint-39589/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66a56648c1d159f41a8b9f69962096495d1772d34fc84794a76bc2f6e49ce0b8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87743db94a674c3a3a961be2dbb481406ecd03f7371aa9f0f7237b4465875326
3
  size 1064
checkpoint-39589/trainer_state.json CHANGED
@@ -10,987 +10,987 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.13892747985551543,
13
- "grad_norm": 0.6994414925575256,
14
- "learning_rate": 0.00039537834583680656,
15
- "loss": 0.2943,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.2000555709919422,
20
- "eval_loss": 0.21401312947273254,
21
- "eval_runtime": 16.5317,
22
- "eval_samples_per_second": 30.245,
23
- "eval_steps_per_second": 3.811,
24
  "step": 720
25
  },
26
  {
27
  "epoch": 0.27785495971103086,
28
- "grad_norm": 0.5945971608161926,
29
- "learning_rate": 0.00039075669167361305,
30
- "loss": 0.1986,
31
  "step": 1000
32
  },
33
  {
34
  "epoch": 0.4001111419838844,
35
- "eval_loss": 0.17625106871128082,
36
- "eval_runtime": 16.4823,
37
- "eval_samples_per_second": 30.336,
38
  "eval_steps_per_second": 3.822,
39
  "step": 1440
40
  },
41
  {
42
  "epoch": 0.41678243956654626,
43
- "grad_norm": 0.3619824945926666,
44
- "learning_rate": 0.0003861257756784292,
45
- "loss": 0.1802,
46
  "step": 1500
47
  },
48
  {
49
  "epoch": 0.5557099194220617,
50
- "grad_norm": 0.40362638235092163,
51
- "learning_rate": 0.00038149485968324534,
52
- "loss": 0.1686,
53
  "step": 2000
54
  },
55
  {
56
  "epoch": 0.6001667129758266,
57
- "eval_loss": 0.16156445443630219,
58
- "eval_runtime": 16.3535,
59
- "eval_samples_per_second": 30.575,
60
- "eval_steps_per_second": 3.852,
61
  "step": 2160
62
  },
63
  {
64
  "epoch": 0.6946373992775771,
65
- "grad_norm": 0.4962793290615082,
66
- "learning_rate": 0.0003768639436880615,
67
- "loss": 0.1577,
68
  "step": 2500
69
  },
70
  {
71
  "epoch": 0.8002222839677688,
72
- "eval_loss": 0.15043224394321442,
73
- "eval_runtime": 16.3325,
74
- "eval_samples_per_second": 30.614,
75
- "eval_steps_per_second": 3.857,
76
  "step": 2880
77
  },
78
  {
79
  "epoch": 0.8335648791330925,
80
- "grad_norm": 0.41315382719039917,
81
- "learning_rate": 0.0003722330276928777,
82
- "loss": 0.1493,
83
  "step": 3000
84
  },
85
  {
86
  "epoch": 0.972492358988608,
87
- "grad_norm": 0.37953054904937744,
88
  "learning_rate": 0.0003676113735296842,
89
- "loss": 0.1477,
90
  "step": 3500
91
  },
92
  {
93
  "epoch": 1.000277854959711,
94
- "eval_loss": 0.14538082480430603,
95
- "eval_runtime": 16.4466,
96
- "eval_samples_per_second": 30.401,
97
- "eval_steps_per_second": 3.831,
98
  "step": 3600
99
  },
100
  {
101
  "epoch": 1.1114198388441234,
102
- "grad_norm": 0.4146048128604889,
103
  "learning_rate": 0.00036298045753450036,
104
- "loss": 0.1318,
105
  "step": 4000
106
  },
107
  {
108
  "epoch": 1.2003334259516532,
109
- "eval_loss": 0.13961325585842133,
110
- "eval_runtime": 16.5835,
111
- "eval_samples_per_second": 30.15,
112
- "eval_steps_per_second": 3.799,
113
  "step": 4320
114
  },
115
  {
116
  "epoch": 1.2503473186996388,
117
- "grad_norm": 0.6914293766021729,
118
  "learning_rate": 0.0003583495415393165,
119
- "loss": 0.1303,
120
  "step": 4500
121
  },
122
  {
123
  "epoch": 1.3892747985551543,
124
- "grad_norm": 0.39365154504776,
125
- "learning_rate": 0.00035371862554413265,
126
- "loss": 0.1269,
127
  "step": 5000
128
  },
129
  {
130
  "epoch": 1.4003889969435954,
131
- "eval_loss": 0.14090509712696075,
132
- "eval_runtime": 16.4413,
133
- "eval_samples_per_second": 30.411,
134
- "eval_steps_per_second": 3.832,
135
  "step": 5040
136
  },
137
  {
138
  "epoch": 1.5282022784106695,
139
- "grad_norm": 0.5620034337043762,
140
  "learning_rate": 0.00034909697138093914,
141
- "loss": 0.1263,
142
  "step": 5500
143
  },
144
  {
145
  "epoch": 1.6004445679355377,
146
- "eval_loss": 0.13119302690029144,
147
- "eval_runtime": 16.5618,
148
- "eval_samples_per_second": 30.19,
149
- "eval_steps_per_second": 3.804,
150
  "step": 5760
151
  },
152
  {
153
  "epoch": 1.667129758266185,
154
- "grad_norm": 1.5726152658462524,
155
  "learning_rate": 0.0003444660553857553,
156
- "loss": 0.1258,
157
  "step": 6000
158
  },
159
  {
160
  "epoch": 1.8005001389274797,
161
- "eval_loss": 0.12919241189956665,
162
- "eval_runtime": 16.4031,
163
- "eval_samples_per_second": 30.482,
164
- "eval_steps_per_second": 3.841,
165
  "step": 6480
166
  },
167
  {
168
  "epoch": 1.8060572381217006,
169
- "grad_norm": 0.9997938871383667,
170
  "learning_rate": 0.0003398351393905715,
171
- "loss": 0.1212,
172
  "step": 6500
173
  },
174
  {
175
  "epoch": 1.9449847179772157,
176
- "grad_norm": 0.4134954810142517,
177
  "learning_rate": 0.00033520422339538766,
178
- "loss": 0.1201,
179
  "step": 7000
180
  },
181
  {
182
  "epoch": 2.000555709919422,
183
- "eval_loss": 0.12256824225187302,
184
- "eval_runtime": 16.8158,
185
- "eval_samples_per_second": 29.734,
186
- "eval_steps_per_second": 3.746,
187
  "step": 7200
188
  },
189
  {
190
  "epoch": 2.0839121978327313,
191
- "grad_norm": 0.40068477392196655,
192
  "learning_rate": 0.0003305733074002038,
193
- "loss": 0.108,
194
  "step": 7500
195
  },
196
  {
197
  "epoch": 2.2006112809113643,
198
- "eval_loss": 0.12798596918582916,
199
- "eval_runtime": 16.5301,
200
- "eval_samples_per_second": 30.248,
201
- "eval_steps_per_second": 3.811,
202
  "step": 7920
203
  },
204
  {
205
  "epoch": 2.222839677688247,
206
- "grad_norm": 0.39021220803260803,
207
  "learning_rate": 0.0003259423914050199,
208
- "loss": 0.1061,
209
  "step": 8000
210
  },
211
  {
212
  "epoch": 2.361767157543762,
213
- "grad_norm": 0.4577147960662842,
214
- "learning_rate": 0.0003213114754098361,
215
- "loss": 0.1055,
216
  "step": 8500
217
  },
218
  {
219
  "epoch": 2.4006668519033063,
220
- "eval_loss": 0.1270737648010254,
221
- "eval_runtime": 16.5227,
222
- "eval_samples_per_second": 30.261,
223
- "eval_steps_per_second": 3.813,
224
  "step": 8640
225
  },
226
  {
227
  "epoch": 2.5006946373992776,
228
- "grad_norm": 0.5035166144371033,
229
- "learning_rate": 0.00031668055941465225,
230
- "loss": 0.1033,
231
  "step": 9000
232
  },
233
  {
234
  "epoch": 2.600722422895249,
235
- "eval_loss": 0.12219922989606857,
236
- "eval_runtime": 16.637,
237
- "eval_samples_per_second": 30.054,
238
- "eval_steps_per_second": 3.787,
239
  "step": 9360
240
  },
241
  {
242
  "epoch": 2.639622117254793,
243
- "grad_norm": 0.6268121004104614,
244
- "learning_rate": 0.00031205890525145874,
245
- "loss": 0.1026,
246
  "step": 9500
247
  },
248
  {
249
  "epoch": 2.7785495971103087,
250
- "grad_norm": 0.37843775749206543,
251
- "learning_rate": 0.0003074279892562749,
252
- "loss": 0.1042,
253
  "step": 10000
254
  },
255
  {
256
  "epoch": 2.800777993887191,
257
- "eval_loss": 0.11693067103624344,
258
- "eval_runtime": 16.4326,
259
- "eval_samples_per_second": 30.427,
260
- "eval_steps_per_second": 3.834,
261
  "step": 10080
262
  },
263
  {
264
  "epoch": 2.917477076965824,
265
- "grad_norm": 0.47878336906433105,
266
- "learning_rate": 0.00030279707326109104,
267
- "loss": 0.1032,
268
  "step": 10500
269
  },
270
  {
271
  "epoch": 3.000833564879133,
272
- "eval_loss": 0.11544420570135117,
273
- "eval_runtime": 16.6741,
274
- "eval_samples_per_second": 29.987,
275
- "eval_steps_per_second": 3.778,
276
  "step": 10800
277
  },
278
  {
279
  "epoch": 3.0564045568213394,
280
- "grad_norm": 0.6917030215263367,
281
- "learning_rate": 0.0002981661572659072,
282
- "loss": 0.0954,
283
  "step": 11000
284
  },
285
  {
286
  "epoch": 3.1953320366768545,
287
- "grad_norm": 0.35634613037109375,
288
- "learning_rate": 0.0002935352412707234,
289
- "loss": 0.0874,
290
  "step": 11500
291
  },
292
  {
293
  "epoch": 3.2008891358710754,
294
- "eval_loss": 0.1193525642156601,
295
- "eval_runtime": 16.401,
296
- "eval_samples_per_second": 30.486,
297
- "eval_steps_per_second": 3.841,
298
  "step": 11520
299
  },
300
  {
301
  "epoch": 3.33425951653237,
302
- "grad_norm": 0.4005107283592224,
303
- "learning_rate": 0.0002889043252755395,
304
- "loss": 0.0902,
305
  "step": 12000
306
  },
307
  {
308
  "epoch": 3.4009447068630174,
309
- "eval_loss": 0.11315633356571198,
310
- "eval_runtime": 16.525,
311
- "eval_samples_per_second": 30.257,
312
- "eval_steps_per_second": 3.812,
313
  "step": 12240
314
  },
315
  {
316
  "epoch": 3.4731869963878856,
317
- "grad_norm": 0.348311185836792,
318
- "learning_rate": 0.0002842734092803557,
319
- "loss": 0.0891,
320
  "step": 12500
321
  },
322
  {
323
  "epoch": 3.6010002778549595,
324
- "eval_loss": 0.11703498661518097,
325
- "eval_runtime": 16.4464,
326
- "eval_samples_per_second": 30.402,
327
- "eval_steps_per_second": 3.831,
328
  "step": 12960
329
  },
330
  {
331
  "epoch": 3.612114476243401,
332
- "grad_norm": 0.3483453094959259,
333
- "learning_rate": 0.0002796424932851718,
334
- "loss": 0.0892,
335
  "step": 13000
336
  },
337
  {
338
  "epoch": 3.7510419560989163,
339
- "grad_norm": 0.8891735672950745,
340
- "learning_rate": 0.00027502083912197834,
341
- "loss": 0.0878,
342
  "step": 13500
343
  },
344
  {
345
  "epoch": 3.801055848846902,
346
- "eval_loss": 0.11358808726072311,
347
- "eval_runtime": 16.5506,
348
- "eval_samples_per_second": 30.21,
349
- "eval_steps_per_second": 3.807,
350
  "step": 13680
351
  },
352
  {
353
  "epoch": 3.889969435954432,
354
- "grad_norm": 2.020261764526367,
355
- "learning_rate": 0.00027038992312679446,
356
- "loss": 0.088,
357
  "step": 14000
358
  },
359
  {
360
  "epoch": 4.001111419838844,
361
- "eval_loss": 0.11110712587833405,
362
- "eval_runtime": 16.9629,
363
- "eval_samples_per_second": 29.476,
364
- "eval_steps_per_second": 3.714,
365
  "step": 14400
366
  },
367
  {
368
  "epoch": 4.0288969158099475,
369
- "grad_norm": 0.4606301486492157,
370
- "learning_rate": 0.00026575900713161064,
371
- "loss": 0.0848,
372
  "step": 14500
373
  },
374
  {
375
  "epoch": 4.167824395665463,
376
- "grad_norm": 0.5307084321975708,
377
- "learning_rate": 0.0002611280911364268,
378
- "loss": 0.0754,
379
  "step": 15000
380
  },
381
  {
382
  "epoch": 4.201166990830786,
383
- "eval_loss": 0.11462360620498657,
384
- "eval_runtime": 16.4785,
385
- "eval_samples_per_second": 30.343,
386
- "eval_steps_per_second": 3.823,
387
  "step": 15120
388
  },
389
  {
390
  "epoch": 4.306751875520978,
391
- "grad_norm": 0.45674943923950195,
392
- "learning_rate": 0.000256497175141243,
393
- "loss": 0.076,
394
  "step": 15500
395
  },
396
  {
397
  "epoch": 4.4012225618227285,
398
- "eval_loss": 0.11145643889904022,
399
- "eval_runtime": 16.4715,
400
- "eval_samples_per_second": 30.356,
401
- "eval_steps_per_second": 3.825,
402
  "step": 15840
403
  },
404
  {
405
  "epoch": 4.445679355376494,
406
- "grad_norm": 0.3333616554737091,
407
- "learning_rate": 0.00025186625914605905,
408
- "loss": 0.0768,
409
  "step": 16000
410
  },
411
  {
412
  "epoch": 4.584606835232009,
413
- "grad_norm": 0.3716529905796051,
414
  "learning_rate": 0.0002472446049828656,
415
- "loss": 0.0762,
416
  "step": 16500
417
  },
418
  {
419
  "epoch": 4.601278132814671,
420
- "eval_loss": 0.1125354915857315,
421
- "eval_runtime": 16.4547,
422
- "eval_samples_per_second": 30.386,
423
- "eval_steps_per_second": 3.829,
424
  "step": 16560
425
  },
426
  {
427
  "epoch": 4.723534315087524,
428
- "grad_norm": 0.5058280229568481,
429
  "learning_rate": 0.00024261368898768177,
430
- "loss": 0.0771,
431
  "step": 17000
432
  },
433
  {
434
  "epoch": 4.801333703806613,
435
- "eval_loss": 0.10654650628566742,
436
- "eval_runtime": 16.5091,
437
- "eval_samples_per_second": 30.286,
438
- "eval_steps_per_second": 3.816,
439
  "step": 17280
440
  },
441
  {
442
  "epoch": 4.86246179494304,
443
- "grad_norm": 0.37022796273231506,
444
  "learning_rate": 0.0002379920348244883,
445
- "loss": 0.0763,
446
  "step": 17500
447
  },
448
  {
449
  "epoch": 5.001389274798555,
450
- "grad_norm": 0.3737078607082367,
451
  "learning_rate": 0.00023336111882930443,
452
- "loss": 0.0749,
453
  "step": 18000
454
  },
455
  {
456
  "epoch": 5.001389274798555,
457
- "eval_loss": 0.10845010727643967,
458
- "eval_runtime": 16.6489,
459
- "eval_samples_per_second": 30.032,
460
- "eval_steps_per_second": 3.784,
461
  "step": 18000
462
  },
463
  {
464
  "epoch": 5.14031675465407,
465
- "grad_norm": 0.3841913640499115,
466
  "learning_rate": 0.00022873020283412058,
467
- "loss": 0.0642,
468
  "step": 18500
469
  },
470
  {
471
  "epoch": 5.201444845790498,
472
- "eval_loss": 0.11235029250383377,
473
- "eval_runtime": 16.682,
474
- "eval_samples_per_second": 29.972,
475
- "eval_steps_per_second": 3.777,
476
  "step": 18720
477
  },
478
  {
479
  "epoch": 5.279244234509586,
480
- "grad_norm": 0.31463760137557983,
481
  "learning_rate": 0.00022409928683893675,
482
- "loss": 0.0649,
483
  "step": 19000
484
  },
485
  {
486
  "epoch": 5.401500416782439,
487
- "eval_loss": 0.1092103123664856,
488
- "eval_runtime": 16.4715,
489
- "eval_samples_per_second": 30.355,
490
- "eval_steps_per_second": 3.825,
491
  "step": 19440
492
  },
493
  {
494
  "epoch": 5.418171714365101,
495
- "grad_norm": 0.30956658720970154,
496
  "learning_rate": 0.0002194683708437529,
497
- "loss": 0.0651,
498
  "step": 19500
499
  },
500
  {
501
  "epoch": 5.5570991942206165,
502
- "grad_norm": 0.42891213297843933,
503
  "learning_rate": 0.00021483745484856908,
504
- "loss": 0.0656,
505
  "step": 20000
506
  },
507
  {
508
  "epoch": 5.601555987774382,
509
- "eval_loss": 0.1072624996304512,
510
- "eval_runtime": 16.504,
511
- "eval_samples_per_second": 30.296,
512
- "eval_steps_per_second": 3.817,
513
  "step": 20160
514
  },
515
  {
516
  "epoch": 5.6960266740761325,
517
- "grad_norm": 0.6161347031593323,
518
- "learning_rate": 0.00021021580068537556,
519
- "loss": 0.0673,
520
  "step": 20500
521
  },
522
  {
523
  "epoch": 5.801611558766324,
524
- "eval_loss": 0.10548041760921478,
525
- "eval_runtime": 16.5903,
526
- "eval_samples_per_second": 30.138,
527
- "eval_steps_per_second": 3.797,
528
  "step": 20880
529
  },
530
  {
531
  "epoch": 5.834954153931648,
532
- "grad_norm": 0.6631760597229004,
533
- "learning_rate": 0.0002055848846901917,
534
- "loss": 0.0654,
535
  "step": 21000
536
  },
537
  {
538
  "epoch": 5.973881633787163,
539
- "grad_norm": 0.37077194452285767,
540
  "learning_rate": 0.0002009539686950079,
541
- "loss": 0.065,
542
  "step": 21500
543
  },
544
  {
545
  "epoch": 6.001667129758266,
546
- "eval_loss": 0.10457777976989746,
547
- "eval_runtime": 16.8365,
548
- "eval_samples_per_second": 29.697,
549
- "eval_steps_per_second": 3.742,
550
  "step": 21600
551
  },
552
  {
553
  "epoch": 6.112809113642679,
554
- "grad_norm": 0.4291210174560547,
555
  "learning_rate": 0.00019632305269982403,
556
- "loss": 0.0565,
557
  "step": 22000
558
  },
559
  {
560
  "epoch": 6.201722700750208,
561
- "eval_loss": 0.11106568574905396,
562
- "eval_runtime": 16.6145,
563
- "eval_samples_per_second": 30.094,
564
- "eval_steps_per_second": 3.792,
565
  "step": 22320
566
  },
567
  {
568
  "epoch": 6.251736593498194,
569
- "grad_norm": 0.5859522223472595,
570
  "learning_rate": 0.00019169213670464018,
571
- "loss": 0.0551,
572
  "step": 22500
573
  },
574
  {
575
  "epoch": 6.390664073353709,
576
- "grad_norm": 0.6046123504638672,
577
  "learning_rate": 0.00018706122070945633,
578
- "loss": 0.0566,
579
  "step": 23000
580
  },
581
  {
582
  "epoch": 6.401778271742151,
583
- "eval_loss": 0.11063603311777115,
584
- "eval_runtime": 16.5709,
585
- "eval_samples_per_second": 30.173,
586
- "eval_steps_per_second": 3.802,
587
  "step": 23040
588
  },
589
  {
590
  "epoch": 6.529591553209225,
591
- "grad_norm": 0.5555618405342102,
592
  "learning_rate": 0.0001824303047142725,
593
- "loss": 0.0567,
594
  "step": 23500
595
  },
596
  {
597
  "epoch": 6.601833842734093,
598
- "eval_loss": 0.11322695761919022,
599
- "eval_runtime": 16.5489,
600
- "eval_samples_per_second": 30.213,
601
- "eval_steps_per_second": 3.807,
602
  "step": 23760
603
  },
604
  {
605
  "epoch": 6.66851903306474,
606
- "grad_norm": 0.641233503818512,
607
  "learning_rate": 0.00017779938871908863,
608
- "loss": 0.057,
609
  "step": 24000
610
  },
611
  {
612
  "epoch": 6.801889413726035,
613
- "eval_loss": 0.10657580196857452,
614
- "eval_runtime": 16.5793,
615
- "eval_samples_per_second": 30.158,
616
- "eval_steps_per_second": 3.8,
617
  "step": 24480
618
  },
619
  {
620
  "epoch": 6.807446512920255,
621
- "grad_norm": 0.36209815740585327,
622
  "learning_rate": 0.0001731684727239048,
623
- "loss": 0.0574,
624
  "step": 24500
625
  },
626
  {
627
  "epoch": 6.946373992775771,
628
- "grad_norm": 0.49614134430885315,
629
  "learning_rate": 0.00016853755672872095,
630
- "loss": 0.0564,
631
  "step": 25000
632
  },
633
  {
634
  "epoch": 7.001944984717977,
635
- "eval_loss": 0.10740524530410767,
636
- "eval_runtime": 16.6216,
637
- "eval_samples_per_second": 30.081,
638
- "eval_steps_per_second": 3.79,
639
  "step": 25200
640
  },
641
  {
642
  "epoch": 7.085301472631286,
643
- "grad_norm": 0.518583357334137,
644
- "learning_rate": 0.0001639066407335371,
645
- "loss": 0.049,
646
  "step": 25500
647
  },
648
  {
649
  "epoch": 7.20200055570992,
650
- "eval_loss": 0.11705406755208969,
651
- "eval_runtime": 16.5295,
652
- "eval_samples_per_second": 30.249,
653
- "eval_steps_per_second": 3.811,
654
  "step": 25920
655
  },
656
  {
657
  "epoch": 7.2242289524868015,
658
- "grad_norm": 0.31886982917785645,
659
  "learning_rate": 0.00015928498657034364,
660
- "loss": 0.0469,
661
  "step": 26000
662
  },
663
  {
664
  "epoch": 7.3631564323423175,
665
- "grad_norm": 0.28966373205184937,
666
  "learning_rate": 0.00015465407057515976,
667
- "loss": 0.0478,
668
  "step": 26500
669
  },
670
  {
671
  "epoch": 7.402056126701861,
672
- "eval_loss": 0.10964089632034302,
673
- "eval_runtime": 16.5451,
674
- "eval_samples_per_second": 30.22,
675
- "eval_steps_per_second": 3.808,
676
  "step": 26640
677
  },
678
  {
679
  "epoch": 7.502083912197833,
680
- "grad_norm": 0.36664408445358276,
681
- "learning_rate": 0.0001500324164119663,
682
- "loss": 0.0481,
683
  "step": 27000
684
  },
685
  {
686
  "epoch": 7.602111697693804,
687
- "eval_loss": 0.11513197422027588,
688
- "eval_runtime": 16.6147,
689
- "eval_samples_per_second": 30.094,
690
- "eval_steps_per_second": 3.792,
691
  "step": 27360
692
  },
693
  {
694
  "epoch": 7.641011392053348,
695
- "grad_norm": 0.32852914929389954,
696
  "learning_rate": 0.00014540150041678245,
697
- "loss": 0.048,
698
  "step": 27500
699
  },
700
  {
701
  "epoch": 7.779938871908864,
702
- "grad_norm": 0.5736936330795288,
703
  "learning_rate": 0.0001407705844215986,
704
- "loss": 0.0483,
705
  "step": 28000
706
  },
707
  {
708
  "epoch": 7.802167268685746,
709
- "eval_loss": 0.10632374882698059,
710
- "eval_runtime": 16.6173,
711
- "eval_samples_per_second": 30.089,
712
- "eval_steps_per_second": 3.791,
713
  "step": 28080
714
  },
715
  {
716
  "epoch": 7.918866351764379,
717
- "grad_norm": 0.335509717464447,
718
  "learning_rate": 0.00013613966842641474,
719
- "loss": 0.0487,
720
  "step": 28500
721
  },
722
  {
723
  "epoch": 8.002222839677689,
724
- "eval_loss": 0.11238289624452591,
725
- "eval_runtime": 16.6415,
726
- "eval_samples_per_second": 30.045,
727
- "eval_steps_per_second": 3.786,
728
  "step": 28800
729
  },
730
  {
731
  "epoch": 8.057793831619895,
732
- "grad_norm": 0.8646371364593506,
733
- "learning_rate": 0.00013150875243123092,
734
- "loss": 0.0442,
735
  "step": 29000
736
  },
737
  {
738
  "epoch": 8.19672131147541,
739
- "grad_norm": 0.5025931000709534,
740
- "learning_rate": 0.00012687783643604704,
741
- "loss": 0.0411,
742
  "step": 29500
743
  },
744
  {
745
  "epoch": 8.20227841066963,
746
- "eval_loss": 0.1167159229516983,
747
- "eval_runtime": 16.2769,
748
- "eval_samples_per_second": 30.718,
749
- "eval_steps_per_second": 3.871,
750
  "step": 29520
751
  },
752
  {
753
  "epoch": 8.335648791330925,
754
- "grad_norm": 0.3653465509414673,
755
  "learning_rate": 0.00012225618227285358,
756
- "loss": 0.0398,
757
  "step": 30000
758
  },
759
  {
760
  "epoch": 8.402333981661572,
761
- "eval_loss": 0.11505118012428284,
762
- "eval_runtime": 16.3723,
763
- "eval_samples_per_second": 30.539,
764
- "eval_steps_per_second": 3.848,
765
  "step": 30240
766
  },
767
  {
768
  "epoch": 8.474576271186441,
769
- "grad_norm": 0.4740363359451294,
770
  "learning_rate": 0.00011762526627766973,
771
- "loss": 0.0401,
772
  "step": 30500
773
  },
774
  {
775
  "epoch": 8.602389552653515,
776
- "eval_loss": 0.11725138872861862,
777
- "eval_runtime": 16.2874,
778
- "eval_samples_per_second": 30.699,
779
- "eval_steps_per_second": 3.868,
780
  "step": 30960
781
  },
782
  {
783
  "epoch": 8.613503751041955,
784
- "grad_norm": 0.4850703775882721,
785
- "learning_rate": 0.00011299435028248589,
786
- "loss": 0.0411,
787
  "step": 31000
788
  },
789
  {
790
  "epoch": 8.752431230897471,
791
- "grad_norm": 0.4242146611213684,
792
- "learning_rate": 0.00010836343428730202,
793
- "loss": 0.0411,
794
  "step": 31500
795
  },
796
  {
797
  "epoch": 8.802445123645457,
798
- "eval_loss": 0.11391962319612503,
799
- "eval_runtime": 16.5683,
800
- "eval_samples_per_second": 30.178,
801
- "eval_steps_per_second": 3.802,
802
  "step": 31680
803
  },
804
  {
805
  "epoch": 8.891358710752987,
806
- "grad_norm": 0.44564756751060486,
807
- "learning_rate": 0.00010373251829211818,
808
- "loss": 0.0399,
809
  "step": 32000
810
  },
811
  {
812
  "epoch": 9.0025006946374,
813
- "eval_loss": 0.11336923390626907,
814
- "eval_runtime": 16.3871,
815
- "eval_samples_per_second": 30.512,
816
- "eval_steps_per_second": 3.844,
817
  "step": 32400
818
  },
819
  {
820
  "epoch": 9.030286190608502,
821
- "grad_norm": 0.33736202120780945,
822
  "learning_rate": 9.911086412892471e-05,
823
- "loss": 0.0385,
824
  "step": 32500
825
  },
826
  {
827
  "epoch": 9.169213670464018,
828
- "grad_norm": 0.47985410690307617,
829
  "learning_rate": 9.447994813374086e-05,
830
- "loss": 0.0333,
831
  "step": 33000
832
  },
833
  {
834
  "epoch": 9.202556265629342,
835
- "eval_loss": 0.12226500362157822,
836
- "eval_runtime": 16.4545,
837
- "eval_samples_per_second": 30.387,
838
- "eval_steps_per_second": 3.829,
839
  "step": 33120
840
  },
841
  {
842
  "epoch": 9.308141150319534,
843
- "grad_norm": 0.32917848229408264,
844
  "learning_rate": 8.9849032138557e-05,
845
- "loss": 0.034,
846
  "step": 33500
847
  },
848
  {
849
  "epoch": 9.402611836621285,
850
- "eval_loss": 0.11844275146722794,
851
- "eval_runtime": 16.455,
852
- "eval_samples_per_second": 30.386,
853
- "eval_steps_per_second": 3.829,
854
  "step": 33840
855
  },
856
  {
857
  "epoch": 9.447068630175048,
858
- "grad_norm": 0.3224587142467499,
859
  "learning_rate": 8.521811614337317e-05,
860
- "loss": 0.0338,
861
  "step": 34000
862
  },
863
  {
864
  "epoch": 9.585996110030564,
865
- "grad_norm": 0.32778987288475037,
866
- "learning_rate": 8.058720014818932e-05,
867
- "loss": 0.0345,
868
  "step": 34500
869
  },
870
  {
871
  "epoch": 9.602667407613225,
872
- "eval_loss": 0.1248505637049675,
873
- "eval_runtime": 16.2623,
874
- "eval_samples_per_second": 30.746,
875
- "eval_steps_per_second": 3.874,
876
  "step": 34560
877
  },
878
  {
879
  "epoch": 9.72492358988608,
880
- "grad_norm": 1.4036211967468262,
881
  "learning_rate": 7.596554598499583e-05,
882
- "loss": 0.0352,
883
  "step": 35000
884
  },
885
  {
886
  "epoch": 9.802722978605168,
887
- "eval_loss": 0.12011002004146576,
888
- "eval_runtime": 16.387,
889
- "eval_samples_per_second": 30.512,
890
- "eval_steps_per_second": 3.845,
891
  "step": 35280
892
  },
893
  {
894
  "epoch": 9.863851069741594,
895
- "grad_norm": 0.35703155398368835,
896
  "learning_rate": 7.133462998981199e-05,
897
- "loss": 0.0353,
898
  "step": 35500
899
  },
900
  {
901
  "epoch": 10.00277854959711,
902
- "grad_norm": 0.1466842144727707,
903
- "learning_rate": 6.670371399462814e-05,
904
- "loss": 0.0338,
905
  "step": 36000
906
  },
907
  {
908
  "epoch": 10.00277854959711,
909
- "eval_loss": 0.1183161661028862,
910
- "eval_runtime": 16.5682,
911
- "eval_samples_per_second": 30.178,
912
- "eval_steps_per_second": 3.802,
913
  "step": 36000
914
  },
915
  {
916
  "epoch": 10.141706029452626,
917
- "grad_norm": 0.2284984439611435,
918
- "learning_rate": 6.207279799944429e-05,
919
- "loss": 0.0288,
920
  "step": 36500
921
  },
922
  {
923
  "epoch": 10.202834120589053,
924
- "eval_loss": 0.12911224365234375,
925
- "eval_runtime": 16.44,
926
- "eval_samples_per_second": 30.414,
927
- "eval_steps_per_second": 3.832,
928
  "step": 36720
929
  },
930
  {
931
  "epoch": 10.28063350930814,
932
- "grad_norm": 0.28028494119644165,
933
- "learning_rate": 5.744188200426045e-05,
934
- "loss": 0.0283,
935
  "step": 37000
936
  },
937
  {
938
  "epoch": 10.402889691580995,
939
- "eval_loss": 0.13118575513362885,
940
- "eval_runtime": 16.5118,
941
- "eval_samples_per_second": 30.281,
942
- "eval_steps_per_second": 3.815,
943
  "step": 37440
944
  },
945
  {
946
  "epoch": 10.419560989163656,
947
- "grad_norm": 0.425870805978775,
948
- "learning_rate": 5.28109660090766e-05,
949
- "loss": 0.0296,
950
  "step": 37500
951
  },
952
  {
953
  "epoch": 10.558488469019172,
954
- "grad_norm": 0.455585777759552,
955
- "learning_rate": 4.818005001389275e-05,
956
- "loss": 0.0288,
957
  "step": 38000
958
  },
959
  {
960
  "epoch": 10.602945262572938,
961
- "eval_loss": 0.1287374049425125,
962
- "eval_runtime": 16.5018,
963
- "eval_samples_per_second": 30.3,
964
- "eval_steps_per_second": 3.818,
965
  "step": 38160
966
  },
967
  {
968
  "epoch": 10.697415948874687,
969
- "grad_norm": 0.2750612497329712,
970
- "learning_rate": 4.3549134018708904e-05,
971
- "loss": 0.0289,
972
  "step": 38500
973
  },
974
  {
975
  "epoch": 10.803000833564878,
976
- "eval_loss": 0.12641233205795288,
977
- "eval_runtime": 16.5216,
978
- "eval_samples_per_second": 30.263,
979
- "eval_steps_per_second": 3.813,
980
  "step": 38880
981
  },
982
  {
983
  "epoch": 10.836343428730203,
984
- "grad_norm": 0.37989404797554016,
985
- "learning_rate": 3.891821802352506e-05,
986
- "loss": 0.0296,
987
  "step": 39000
988
  },
989
  {
990
  "epoch": 10.975270908585719,
991
- "grad_norm": 0.3818288743495941,
992
- "learning_rate": 3.4287302028341206e-05,
993
- "loss": 0.0299,
994
  "step": 39500
995
  }
996
  ],
@@ -1011,7 +1011,7 @@
1011
  "attributes": {}
1012
  }
1013
  },
1014
- "total_flos": 7.40051768405341e+18,
1015
  "train_batch_size": 4,
1016
  "trial_name": null,
1017
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.13892747985551543,
13
+ "grad_norm": 0.4299773573875427,
14
+ "learning_rate": 0.0003953968695007873,
15
+ "loss": 0.2967,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.2000555709919422,
20
+ "eval_loss": 0.1983761489391327,
21
+ "eval_runtime": 16.4512,
22
+ "eval_samples_per_second": 30.393,
23
+ "eval_steps_per_second": 3.83,
24
  "step": 720
25
  },
26
  {
27
  "epoch": 0.27785495971103086,
28
+ "grad_norm": 0.5706949830055237,
29
+ "learning_rate": 0.0003907659535056034,
30
+ "loss": 0.1959,
31
  "step": 1000
32
  },
33
  {
34
  "epoch": 0.4001111419838844,
35
+ "eval_loss": 0.17383727431297302,
36
+ "eval_runtime": 16.4842,
37
+ "eval_samples_per_second": 30.332,
38
  "eval_steps_per_second": 3.822,
39
  "step": 1440
40
  },
41
  {
42
  "epoch": 0.41678243956654626,
43
+ "grad_norm": 0.5392869710922241,
44
+ "learning_rate": 0.0003861350375104196,
45
+ "loss": 0.1723,
46
  "step": 1500
47
  },
48
  {
49
  "epoch": 0.5557099194220617,
50
+ "grad_norm": 0.5005412697792053,
51
+ "learning_rate": 0.0003815041215152357,
52
+ "loss": 0.1636,
53
  "step": 2000
54
  },
55
  {
56
  "epoch": 0.6001667129758266,
57
+ "eval_loss": 0.16229495406150818,
58
+ "eval_runtime": 16.4963,
59
+ "eval_samples_per_second": 30.31,
60
+ "eval_steps_per_second": 3.819,
61
  "step": 2160
62
  },
63
  {
64
  "epoch": 0.6946373992775771,
65
+ "grad_norm": 0.370914489030838,
66
+ "learning_rate": 0.0003768732055200519,
67
+ "loss": 0.1539,
68
  "step": 2500
69
  },
70
  {
71
  "epoch": 0.8002222839677688,
72
+ "eval_loss": 0.1537286937236786,
73
+ "eval_runtime": 16.4662,
74
+ "eval_samples_per_second": 30.365,
75
+ "eval_steps_per_second": 3.826,
76
  "step": 2880
77
  },
78
  {
79
  "epoch": 0.8335648791330925,
80
+ "grad_norm": 0.41378697752952576,
81
+ "learning_rate": 0.000372242289524868,
82
+ "loss": 0.1445,
83
  "step": 3000
84
  },
85
  {
86
  "epoch": 0.972492358988608,
87
+ "grad_norm": 0.4000154137611389,
88
  "learning_rate": 0.0003676113735296842,
89
+ "loss": 0.1384,
90
  "step": 3500
91
  },
92
  {
93
  "epoch": 1.000277854959711,
94
+ "eval_loss": 0.15115150809288025,
95
+ "eval_runtime": 16.8021,
96
+ "eval_samples_per_second": 29.758,
97
+ "eval_steps_per_second": 3.75,
98
  "step": 3600
99
  },
100
  {
101
  "epoch": 1.1114198388441234,
102
+ "grad_norm": 0.4774770140647888,
103
  "learning_rate": 0.00036298045753450036,
104
+ "loss": 0.1317,
105
  "step": 4000
106
  },
107
  {
108
  "epoch": 1.2003334259516532,
109
+ "eval_loss": 0.14330309629440308,
110
+ "eval_runtime": 16.4814,
111
+ "eval_samples_per_second": 30.337,
112
+ "eval_steps_per_second": 3.822,
113
  "step": 4320
114
  },
115
  {
116
  "epoch": 1.2503473186996388,
117
+ "grad_norm": 0.4988621175289154,
118
  "learning_rate": 0.0003583495415393165,
119
+ "loss": 0.1283,
120
  "step": 4500
121
  },
122
  {
123
  "epoch": 1.3892747985551543,
124
+ "grad_norm": 0.5688238739967346,
125
+ "learning_rate": 0.000353727887376123,
126
+ "loss": 0.1237,
127
  "step": 5000
128
  },
129
  {
130
  "epoch": 1.4003889969435954,
131
+ "eval_loss": 0.13683326542377472,
132
+ "eval_runtime": 16.5065,
133
+ "eval_samples_per_second": 30.291,
134
+ "eval_steps_per_second": 3.817,
135
  "step": 5040
136
  },
137
  {
138
  "epoch": 1.5282022784106695,
139
+ "grad_norm": 0.5433902740478516,
140
  "learning_rate": 0.00034909697138093914,
141
+ "loss": 0.1174,
142
  "step": 5500
143
  },
144
  {
145
  "epoch": 1.6004445679355377,
146
+ "eval_loss": 0.13955478370189667,
147
+ "eval_runtime": 16.4989,
148
+ "eval_samples_per_second": 30.305,
149
+ "eval_steps_per_second": 3.818,
150
  "step": 5760
151
  },
152
  {
153
  "epoch": 1.667129758266185,
154
+ "grad_norm": 0.5300644040107727,
155
  "learning_rate": 0.0003444660553857553,
156
+ "loss": 0.1178,
157
  "step": 6000
158
  },
159
  {
160
  "epoch": 1.8005001389274797,
161
+ "eval_loss": 0.13158197700977325,
162
+ "eval_runtime": 16.533,
163
+ "eval_samples_per_second": 30.243,
164
+ "eval_steps_per_second": 3.811,
165
  "step": 6480
166
  },
167
  {
168
  "epoch": 1.8060572381217006,
169
+ "grad_norm": 0.3573897182941437,
170
  "learning_rate": 0.0003398351393905715,
171
+ "loss": 0.113,
172
  "step": 6500
173
  },
174
  {
175
  "epoch": 1.9449847179772157,
176
+ "grad_norm": 0.5019258260726929,
177
  "learning_rate": 0.00033520422339538766,
178
+ "loss": 0.1134,
179
  "step": 7000
180
  },
181
  {
182
  "epoch": 2.000555709919422,
183
+ "eval_loss": 0.13124322891235352,
184
+ "eval_runtime": 16.0696,
185
+ "eval_samples_per_second": 31.115,
186
+ "eval_steps_per_second": 3.92,
187
  "step": 7200
188
  },
189
  {
190
  "epoch": 2.0839121978327313,
191
+ "grad_norm": 0.4890081286430359,
192
  "learning_rate": 0.0003305733074002038,
193
+ "loss": 0.1075,
194
  "step": 7500
195
  },
196
  {
197
  "epoch": 2.2006112809113643,
198
+ "eval_loss": 0.12690122425556183,
199
+ "eval_runtime": 15.8697,
200
+ "eval_samples_per_second": 31.507,
201
+ "eval_steps_per_second": 3.97,
202
  "step": 7920
203
  },
204
  {
205
  "epoch": 2.222839677688247,
206
+ "grad_norm": 0.5411983132362366,
207
  "learning_rate": 0.0003259423914050199,
208
+ "loss": 0.104,
209
  "step": 8000
210
  },
211
  {
212
  "epoch": 2.361767157543762,
213
+ "grad_norm": 0.892245352268219,
214
+ "learning_rate": 0.00032132073724182645,
215
+ "loss": 0.1018,
216
  "step": 8500
217
  },
218
  {
219
  "epoch": 2.4006668519033063,
220
+ "eval_loss": 0.1253955215215683,
221
+ "eval_runtime": 15.9212,
222
+ "eval_samples_per_second": 31.405,
223
+ "eval_steps_per_second": 3.957,
224
  "step": 8640
225
  },
226
  {
227
  "epoch": 2.5006946373992776,
228
+ "grad_norm": 0.5154420137405396,
229
+ "learning_rate": 0.0003166898212466426,
230
+ "loss": 0.1018,
231
  "step": 9000
232
  },
233
  {
234
  "epoch": 2.600722422895249,
235
+ "eval_loss": 0.1270376443862915,
236
+ "eval_runtime": 15.8556,
237
+ "eval_samples_per_second": 31.535,
238
+ "eval_steps_per_second": 3.973,
239
  "step": 9360
240
  },
241
  {
242
  "epoch": 2.639622117254793,
243
+ "grad_norm": 0.4247698187828064,
244
+ "learning_rate": 0.0003120681670834491,
245
+ "loss": 0.0988,
246
  "step": 9500
247
  },
248
  {
249
  "epoch": 2.7785495971103087,
250
+ "grad_norm": 0.6174339652061462,
251
+ "learning_rate": 0.0003074372510882653,
252
+ "loss": 0.0931,
253
  "step": 10000
254
  },
255
  {
256
  "epoch": 2.800777993887191,
257
+ "eval_loss": 0.12492711842060089,
258
+ "eval_runtime": 15.9536,
259
+ "eval_samples_per_second": 31.341,
260
+ "eval_steps_per_second": 3.949,
261
  "step": 10080
262
  },
263
  {
264
  "epoch": 2.917477076965824,
265
+ "grad_norm": 0.3945905864238739,
266
+ "learning_rate": 0.0003028063350930814,
267
+ "loss": 0.0924,
268
  "step": 10500
269
  },
270
  {
271
  "epoch": 3.000833564879133,
272
+ "eval_loss": 0.12177152931690216,
273
+ "eval_runtime": 16.5123,
274
+ "eval_samples_per_second": 30.28,
275
+ "eval_steps_per_second": 3.815,
276
  "step": 10800
277
  },
278
  {
279
  "epoch": 3.0564045568213394,
280
+ "grad_norm": 0.4349508285522461,
281
+ "learning_rate": 0.0002981754190978976,
282
+ "loss": 0.0929,
283
  "step": 11000
284
  },
285
  {
286
  "epoch": 3.1953320366768545,
287
+ "grad_norm": 0.5195356011390686,
288
+ "learning_rate": 0.00029354450310271375,
289
+ "loss": 0.0897,
290
  "step": 11500
291
  },
292
  {
293
  "epoch": 3.2008891358710754,
294
+ "eval_loss": 0.12157219648361206,
295
+ "eval_runtime": 15.887,
296
+ "eval_samples_per_second": 31.472,
297
+ "eval_steps_per_second": 3.965,
298
  "step": 11520
299
  },
300
  {
301
  "epoch": 3.33425951653237,
302
+ "grad_norm": 0.38773760199546814,
303
+ "learning_rate": 0.0002889135871075299,
304
+ "loss": 0.0868,
305
  "step": 12000
306
  },
307
  {
308
  "epoch": 3.4009447068630174,
309
+ "eval_loss": 0.12406055629253387,
310
+ "eval_runtime": 15.9444,
311
+ "eval_samples_per_second": 31.359,
312
+ "eval_steps_per_second": 3.951,
313
  "step": 12240
314
  },
315
  {
316
  "epoch": 3.4731869963878856,
317
+ "grad_norm": 0.3054683804512024,
318
+ "learning_rate": 0.00028428267111234605,
319
+ "loss": 0.0865,
320
  "step": 12500
321
  },
322
  {
323
  "epoch": 3.6010002778549595,
324
+ "eval_loss": 0.11476034671068192,
325
+ "eval_runtime": 15.9006,
326
+ "eval_samples_per_second": 31.445,
327
+ "eval_steps_per_second": 3.962,
328
  "step": 12960
329
  },
330
  {
331
  "epoch": 3.612114476243401,
332
+ "grad_norm": 0.5311923623085022,
333
+ "learning_rate": 0.0002796610169491526,
334
+ "loss": 0.0845,
335
  "step": 13000
336
  },
337
  {
338
  "epoch": 3.7510419560989163,
339
+ "grad_norm": 0.7641647458076477,
340
+ "learning_rate": 0.0002750301009539687,
341
+ "loss": 0.084,
342
  "step": 13500
343
  },
344
  {
345
  "epoch": 3.801055848846902,
346
+ "eval_loss": 0.11587072908878326,
347
+ "eval_runtime": 15.933,
348
+ "eval_samples_per_second": 31.381,
349
+ "eval_steps_per_second": 3.954,
350
  "step": 13680
351
  },
352
  {
353
  "epoch": 3.889969435954432,
354
+ "grad_norm": 0.5842312574386597,
355
+ "learning_rate": 0.00027039918495878483,
356
+ "loss": 0.0815,
357
  "step": 14000
358
  },
359
  {
360
  "epoch": 4.001111419838844,
361
+ "eval_loss": 0.11761430650949478,
362
+ "eval_runtime": 16.0803,
363
+ "eval_samples_per_second": 31.094,
364
+ "eval_steps_per_second": 3.918,
365
  "step": 14400
366
  },
367
  {
368
  "epoch": 4.0288969158099475,
369
+ "grad_norm": 0.5182059407234192,
370
+ "learning_rate": 0.000265768268963601,
371
+ "loss": 0.0823,
372
  "step": 14500
373
  },
374
  {
375
  "epoch": 4.167824395665463,
376
+ "grad_norm": 0.3954576253890991,
377
+ "learning_rate": 0.0002611373529684172,
378
+ "loss": 0.0753,
379
  "step": 15000
380
  },
381
  {
382
  "epoch": 4.201166990830786,
383
+ "eval_loss": 0.11391445249319077,
384
+ "eval_runtime": 15.9483,
385
+ "eval_samples_per_second": 31.351,
386
+ "eval_steps_per_second": 3.95,
387
  "step": 15120
388
  },
389
  {
390
  "epoch": 4.306751875520978,
391
+ "grad_norm": 0.5974435210227966,
392
+ "learning_rate": 0.00025650643697323335,
393
+ "loss": 0.0762,
394
  "step": 15500
395
  },
396
  {
397
  "epoch": 4.4012225618227285,
398
+ "eval_loss": 0.11403658986091614,
399
+ "eval_runtime": 15.92,
400
+ "eval_samples_per_second": 31.407,
401
+ "eval_steps_per_second": 3.957,
402
  "step": 15840
403
  },
404
  {
405
  "epoch": 4.445679355376494,
406
+ "grad_norm": 0.4496535360813141,
407
+ "learning_rate": 0.0002518755209780495,
408
+ "loss": 0.0737,
409
  "step": 16000
410
  },
411
  {
412
  "epoch": 4.584606835232009,
413
+ "grad_norm": 0.5617558360099792,
414
  "learning_rate": 0.0002472446049828656,
415
+ "loss": 0.074,
416
  "step": 16500
417
  },
418
  {
419
  "epoch": 4.601278132814671,
420
+ "eval_loss": 0.11306341737508774,
421
+ "eval_runtime": 15.9244,
422
+ "eval_samples_per_second": 31.398,
423
+ "eval_steps_per_second": 3.956,
424
  "step": 16560
425
  },
426
  {
427
  "epoch": 4.723534315087524,
428
+ "grad_norm": 0.5999208092689514,
429
  "learning_rate": 0.00024261368898768177,
430
+ "loss": 0.0732,
431
  "step": 17000
432
  },
433
  {
434
  "epoch": 4.801333703806613,
435
+ "eval_loss": 0.11077062785625458,
436
+ "eval_runtime": 15.9311,
437
+ "eval_samples_per_second": 31.385,
438
+ "eval_steps_per_second": 3.955,
439
  "step": 17280
440
  },
441
  {
442
  "epoch": 4.86246179494304,
443
+ "grad_norm": 0.3961442708969116,
444
  "learning_rate": 0.0002379920348244883,
445
+ "loss": 0.0724,
446
  "step": 17500
447
  },
448
  {
449
  "epoch": 5.001389274798555,
450
+ "grad_norm": 0.507563054561615,
451
  "learning_rate": 0.00023336111882930443,
452
+ "loss": 0.0685,
453
  "step": 18000
454
  },
455
  {
456
  "epoch": 5.001389274798555,
457
+ "eval_loss": 0.11523561179637909,
458
+ "eval_runtime": 16.1329,
459
+ "eval_samples_per_second": 30.992,
460
+ "eval_steps_per_second": 3.905,
461
  "step": 18000
462
  },
463
  {
464
  "epoch": 5.14031675465407,
465
+ "grad_norm": 0.5651789307594299,
466
  "learning_rate": 0.00022873020283412058,
467
+ "loss": 0.0655,
468
  "step": 18500
469
  },
470
  {
471
  "epoch": 5.201444845790498,
472
+ "eval_loss": 0.11398093402385712,
473
+ "eval_runtime": 15.9337,
474
+ "eval_samples_per_second": 31.38,
475
+ "eval_steps_per_second": 3.954,
476
  "step": 18720
477
  },
478
  {
479
  "epoch": 5.279244234509586,
480
+ "grad_norm": 0.619132399559021,
481
  "learning_rate": 0.00022409928683893675,
482
+ "loss": 0.0664,
483
  "step": 19000
484
  },
485
  {
486
  "epoch": 5.401500416782439,
487
+ "eval_loss": 0.11203750967979431,
488
+ "eval_runtime": 15.8871,
489
+ "eval_samples_per_second": 31.472,
490
+ "eval_steps_per_second": 3.965,
491
  "step": 19440
492
  },
493
  {
494
  "epoch": 5.418171714365101,
495
+ "grad_norm": 0.4724760353565216,
496
  "learning_rate": 0.0002194683708437529,
497
+ "loss": 0.0636,
498
  "step": 19500
499
  },
500
  {
501
  "epoch": 5.5570991942206165,
502
+ "grad_norm": 0.5861866474151611,
503
  "learning_rate": 0.00021483745484856908,
504
+ "loss": 0.0648,
505
  "step": 20000
506
  },
507
  {
508
  "epoch": 5.601555987774382,
509
+ "eval_loss": 0.1131061241030693,
510
+ "eval_runtime": 15.952,
511
+ "eval_samples_per_second": 31.344,
512
+ "eval_steps_per_second": 3.949,
513
  "step": 20160
514
  },
515
  {
516
  "epoch": 5.6960266740761325,
517
+ "grad_norm": 0.5262423157691956,
518
+ "learning_rate": 0.0002102065388533852,
519
+ "loss": 0.063,
520
  "step": 20500
521
  },
522
  {
523
  "epoch": 5.801611558766324,
524
+ "eval_loss": 0.11248422414064407,
525
+ "eval_runtime": 15.9153,
526
+ "eval_samples_per_second": 31.416,
527
+ "eval_steps_per_second": 3.958,
528
  "step": 20880
529
  },
530
  {
531
  "epoch": 5.834954153931648,
532
+ "grad_norm": 0.6074294447898865,
533
+ "learning_rate": 0.00020557562285820135,
534
+ "loss": 0.0624,
535
  "step": 21000
536
  },
537
  {
538
  "epoch": 5.973881633787163,
539
+ "grad_norm": 0.5349674820899963,
540
  "learning_rate": 0.0002009539686950079,
541
+ "loss": 0.0609,
542
  "step": 21500
543
  },
544
  {
545
  "epoch": 6.001667129758266,
546
+ "eval_loss": 0.11405794322490692,
547
+ "eval_runtime": 16.1656,
548
+ "eval_samples_per_second": 30.93,
549
+ "eval_steps_per_second": 3.897,
550
  "step": 21600
551
  },
552
  {
553
  "epoch": 6.112809113642679,
554
+ "grad_norm": 0.36713194847106934,
555
  "learning_rate": 0.00019632305269982403,
556
+ "loss": 0.0576,
557
  "step": 22000
558
  },
559
  {
560
  "epoch": 6.201722700750208,
561
+ "eval_loss": 0.11051186919212341,
562
+ "eval_runtime": 15.9395,
563
+ "eval_samples_per_second": 31.369,
564
+ "eval_steps_per_second": 3.952,
565
  "step": 22320
566
  },
567
  {
568
  "epoch": 6.251736593498194,
569
+ "grad_norm": 0.4714512526988983,
570
  "learning_rate": 0.00019169213670464018,
571
+ "loss": 0.057,
572
  "step": 22500
573
  },
574
  {
575
  "epoch": 6.390664073353709,
576
+ "grad_norm": 0.3419685363769531,
577
  "learning_rate": 0.00018706122070945633,
578
+ "loss": 0.0572,
579
  "step": 23000
580
  },
581
  {
582
  "epoch": 6.401778271742151,
583
+ "eval_loss": 0.1142740249633789,
584
+ "eval_runtime": 15.9346,
585
+ "eval_samples_per_second": 31.378,
586
+ "eval_steps_per_second": 3.954,
587
  "step": 23040
588
  },
589
  {
590
  "epoch": 6.529591553209225,
591
+ "grad_norm": 0.43148958683013916,
592
  "learning_rate": 0.0001824303047142725,
593
+ "loss": 0.0554,
594
  "step": 23500
595
  },
596
  {
597
  "epoch": 6.601833842734093,
598
+ "eval_loss": 0.1115042194724083,
599
+ "eval_runtime": 15.9938,
600
+ "eval_samples_per_second": 31.262,
601
+ "eval_steps_per_second": 3.939,
602
  "step": 23760
603
  },
604
  {
605
  "epoch": 6.66851903306474,
606
+ "grad_norm": 0.5623305439949036,
607
  "learning_rate": 0.00017779938871908863,
608
+ "loss": 0.0538,
609
  "step": 24000
610
  },
611
  {
612
  "epoch": 6.801889413726035,
613
+ "eval_loss": 0.11134042590856552,
614
+ "eval_runtime": 15.9452,
615
+ "eval_samples_per_second": 31.357,
616
+ "eval_steps_per_second": 3.951,
617
  "step": 24480
618
  },
619
  {
620
  "epoch": 6.807446512920255,
621
+ "grad_norm": 0.4066413640975952,
622
  "learning_rate": 0.0001731684727239048,
623
+ "loss": 0.0534,
624
  "step": 24500
625
  },
626
  {
627
  "epoch": 6.946373992775771,
628
+ "grad_norm": 0.4021354019641876,
629
  "learning_rate": 0.00016853755672872095,
630
+ "loss": 0.052,
631
  "step": 25000
632
  },
633
  {
634
  "epoch": 7.001944984717977,
635
+ "eval_loss": 0.1132456511259079,
636
+ "eval_runtime": 16.1483,
637
+ "eval_samples_per_second": 30.963,
638
+ "eval_steps_per_second": 3.901,
639
  "step": 25200
640
  },
641
  {
642
  "epoch": 7.085301472631286,
643
+ "grad_norm": 0.46669623255729675,
644
+ "learning_rate": 0.00016391590256552746,
645
+ "loss": 0.0498,
646
  "step": 25500
647
  },
648
  {
649
  "epoch": 7.20200055570992,
650
+ "eval_loss": 0.11319959908723831,
651
+ "eval_runtime": 15.9896,
652
+ "eval_samples_per_second": 31.27,
653
+ "eval_steps_per_second": 3.94,
654
  "step": 25920
655
  },
656
  {
657
  "epoch": 7.2242289524868015,
658
+ "grad_norm": 0.2583458125591278,
659
  "learning_rate": 0.00015928498657034364,
660
+ "loss": 0.0487,
661
  "step": 26000
662
  },
663
  {
664
  "epoch": 7.3631564323423175,
665
+ "grad_norm": 0.3225070536136627,
666
  "learning_rate": 0.00015465407057515976,
667
+ "loss": 0.0485,
668
  "step": 26500
669
  },
670
  {
671
  "epoch": 7.402056126701861,
672
+ "eval_loss": 0.11151115596294403,
673
+ "eval_runtime": 15.9848,
674
+ "eval_samples_per_second": 31.28,
675
+ "eval_steps_per_second": 3.941,
676
  "step": 26640
677
  },
678
  {
679
  "epoch": 7.502083912197833,
680
+ "grad_norm": 0.4772126376628876,
681
+ "learning_rate": 0.00015002315457997593,
682
+ "loss": 0.0483,
683
  "step": 27000
684
  },
685
  {
686
  "epoch": 7.602111697693804,
687
+ "eval_loss": 0.11146976053714752,
688
+ "eval_runtime": 15.9333,
689
+ "eval_samples_per_second": 31.381,
690
+ "eval_steps_per_second": 3.954,
691
  "step": 27360
692
  },
693
  {
694
  "epoch": 7.641011392053348,
695
+ "grad_norm": 0.7825577855110168,
696
  "learning_rate": 0.00014540150041678245,
697
+ "loss": 0.0478,
698
  "step": 27500
699
  },
700
  {
701
  "epoch": 7.779938871908864,
702
+ "grad_norm": 0.465191513299942,
703
  "learning_rate": 0.0001407705844215986,
704
+ "loss": 0.0469,
705
  "step": 28000
706
  },
707
  {
708
  "epoch": 7.802167268685746,
709
+ "eval_loss": 0.11259140819311142,
710
+ "eval_runtime": 15.9752,
711
+ "eval_samples_per_second": 31.299,
712
+ "eval_steps_per_second": 3.944,
713
  "step": 28080
714
  },
715
  {
716
  "epoch": 7.918866351764379,
717
+ "grad_norm": 0.2701134979724884,
718
  "learning_rate": 0.00013613966842641474,
719
+ "loss": 0.0443,
720
  "step": 28500
721
  },
722
  {
723
  "epoch": 8.002222839677689,
724
+ "eval_loss": 0.11337699741125107,
725
+ "eval_runtime": 16.0812,
726
+ "eval_samples_per_second": 31.092,
727
+ "eval_steps_per_second": 3.918,
728
  "step": 28800
729
  },
730
  {
731
  "epoch": 8.057793831619895,
732
+ "grad_norm": 0.3994615375995636,
733
+ "learning_rate": 0.00013151801426322128,
734
+ "loss": 0.044,
735
  "step": 29000
736
  },
737
  {
738
  "epoch": 8.19672131147541,
739
+ "grad_norm": 0.46412038803100586,
740
+ "learning_rate": 0.00012688709826803743,
741
+ "loss": 0.0421,
742
  "step": 29500
743
  },
744
  {
745
  "epoch": 8.20227841066963,
746
+ "eval_loss": 0.11495082080364227,
747
+ "eval_runtime": 15.9654,
748
+ "eval_samples_per_second": 31.318,
749
+ "eval_steps_per_second": 3.946,
750
  "step": 29520
751
  },
752
  {
753
  "epoch": 8.335648791330925,
754
+ "grad_norm": 0.3810461759567261,
755
  "learning_rate": 0.00012225618227285358,
756
+ "loss": 0.0411,
757
  "step": 30000
758
  },
759
  {
760
  "epoch": 8.402333981661572,
761
+ "eval_loss": 0.11439384520053864,
762
+ "eval_runtime": 16.014,
763
+ "eval_samples_per_second": 31.223,
764
+ "eval_steps_per_second": 3.934,
765
  "step": 30240
766
  },
767
  {
768
  "epoch": 8.474576271186441,
769
+ "grad_norm": 0.4397641122341156,
770
  "learning_rate": 0.00011762526627766973,
771
+ "loss": 0.0412,
772
  "step": 30500
773
  },
774
  {
775
  "epoch": 8.602389552653515,
776
+ "eval_loss": 0.11167102307081223,
777
+ "eval_runtime": 15.9143,
778
+ "eval_samples_per_second": 31.418,
779
+ "eval_steps_per_second": 3.959,
780
  "step": 30960
781
  },
782
  {
783
  "epoch": 8.613503751041955,
784
+ "grad_norm": 0.7023443579673767,
785
+ "learning_rate": 0.00011300361211447625,
786
+ "loss": 0.041,
787
  "step": 31000
788
  },
789
  {
790
  "epoch": 8.752431230897471,
791
+ "grad_norm": 0.5792316198348999,
792
+ "learning_rate": 0.00010837269611929239,
793
+ "loss": 0.0391,
794
  "step": 31500
795
  },
796
  {
797
  "epoch": 8.802445123645457,
798
+ "eval_loss": 0.11271476745605469,
799
+ "eval_runtime": 15.9859,
800
+ "eval_samples_per_second": 31.278,
801
+ "eval_steps_per_second": 3.941,
802
  "step": 31680
803
  },
804
  {
805
  "epoch": 8.891358710752987,
806
+ "grad_norm": 0.44151026010513306,
807
+ "learning_rate": 0.00010374178012410855,
808
+ "loss": 0.0403,
809
  "step": 32000
810
  },
811
  {
812
  "epoch": 9.0025006946374,
813
+ "eval_loss": 0.11616696417331696,
814
+ "eval_runtime": 16.1103,
815
+ "eval_samples_per_second": 31.036,
816
+ "eval_steps_per_second": 3.911,
817
  "step": 32400
818
  },
819
  {
820
  "epoch": 9.030286190608502,
821
+ "grad_norm": 0.3094378411769867,
822
  "learning_rate": 9.911086412892471e-05,
823
+ "loss": 0.0386,
824
  "step": 32500
825
  },
826
  {
827
  "epoch": 9.169213670464018,
828
+ "grad_norm": 0.4858907163143158,
829
  "learning_rate": 9.447994813374086e-05,
830
+ "loss": 0.0354,
831
  "step": 33000
832
  },
833
  {
834
  "epoch": 9.202556265629342,
835
+ "eval_loss": 0.11926531791687012,
836
+ "eval_runtime": 16.0468,
837
+ "eval_samples_per_second": 31.159,
838
+ "eval_steps_per_second": 3.926,
839
  "step": 33120
840
  },
841
  {
842
  "epoch": 9.308141150319534,
843
+ "grad_norm": 0.40236544609069824,
844
  "learning_rate": 8.9849032138557e-05,
845
+ "loss": 0.0354,
846
  "step": 33500
847
  },
848
  {
849
  "epoch": 9.402611836621285,
850
+ "eval_loss": 0.12175790965557098,
851
+ "eval_runtime": 15.9794,
852
+ "eval_samples_per_second": 31.29,
853
+ "eval_steps_per_second": 3.943,
854
  "step": 33840
855
  },
856
  {
857
  "epoch": 9.447068630175048,
858
+ "grad_norm": 0.8239908218383789,
859
  "learning_rate": 8.521811614337317e-05,
860
+ "loss": 0.035,
861
  "step": 34000
862
  },
863
  {
864
  "epoch": 9.585996110030564,
865
+ "grad_norm": 0.2754063308238983,
866
+ "learning_rate": 8.059646198017968e-05,
867
+ "loss": 0.0352,
868
  "step": 34500
869
  },
870
  {
871
  "epoch": 9.602667407613225,
872
+ "eval_loss": 0.11963404715061188,
873
+ "eval_runtime": 15.9949,
874
+ "eval_samples_per_second": 31.26,
875
+ "eval_steps_per_second": 3.939,
876
  "step": 34560
877
  },
878
  {
879
  "epoch": 9.72492358988608,
880
+ "grad_norm": 0.4275870621204376,
881
  "learning_rate": 7.596554598499583e-05,
882
+ "loss": 0.0356,
883
  "step": 35000
884
  },
885
  {
886
  "epoch": 9.802722978605168,
887
+ "eval_loss": 0.12364204972982407,
888
+ "eval_runtime": 15.9291,
889
+ "eval_samples_per_second": 31.389,
890
+ "eval_steps_per_second": 3.955,
891
  "step": 35280
892
  },
893
  {
894
  "epoch": 9.863851069741594,
895
+ "grad_norm": 0.41111400723457336,
896
  "learning_rate": 7.133462998981199e-05,
897
+ "loss": 0.0322,
898
  "step": 35500
899
  },
900
  {
901
  "epoch": 10.00277854959711,
902
+ "grad_norm": 0.4361058175563812,
903
+ "learning_rate": 6.67129758266185e-05,
904
+ "loss": 0.0331,
905
  "step": 36000
906
  },
907
  {
908
  "epoch": 10.00277854959711,
909
+ "eval_loss": 0.1233987957239151,
910
+ "eval_runtime": 16.0606,
911
+ "eval_samples_per_second": 31.132,
912
+ "eval_steps_per_second": 3.923,
913
  "step": 36000
914
  },
915
  {
916
  "epoch": 10.141706029452626,
917
+ "grad_norm": 0.3986058235168457,
918
+ "learning_rate": 6.208205983143465e-05,
919
+ "loss": 0.032,
920
  "step": 36500
921
  },
922
  {
923
  "epoch": 10.202834120589053,
924
+ "eval_loss": 0.12648221850395203,
925
+ "eval_runtime": 15.987,
926
+ "eval_samples_per_second": 31.275,
927
+ "eval_steps_per_second": 3.941,
928
  "step": 36720
929
  },
930
  {
931
  "epoch": 10.28063350930814,
932
+ "grad_norm": 0.1770099699497223,
933
+ "learning_rate": 5.745114383625081e-05,
934
+ "loss": 0.0302,
935
  "step": 37000
936
  },
937
  {
938
  "epoch": 10.402889691580995,
939
+ "eval_loss": 0.1288907825946808,
940
+ "eval_runtime": 15.9585,
941
+ "eval_samples_per_second": 31.331,
942
+ "eval_steps_per_second": 3.948,
943
  "step": 37440
944
  },
945
  {
946
  "epoch": 10.419560989163656,
947
+ "grad_norm": 0.33153316378593445,
948
+ "learning_rate": 5.282022784106696e-05,
949
+ "loss": 0.0299,
950
  "step": 37500
951
  },
952
  {
953
  "epoch": 10.558488469019172,
954
+ "grad_norm": 0.4955579340457916,
955
+ "learning_rate": 4.818931184588312e-05,
956
+ "loss": 0.0301,
957
  "step": 38000
958
  },
959
  {
960
  "epoch": 10.602945262572938,
961
+ "eval_loss": 0.1280115395784378,
962
+ "eval_runtime": 15.9981,
963
+ "eval_samples_per_second": 31.254,
964
+ "eval_steps_per_second": 3.938,
965
  "step": 38160
966
  },
967
  {
968
  "epoch": 10.697415948874687,
969
+ "grad_norm": 0.6109702587127686,
970
+ "learning_rate": 4.355839585069927e-05,
971
+ "loss": 0.0295,
972
  "step": 38500
973
  },
974
  {
975
  "epoch": 10.803000833564878,
976
+ "eval_loss": 0.12585216760635376,
977
+ "eval_runtime": 15.9876,
978
+ "eval_samples_per_second": 31.274,
979
+ "eval_steps_per_second": 3.941,
980
  "step": 38880
981
  },
982
  {
983
  "epoch": 10.836343428730203,
984
+ "grad_norm": 0.3070131242275238,
985
+ "learning_rate": 3.8927479855515425e-05,
986
+ "loss": 0.0298,
987
  "step": 39000
988
  },
989
  {
990
  "epoch": 10.975270908585719,
991
+ "grad_norm": 0.5334280133247375,
992
+ "learning_rate": 3.429656386033158e-05,
993
+ "loss": 0.028,
994
  "step": 39500
995
  }
996
  ],
 
1011
  "attributes": {}
1012
  }
1013
  },
1014
+ "total_flos": 7.419404232849097e+18,
1015
  "train_batch_size": 4,
1016
  "trial_name": null,
1017
  "trial_params": null
checkpoint-39589/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58052fbf3a2b07f2a6024b5cc28db88f1f0e48109a11483aa716d00657e9906e
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2784cc9ebb113293b9d7c5af564dbf5463d67b520bf149c8840105fec4706ec
3
  size 5496