SystemAdmin123 commited on
Commit
943b793
·
verified ·
1 Parent(s): 6fd7976

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96eefb2ab4b204e9308bae75f26755bc252532bdfce90c247e78733a4199e1e2
3
  size 4939116424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f1bd28175fd7513f12729e17ee3571da7edbf57f145604ce22771573d115cc5
3
  size 4939116424
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4096a8c7f67ddbd2910b504765f448908fdd008051d764d2428a8cf62c11b3d
3
  size 4947390880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e18b7d14aadcdd3ffd84212a90763f05677810d6606b0228ab48fc71aa8693
3
  size 4947390880
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c892e15db6ad07c12e5d0e95b0debfd095cf21e2bf401f6eca45fe7f25d85cb
3
  size 3590619888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74fed502be8e1cd05be29c5fb4cde46eadd9a078e63d9feccc111a8be0fd8b78
3
  size 3590619888
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ab754728a1608ca4beccfa68dff15ae8a33135223828648ca1ace40d391f5d2
3
- size 13688025904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd2e81881ebf33c7ad08f1ccd0211e7362bde834184d4d59f2f5bba227a0771
3
+ size 13688025584
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f20d155a14d34cd1bb6d04e5de90f3224906e1758821edd752a8f1a9085a2db
3
+ size 15024
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad18e870a176ca75a54f6620f83de92bdfd5a91302744d90bff8e5feae2fe0c5
3
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9009a52be47b75834407dc5e146ed5360e6f23a35bff27bab34ef6fb47df1661
3
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb494a08fd57e7c6f63f06826c872164986e81b271996be0496671f713bdcc3
3
+ size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40b6b717644e21f80a22ec98694b3a2fd9d62a6467e549d64314725dba905d52
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cd11d413bc67bf01de9a1a006e9e7655be307353028b25f5b3c299e5b6b7a44
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,619 +1,175 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.11839573775344088,
5
  "eval_steps": 200,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0001479946721918011,
13
- "eval_loss": 2.158451557159424,
14
- "eval_runtime": 117.4597,
15
- "eval_samples_per_second": 12.787,
16
- "eval_steps_per_second": 6.394,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.001479946721918011,
21
- "grad_norm": 17.875,
22
- "learning_rate": 1.6000000000000003e-05,
23
- "loss": 1.876,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.002959893443836022,
28
- "grad_norm": 11.9375,
29
- "learning_rate": 3.2000000000000005e-05,
30
- "loss": 1.8982,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 0.0044398401657540325,
35
- "grad_norm": 22.125,
36
- "learning_rate": 4.8e-05,
37
- "loss": 1.6963,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 0.005919786887672044,
42
- "grad_norm": 21.25,
43
- "learning_rate": 6.400000000000001e-05,
44
- "loss": 1.7986,
45
  "step": 40
46
  },
47
  {
48
- "epoch": 0.007399733609590055,
49
- "grad_norm": 26.375,
50
- "learning_rate": 8e-05,
51
- "loss": 1.3725,
52
  "step": 50
53
  },
54
  {
55
- "epoch": 0.008879680331508065,
56
- "grad_norm": 6.9375,
57
- "learning_rate": 9.6e-05,
58
- "loss": 1.9612,
59
  "step": 60
60
  },
61
  {
62
- "epoch": 0.010359627053426078,
63
- "grad_norm": 19.625,
64
- "learning_rate": 0.00011200000000000001,
65
- "loss": 2.0516,
66
  "step": 70
67
  },
68
  {
69
- "epoch": 0.011839573775344088,
70
- "grad_norm": 16.875,
71
- "learning_rate": 0.00012800000000000002,
72
- "loss": 1.5824,
73
  "step": 80
74
  },
75
  {
76
- "epoch": 0.0133195204972621,
77
- "grad_norm": 21.5,
78
- "learning_rate": 0.000144,
79
- "loss": 2.2526,
80
  "step": 90
81
  },
82
  {
83
- "epoch": 0.01479946721918011,
84
- "grad_norm": 85.5,
85
- "learning_rate": 0.00016,
86
- "loss": 2.2656,
87
  "step": 100
88
  },
89
  {
90
- "epoch": 0.01627941394109812,
91
- "grad_norm": 58.25,
92
- "learning_rate": 0.00017600000000000002,
93
- "loss": 2.3181,
94
  "step": 110
95
  },
96
  {
97
- "epoch": 0.01775936066301613,
98
- "grad_norm": 14.25,
99
- "learning_rate": 0.000192,
100
- "loss": 2.4921,
101
  "step": 120
102
  },
103
  {
104
- "epoch": 0.01923930738493414,
105
- "grad_norm": 15.5625,
106
- "learning_rate": 0.0001999978128380225,
107
- "loss": 2.5357,
108
  "step": 130
109
  },
110
  {
111
- "epoch": 0.020719254106852155,
112
- "grad_norm": 23.75,
113
- "learning_rate": 0.0001999803161162393,
114
- "loss": 2.8389,
115
  "step": 140
116
  },
117
  {
118
- "epoch": 0.022199200828770166,
119
- "grad_norm": 115.5,
120
- "learning_rate": 0.00019994532573409262,
121
- "loss": 3.7381,
122
  "step": 150
123
  },
124
  {
125
- "epoch": 0.023679147550688177,
126
- "grad_norm": 17.25,
127
- "learning_rate": 0.00019989284781388617,
128
- "loss": 3.1991,
129
  "step": 160
130
  },
131
  {
132
- "epoch": 0.025159094272606188,
133
- "grad_norm": 19.0,
134
- "learning_rate": 0.00019982289153773646,
135
- "loss": 3.3157,
136
  "step": 170
137
  },
138
  {
139
- "epoch": 0.0266390409945242,
140
- "grad_norm": 11.3125,
141
- "learning_rate": 0.00019973546914596623,
142
- "loss": 3.309,
143
  "step": 180
144
  },
145
  {
146
- "epoch": 0.02811898771644221,
147
- "grad_norm": 105.0,
148
- "learning_rate": 0.00019963059593496268,
149
- "loss": 3.2528,
150
  "step": 190
151
  },
152
  {
153
- "epoch": 0.02959893443836022,
154
- "grad_norm": 137.0,
155
- "learning_rate": 0.00019950829025450114,
156
- "loss": 4.8144,
157
  "step": 200
158
  },
159
  {
160
- "epoch": 0.02959893443836022,
161
- "eval_loss": 4.562154293060303,
162
- "eval_runtime": 115.3725,
163
- "eval_samples_per_second": 13.019,
164
- "eval_steps_per_second": 6.509,
165
  "step": 200
166
- },
167
- {
168
- "epoch": 0.03107888116027823,
169
- "grad_norm": 14.625,
170
- "learning_rate": 0.0001993685735045343,
171
- "loss": 3.9662,
172
- "step": 210
173
- },
174
- {
175
- "epoch": 0.03255882788219624,
176
- "grad_norm": 18.0,
177
- "learning_rate": 0.0001992114701314478,
178
- "loss": 3.1902,
179
- "step": 220
180
- },
181
- {
182
- "epoch": 0.03403877460411425,
183
- "grad_norm": 24.625,
184
- "learning_rate": 0.000199037007623783,
185
- "loss": 3.4792,
186
- "step": 230
187
- },
188
- {
189
- "epoch": 0.03551872132603226,
190
- "grad_norm": 46.0,
191
- "learning_rate": 0.00019884521650742715,
192
- "loss": 3.7192,
193
- "step": 240
194
- },
195
- {
196
- "epoch": 0.036998668047950274,
197
- "grad_norm": 28.0,
198
- "learning_rate": 0.00019863613034027224,
199
- "loss": 3.6487,
200
- "step": 250
201
- },
202
- {
203
- "epoch": 0.03847861476986828,
204
- "grad_norm": 139.0,
205
- "learning_rate": 0.0001984097857063434,
206
- "loss": 4.3462,
207
- "step": 260
208
- },
209
- {
210
- "epoch": 0.039958561491786296,
211
- "grad_norm": 13.875,
212
- "learning_rate": 0.0001981662222093976,
213
- "loss": 3.3132,
214
- "step": 270
215
- },
216
- {
217
- "epoch": 0.04143850821370431,
218
- "grad_norm": 29.5,
219
- "learning_rate": 0.00019790548246599447,
220
- "loss": 3.3523,
221
- "step": 280
222
- },
223
- {
224
- "epoch": 0.04291845493562232,
225
- "grad_norm": 43.75,
226
- "learning_rate": 0.00019762761209803927,
227
- "loss": 3.8529,
228
- "step": 290
229
- },
230
- {
231
- "epoch": 0.04439840165754033,
232
- "grad_norm": 234.0,
233
- "learning_rate": 0.0001973326597248006,
234
- "loss": 4.7306,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.04587834837945834,
239
- "grad_norm": 17.625,
240
- "learning_rate": 0.00019702067695440332,
241
- "loss": 4.0088,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.04735829510137635,
246
- "grad_norm": 10.0625,
247
- "learning_rate": 0.00019669171837479873,
248
- "loss": 3.4104,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.04883824182329436,
253
- "grad_norm": 12.6875,
254
- "learning_rate": 0.00019634584154421317,
255
- "loss": 3.6337,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.050318188545212375,
260
- "grad_norm": 16.75,
261
- "learning_rate": 0.00019598310698107702,
262
- "loss": 3.6344,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.05179813526713038,
267
- "grad_norm": 21.625,
268
- "learning_rate": 0.00019560357815343577,
269
- "loss": 3.8887,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.0532780819890484,
274
- "grad_norm": 8.5625,
275
- "learning_rate": 0.00019520732146784491,
276
- "loss": 3.9023,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.054758028710966404,
281
- "grad_norm": 10.625,
282
- "learning_rate": 0.0001947944062577507,
283
- "loss": 3.7002,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.05623797543288442,
288
- "grad_norm": 34.25,
289
- "learning_rate": 0.00019436490477135878,
290
- "loss": 3.7505,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.057717922154802426,
295
- "grad_norm": 17.75,
296
- "learning_rate": 0.00019391889215899299,
297
- "loss": 3.9776,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.05919786887672044,
302
- "grad_norm": 41.25,
303
- "learning_rate": 0.0001934564464599461,
304
- "loss": 3.6903,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.05919786887672044,
309
- "eval_loss": 5.192676544189453,
310
- "eval_runtime": 115.1721,
311
- "eval_samples_per_second": 13.041,
312
- "eval_steps_per_second": 6.521,
313
- "step": 400
314
- },
315
- {
316
- "epoch": 0.06067781559863845,
317
- "grad_norm": 14.375,
318
- "learning_rate": 0.00019297764858882514,
319
- "loss": 4.2204,
320
- "step": 410
321
- },
322
- {
323
- "epoch": 0.06215776232055646,
324
- "grad_norm": 17.75,
325
- "learning_rate": 0.00019248258232139388,
326
- "loss": 3.7817,
327
- "step": 420
328
- },
329
- {
330
- "epoch": 0.06363770904247447,
331
- "grad_norm": 25.5,
332
- "learning_rate": 0.00019197133427991436,
333
- "loss": 3.8348,
334
- "step": 430
335
- },
336
- {
337
- "epoch": 0.06511765576439248,
338
- "grad_norm": 16.375,
339
- "learning_rate": 0.00019144399391799043,
340
- "loss": 4.1359,
341
- "step": 440
342
- },
343
- {
344
- "epoch": 0.0665976024863105,
345
- "grad_norm": 126.5,
346
- "learning_rate": 0.00019090065350491626,
347
- "loss": 3.8639,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.0680775492082285,
352
- "grad_norm": 10.5,
353
- "learning_rate": 0.0001903414081095315,
354
- "loss": 4.3344,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.06955749593014651,
359
- "grad_norm": 9.625,
360
- "learning_rate": 0.00018976635558358722,
361
- "loss": 3.7876,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.07103744265206452,
366
- "grad_norm": 12.75,
367
- "learning_rate": 0.00018917559654462474,
368
- "loss": 4.0847,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.07251738937398254,
373
- "grad_norm": 56.75,
374
- "learning_rate": 0.00018856923435837022,
375
- "loss": 4.2232,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.07399733609590055,
380
- "grad_norm": 49.0,
381
- "learning_rate": 0.0001879473751206489,
382
- "loss": 4.3389,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.07547728281781856,
387
- "grad_norm": 7.96875,
388
- "learning_rate": 0.00018731012763882133,
389
- "loss": 4.1522,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.07695722953973656,
394
- "grad_norm": 10.8125,
395
- "learning_rate": 0.00018665760341274505,
396
- "loss": 4.0533,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.07843717626165458,
401
- "grad_norm": 18.0,
402
- "learning_rate": 0.00018598991661526572,
403
- "loss": 4.0835,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.07991712298357259,
408
- "grad_norm": 34.5,
409
- "learning_rate": 0.00018530718407223974,
410
- "loss": 3.5388,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.0813970697054906,
415
- "grad_norm": 28.375,
416
- "learning_rate": 0.00018460952524209355,
417
- "loss": 4.2171,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.08287701642740862,
422
- "grad_norm": 13.125,
423
- "learning_rate": 0.00018389706219492147,
424
- "loss": 4.2511,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.08435696314932663,
429
- "grad_norm": 13.1875,
430
- "learning_rate": 0.00018316991959112716,
431
- "loss": 3.9025,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.08583690987124463,
436
- "grad_norm": 15.3125,
437
- "learning_rate": 0.00018242822465961176,
438
- "loss": 4.0034,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.08731685659316264,
443
- "grad_norm": 33.0,
444
- "learning_rate": 0.00018167210717551224,
445
- "loss": 4.0514,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.08879680331508066,
450
- "grad_norm": 35.0,
451
- "learning_rate": 0.00018090169943749476,
452
- "loss": 4.0116,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.08879680331508066,
457
- "eval_loss": 4.839527130126953,
458
- "eval_runtime": 115.7002,
459
- "eval_samples_per_second": 12.982,
460
- "eval_steps_per_second": 6.491,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.09027675003699867,
465
- "grad_norm": 9.9375,
466
- "learning_rate": 0.00018011713624460608,
467
- "loss": 4.2757,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.09175669675891668,
472
- "grad_norm": 9.375,
473
- "learning_rate": 0.00017931855487268782,
474
- "loss": 3.9496,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.09323664348083469,
479
- "grad_norm": 17.5,
480
- "learning_rate": 0.0001785060950503568,
481
- "loss": 4.0227,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.0947165902027527,
486
- "grad_norm": 12.0,
487
- "learning_rate": 0.00017767989893455698,
488
- "loss": 4.3345,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.09619653692467071,
493
- "grad_norm": 40.75,
494
- "learning_rate": 0.00017684011108568592,
495
- "loss": 3.2456,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.09767648364658872,
500
- "grad_norm": 6.03125,
501
- "learning_rate": 0.00017598687844230088,
502
- "loss": 4.296,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.09915643036850673,
507
- "grad_norm": 10.75,
508
- "learning_rate": 0.00017512035029540885,
509
- "loss": 3.8307,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.10063637709042475,
514
- "grad_norm": 11.9375,
515
- "learning_rate": 0.000174240678262345,
516
- "loss": 3.8659,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.10211632381234276,
521
- "grad_norm": 21.75,
522
- "learning_rate": 0.000173348016260244,
523
- "loss": 4.0579,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.10359627053426076,
528
- "grad_norm": 29.625,
529
- "learning_rate": 0.00017244252047910892,
530
- "loss": 3.8463,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.10507621725617877,
535
- "grad_norm": 12.75,
536
- "learning_rate": 0.00017152434935448256,
537
- "loss": 4.225,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.1065561639780968,
542
- "grad_norm": 9.0625,
543
- "learning_rate": 0.0001705936635397259,
544
- "loss": 3.5182,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.1080361107000148,
549
- "grad_norm": 14.75,
550
- "learning_rate": 0.00016965062587790823,
551
- "loss": 4.0649,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.10951605742193281,
556
- "grad_norm": 19.0,
557
- "learning_rate": 0.00016869540137331445,
558
- "loss": 4.2849,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.11099600414385082,
563
- "grad_norm": 34.5,
564
- "learning_rate": 0.00016772815716257412,
565
- "loss": 3.65,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.11247595086576884,
570
- "grad_norm": 11.0,
571
- "learning_rate": 0.00016674906248541726,
572
- "loss": 4.1331,
573
- "step": 760
574
- },
575
- {
576
- "epoch": 0.11395589758768684,
577
- "grad_norm": 38.0,
578
- "learning_rate": 0.00016575828865506245,
579
- "loss": 3.5679,
580
- "step": 770
581
- },
582
- {
583
- "epoch": 0.11543584430960485,
584
- "grad_norm": 9.8125,
585
- "learning_rate": 0.0001647560090282419,
586
- "loss": 3.573,
587
- "step": 780
588
- },
589
- {
590
- "epoch": 0.11691579103152286,
591
- "grad_norm": 14.0,
592
- "learning_rate": 0.000163742398974869,
593
- "loss": 3.9581,
594
- "step": 790
595
- },
596
- {
597
- "epoch": 0.11839573775344088,
598
- "grad_norm": 20.25,
599
- "learning_rate": 0.0001627176358473537,
600
- "loss": 4.0812,
601
- "step": 800
602
- },
603
- {
604
- "epoch": 0.11839573775344088,
605
- "eval_loss": 4.436325550079346,
606
- "eval_runtime": 113.0291,
607
- "eval_samples_per_second": 13.289,
608
- "eval_steps_per_second": 6.644,
609
- "step": 800
610
  }
611
  ],
612
  "logging_steps": 10,
613
- "max_steps": 2500,
614
  "num_input_tokens_seen": 0,
615
- "num_train_epochs": 1,
616
- "save_steps": 400,
617
  "stateful_callbacks": {
618
  "TrainerControl": {
619
  "args": {
@@ -626,7 +182,7 @@
626
  "attributes": {}
627
  }
628
  },
629
- "total_flos": 1.3006933354296115e+17,
630
  "train_batch_size": 2,
631
  "trial_name": null,
632
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.8518518518518519,
5
  "eval_steps": 200,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.009259259259259259,
13
+ "eval_loss": 2.1662161350250244,
14
+ "eval_runtime": 32.9047,
15
+ "eval_samples_per_second": 45.617,
16
+ "eval_steps_per_second": 5.713,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.09259259259259259,
21
+ "grad_norm": 2.796875,
22
+ "learning_rate": 6.666666666666667e-05,
23
+ "loss": 1.8575,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.18518518518518517,
28
+ "grad_norm": 1.75,
29
+ "learning_rate": 0.00013333333333333334,
30
+ "loss": 1.8329,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 0.2777777777777778,
35
+ "grad_norm": 2.15625,
36
+ "learning_rate": 0.0002,
37
+ "loss": 1.8697,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 0.37037037037037035,
42
+ "grad_norm": 2.0,
43
+ "learning_rate": 0.00019984815164333163,
44
+ "loss": 1.9683,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 0.46296296296296297,
49
+ "grad_norm": 1.3046875,
50
+ "learning_rate": 0.00019939306773179497,
51
+ "loss": 2.0113,
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 0.5555555555555556,
56
+ "grad_norm": 1.5703125,
57
+ "learning_rate": 0.00019863613034027224,
58
+ "loss": 2.0568,
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 0.6481481481481481,
63
+ "grad_norm": 1.3984375,
64
+ "learning_rate": 0.00019757963826274357,
65
+ "loss": 2.1038,
66
  "step": 70
67
  },
68
  {
69
+ "epoch": 0.7407407407407407,
70
+ "grad_norm": 1.3203125,
71
+ "learning_rate": 0.00019622680003092503,
72
+ "loss": 2.1133,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 0.8333333333333334,
77
+ "grad_norm": 1.421875,
78
+ "learning_rate": 0.00019458172417006347,
79
+ "loss": 2.1036,
80
  "step": 90
81
  },
82
  {
83
+ "epoch": 0.9259259259259259,
84
+ "grad_norm": 1.453125,
85
+ "learning_rate": 0.00019264940672148018,
86
+ "loss": 2.1255,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 1.0185185185185186,
91
+ "grad_norm": 1.3984375,
92
+ "learning_rate": 0.00019043571606975777,
93
+ "loss": 1.8837,
94
  "step": 110
95
  },
96
  {
97
+ "epoch": 1.1111111111111112,
98
+ "grad_norm": 1.328125,
99
+ "learning_rate": 0.0001879473751206489,
100
+ "loss": 0.9892,
101
  "step": 120
102
  },
103
  {
104
+ "epoch": 1.2037037037037037,
105
+ "grad_norm": 1.1015625,
106
+ "learning_rate": 0.00018519194088383273,
107
+ "loss": 0.9478,
108
  "step": 130
109
  },
110
  {
111
+ "epoch": 1.2962962962962963,
112
+ "grad_norm": 1.21875,
113
+ "learning_rate": 0.0001821777815225245,
114
+ "loss": 0.9602,
115
  "step": 140
116
  },
117
  {
118
+ "epoch": 1.3888888888888888,
119
+ "grad_norm": 1.2265625,
120
+ "learning_rate": 0.00017891405093963938,
121
+ "loss": 0.9669,
122
  "step": 150
123
  },
124
  {
125
+ "epoch": 1.4814814814814814,
126
+ "grad_norm": 1.1640625,
127
+ "learning_rate": 0.00017541066097768963,
128
+ "loss": 0.9945,
129
  "step": 160
130
  },
131
  {
132
+ "epoch": 1.574074074074074,
133
+ "grad_norm": 1.1875,
134
+ "learning_rate": 0.00017167825131684513,
135
+ "loss": 0.9809,
136
  "step": 170
137
  },
138
  {
139
+ "epoch": 1.6666666666666665,
140
+ "grad_norm": 1.078125,
141
+ "learning_rate": 0.00016772815716257412,
142
+ "loss": 1.0197,
143
  "step": 180
144
  },
145
  {
146
+ "epoch": 1.7592592592592593,
147
+ "grad_norm": 1.2109375,
148
+ "learning_rate": 0.00016357237482099684,
149
+ "loss": 1.0018,
150
  "step": 190
151
  },
152
  {
153
+ "epoch": 1.8518518518518519,
154
+ "grad_norm": 1.3125,
155
+ "learning_rate": 0.00015922352526649803,
156
+ "loss": 1.0298,
157
  "step": 200
158
  },
159
  {
160
+ "epoch": 1.8518518518518519,
161
+ "eval_loss": 2.0912892818450928,
162
+ "eval_runtime": 31.2593,
163
+ "eval_samples_per_second": 48.018,
164
+ "eval_steps_per_second": 6.014,
165
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  }
167
  ],
168
  "logging_steps": 10,
169
+ "max_steps": 600,
170
  "num_input_tokens_seen": 0,
171
+ "num_train_epochs": 6,
172
+ "save_steps": 200,
173
  "stateful_callbacks": {
174
  "TrainerControl": {
175
  "args": {
 
182
  "attributes": {}
183
  }
184
  },
185
+ "total_flos": 1.2990695522435072e+17,
186
  "train_batch_size": 2,
187
  "trial_name": null,
188
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8953c8f78e882c2468a8fd9123e7e01d1ba2fa70223c087509fa59b852ab047
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7e4cc380bf07566f66cbd6e04a4bdc7a8bc1f51ced4f92d8dc2abee469a5015
3
  size 6840