SystemAdmin123 committed on
Commit
b857fbe
·
verified ·
1 Parent(s): c034c0f

Training in progress, step 800, checkpoint

Browse files
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a30c35cbc8785b8b002f6322557cf314425af21ab83b7f51c23859ba004e393f
3
  size 4939116424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96eefb2ab4b204e9308bae75f26755bc252532bdfce90c247e78733a4199e1e2
3
  size 4939116424
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c1c4622df49ffc72bcf3fda3e087cdd3fcdd3a1d56d2ecbbbba441054dbfaf2
3
  size 4947390880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4096a8c7f67ddbd2910b504765f448908fdd008051d764d2428a8cf62c11b3d
3
  size 4947390880
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a848c3958efbb12660e0c66dfca39e33220fd5a19d9bef3eb49a4609fc0f8aab
3
  size 3590619888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c892e15db6ad07c12e5d0e95b0debfd095cf21e2bf401f6eca45fe7f25d85cb
3
  size 3590619888
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f707da0832c7ea8058a9b33d5b470abf8956d8086c479507d9e0898791f31e49
3
  size 13688025904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ab754728a1608ca4beccfa68dff15ae8a33135223828648ca1ace40d391f5d2
3
  size 13688025904
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9feae33b2fec0a6229240e7adaee6ecc8f5cfdf1a8bd0e827b1d8a241424e3c0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c431bcafebc4c8ee346d130e382b11c81be579ca0bfd3918fae07b16e10b92f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a673aaf85c0fe6b6c29cb8f3e7dbd829eef637110e4ad9a775f3fcf001c92591
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40b6b717644e21f80a22ec98694b3a2fd9d62a6467e549d64314725dba905d52
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.05919786887672044,
5
  "eval_steps": 200,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -311,6 +311,302 @@
311
  "eval_samples_per_second": 13.041,
312
  "eval_steps_per_second": 6.521,
313
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  }
315
  ],
316
  "logging_steps": 10,
@@ -330,7 +626,7 @@
330
  "attributes": {}
331
  }
332
  },
333
- "total_flos": 6.511585861514035e+16,
334
  "train_batch_size": 2,
335
  "trial_name": null,
336
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.11839573775344088,
5
  "eval_steps": 200,
6
+ "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
311
  "eval_samples_per_second": 13.041,
312
  "eval_steps_per_second": 6.521,
313
  "step": 400
314
+ },
315
+ {
316
+ "epoch": 0.06067781559863845,
317
+ "grad_norm": 14.375,
318
+ "learning_rate": 0.00019297764858882514,
319
+ "loss": 4.2204,
320
+ "step": 410
321
+ },
322
+ {
323
+ "epoch": 0.06215776232055646,
324
+ "grad_norm": 17.75,
325
+ "learning_rate": 0.00019248258232139388,
326
+ "loss": 3.7817,
327
+ "step": 420
328
+ },
329
+ {
330
+ "epoch": 0.06363770904247447,
331
+ "grad_norm": 25.5,
332
+ "learning_rate": 0.00019197133427991436,
333
+ "loss": 3.8348,
334
+ "step": 430
335
+ },
336
+ {
337
+ "epoch": 0.06511765576439248,
338
+ "grad_norm": 16.375,
339
+ "learning_rate": 0.00019144399391799043,
340
+ "loss": 4.1359,
341
+ "step": 440
342
+ },
343
+ {
344
+ "epoch": 0.0665976024863105,
345
+ "grad_norm": 126.5,
346
+ "learning_rate": 0.00019090065350491626,
347
+ "loss": 3.8639,
348
+ "step": 450
349
+ },
350
+ {
351
+ "epoch": 0.0680775492082285,
352
+ "grad_norm": 10.5,
353
+ "learning_rate": 0.0001903414081095315,
354
+ "loss": 4.3344,
355
+ "step": 460
356
+ },
357
+ {
358
+ "epoch": 0.06955749593014651,
359
+ "grad_norm": 9.625,
360
+ "learning_rate": 0.00018976635558358722,
361
+ "loss": 3.7876,
362
+ "step": 470
363
+ },
364
+ {
365
+ "epoch": 0.07103744265206452,
366
+ "grad_norm": 12.75,
367
+ "learning_rate": 0.00018917559654462474,
368
+ "loss": 4.0847,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 0.07251738937398254,
373
+ "grad_norm": 56.75,
374
+ "learning_rate": 0.00018856923435837022,
375
+ "loss": 4.2232,
376
+ "step": 490
377
+ },
378
+ {
379
+ "epoch": 0.07399733609590055,
380
+ "grad_norm": 49.0,
381
+ "learning_rate": 0.0001879473751206489,
382
+ "loss": 4.3389,
383
+ "step": 500
384
+ },
385
+ {
386
+ "epoch": 0.07547728281781856,
387
+ "grad_norm": 7.96875,
388
+ "learning_rate": 0.00018731012763882133,
389
+ "loss": 4.1522,
390
+ "step": 510
391
+ },
392
+ {
393
+ "epoch": 0.07695722953973656,
394
+ "grad_norm": 10.8125,
395
+ "learning_rate": 0.00018665760341274505,
396
+ "loss": 4.0533,
397
+ "step": 520
398
+ },
399
+ {
400
+ "epoch": 0.07843717626165458,
401
+ "grad_norm": 18.0,
402
+ "learning_rate": 0.00018598991661526572,
403
+ "loss": 4.0835,
404
+ "step": 530
405
+ },
406
+ {
407
+ "epoch": 0.07991712298357259,
408
+ "grad_norm": 34.5,
409
+ "learning_rate": 0.00018530718407223974,
410
+ "loss": 3.5388,
411
+ "step": 540
412
+ },
413
+ {
414
+ "epoch": 0.0813970697054906,
415
+ "grad_norm": 28.375,
416
+ "learning_rate": 0.00018460952524209355,
417
+ "loss": 4.2171,
418
+ "step": 550
419
+ },
420
+ {
421
+ "epoch": 0.08287701642740862,
422
+ "grad_norm": 13.125,
423
+ "learning_rate": 0.00018389706219492147,
424
+ "loss": 4.2511,
425
+ "step": 560
426
+ },
427
+ {
428
+ "epoch": 0.08435696314932663,
429
+ "grad_norm": 13.1875,
430
+ "learning_rate": 0.00018316991959112716,
431
+ "loss": 3.9025,
432
+ "step": 570
433
+ },
434
+ {
435
+ "epoch": 0.08583690987124463,
436
+ "grad_norm": 15.3125,
437
+ "learning_rate": 0.00018242822465961176,
438
+ "loss": 4.0034,
439
+ "step": 580
440
+ },
441
+ {
442
+ "epoch": 0.08731685659316264,
443
+ "grad_norm": 33.0,
444
+ "learning_rate": 0.00018167210717551224,
445
+ "loss": 4.0514,
446
+ "step": 590
447
+ },
448
+ {
449
+ "epoch": 0.08879680331508066,
450
+ "grad_norm": 35.0,
451
+ "learning_rate": 0.00018090169943749476,
452
+ "loss": 4.0116,
453
+ "step": 600
454
+ },
455
+ {
456
+ "epoch": 0.08879680331508066,
457
+ "eval_loss": 4.839527130126953,
458
+ "eval_runtime": 115.7002,
459
+ "eval_samples_per_second": 12.982,
460
+ "eval_steps_per_second": 6.491,
461
+ "step": 600
462
+ },
463
+ {
464
+ "epoch": 0.09027675003699867,
465
+ "grad_norm": 9.9375,
466
+ "learning_rate": 0.00018011713624460608,
467
+ "loss": 4.2757,
468
+ "step": 610
469
+ },
470
+ {
471
+ "epoch": 0.09175669675891668,
472
+ "grad_norm": 9.375,
473
+ "learning_rate": 0.00017931855487268782,
474
+ "loss": 3.9496,
475
+ "step": 620
476
+ },
477
+ {
478
+ "epoch": 0.09323664348083469,
479
+ "grad_norm": 17.5,
480
+ "learning_rate": 0.0001785060950503568,
481
+ "loss": 4.0227,
482
+ "step": 630
483
+ },
484
+ {
485
+ "epoch": 0.0947165902027527,
486
+ "grad_norm": 12.0,
487
+ "learning_rate": 0.00017767989893455698,
488
+ "loss": 4.3345,
489
+ "step": 640
490
+ },
491
+ {
492
+ "epoch": 0.09619653692467071,
493
+ "grad_norm": 40.75,
494
+ "learning_rate": 0.00017684011108568592,
495
+ "loss": 3.2456,
496
+ "step": 650
497
+ },
498
+ {
499
+ "epoch": 0.09767648364658872,
500
+ "grad_norm": 6.03125,
501
+ "learning_rate": 0.00017598687844230088,
502
+ "loss": 4.296,
503
+ "step": 660
504
+ },
505
+ {
506
+ "epoch": 0.09915643036850673,
507
+ "grad_norm": 10.75,
508
+ "learning_rate": 0.00017512035029540885,
509
+ "loss": 3.8307,
510
+ "step": 670
511
+ },
512
+ {
513
+ "epoch": 0.10063637709042475,
514
+ "grad_norm": 11.9375,
515
+ "learning_rate": 0.000174240678262345,
516
+ "loss": 3.8659,
517
+ "step": 680
518
+ },
519
+ {
520
+ "epoch": 0.10211632381234276,
521
+ "grad_norm": 21.75,
522
+ "learning_rate": 0.000173348016260244,
523
+ "loss": 4.0579,
524
+ "step": 690
525
+ },
526
+ {
527
+ "epoch": 0.10359627053426076,
528
+ "grad_norm": 29.625,
529
+ "learning_rate": 0.00017244252047910892,
530
+ "loss": 3.8463,
531
+ "step": 700
532
+ },
533
+ {
534
+ "epoch": 0.10507621725617877,
535
+ "grad_norm": 12.75,
536
+ "learning_rate": 0.00017152434935448256,
537
+ "loss": 4.225,
538
+ "step": 710
539
+ },
540
+ {
541
+ "epoch": 0.1065561639780968,
542
+ "grad_norm": 9.0625,
543
+ "learning_rate": 0.0001705936635397259,
544
+ "loss": 3.5182,
545
+ "step": 720
546
+ },
547
+ {
548
+ "epoch": 0.1080361107000148,
549
+ "grad_norm": 14.75,
550
+ "learning_rate": 0.00016965062587790823,
551
+ "loss": 4.0649,
552
+ "step": 730
553
+ },
554
+ {
555
+ "epoch": 0.10951605742193281,
556
+ "grad_norm": 19.0,
557
+ "learning_rate": 0.00016869540137331445,
558
+ "loss": 4.2849,
559
+ "step": 740
560
+ },
561
+ {
562
+ "epoch": 0.11099600414385082,
563
+ "grad_norm": 34.5,
564
+ "learning_rate": 0.00016772815716257412,
565
+ "loss": 3.65,
566
+ "step": 750
567
+ },
568
+ {
569
+ "epoch": 0.11247595086576884,
570
+ "grad_norm": 11.0,
571
+ "learning_rate": 0.00016674906248541726,
572
+ "loss": 4.1331,
573
+ "step": 760
574
+ },
575
+ {
576
+ "epoch": 0.11395589758768684,
577
+ "grad_norm": 38.0,
578
+ "learning_rate": 0.00016575828865506245,
579
+ "loss": 3.5679,
580
+ "step": 770
581
+ },
582
+ {
583
+ "epoch": 0.11543584430960485,
584
+ "grad_norm": 9.8125,
585
+ "learning_rate": 0.0001647560090282419,
586
+ "loss": 3.573,
587
+ "step": 780
588
+ },
589
+ {
590
+ "epoch": 0.11691579103152286,
591
+ "grad_norm": 14.0,
592
+ "learning_rate": 0.000163742398974869,
593
+ "loss": 3.9581,
594
+ "step": 790
595
+ },
596
+ {
597
+ "epoch": 0.11839573775344088,
598
+ "grad_norm": 20.25,
599
+ "learning_rate": 0.0001627176358473537,
600
+ "loss": 4.0812,
601
+ "step": 800
602
+ },
603
+ {
604
+ "epoch": 0.11839573775344088,
605
+ "eval_loss": 4.436325550079346,
606
+ "eval_runtime": 113.0291,
607
+ "eval_samples_per_second": 13.289,
608
+ "eval_steps_per_second": 6.644,
609
+ "step": 800
610
  }
611
  ],
612
  "logging_steps": 10,
 
626
  "attributes": {}
627
  }
628
  },
629
+ "total_flos": 1.3006933354296115e+17,
630
  "train_batch_size": 2,
631
  "trial_name": null,
632
  "trial_params": null