SystemAdmin123 committed on
Commit
22b9323
·
verified ·
1 Parent(s): 31a7843

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef38e23f75049612d4593984a7a3ca4d8bb15ca4c375864be026d287d2b3887c
3
  size 4949934200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb956b595ae08ed63fbf1610a2cfe05de08b959291a75296518a168d063764b6
3
  size 4949934200
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60e07fbe280b472c7130a237eb85fec83e68ee6443f3e43bba1ecad5a003ea84
3
  size 1110862568
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:463a2b5019a7f5d99135a4b074e05819fb63e66fc2befd429b6109b0a03d6ebb
3
  size 1110862568
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3bd6b82fe6f7cb79e53c940f5432d5d84b10deb3c0f16e37914c245e9f465d4
3
  size 6159835552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81a9ce88dbdaec68aeaab7e347846a6d2fe4b90053ffda8da98a7be27e9df5dc
3
  size 6159835552
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01bb9290ce3d59c6ff6ad761e9fc828c58d7a70fee34981771ade9f75e7a558c
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:893344d0f191044a7c114fbf834353023abd8a24ba38e1e169829989ca44cb8f
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e8b452ae4994b1cf755a70d101c2857ab3a01f90d161421b4c6b76a66a30614
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6013d90416725816cb54a25c41a3a14fbb2cd743723bd5c562900e4ffe4f22e3
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f818275221580f93945cc9ffeac693f8727beebedb70d39d160511e90bb32454
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f50a6fddae7f536fdcd52ed2c2881024b7c2eff412e5c9fe5ed024666af442a5
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fb27e90b9efb2bc1ba3aca816751650d05e50ae0229533972456bd829f8572c
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1daeb68d959a75fd6e1b227ebb24225f2543835019b429ede13432db5890912b
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e4c78e85c4ba926d25150d4aaddeaf5728dcb066f4afc01202e3e56f29a5487
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ad54995b081fae25638228c5d9c8f38ca277e5c5ad00bc3e49897b543f84405
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.090909090909092,
5
  "eval_steps": 200,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -311,6 +311,154 @@
311
  "eval_samples_per_second": 115.091,
312
  "eval_steps_per_second": 5.827,
313
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  }
315
  ],
316
  "logging_steps": 10,
@@ -325,12 +473,12 @@
325
  "should_evaluate": false,
326
  "should_log": false,
327
  "should_save": true,
328
- "should_training_stop": false
329
  },
330
  "attributes": {}
331
  }
332
  },
333
- "total_flos": 2.827003985259397e+17,
334
  "train_batch_size": 5,
335
  "trial_name": null,
336
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 13.636363636363637,
5
  "eval_steps": 200,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
311
  "eval_samples_per_second": 115.091,
312
  "eval_steps_per_second": 5.827,
313
  "step": 400
314
+ },
315
+ {
316
+ "epoch": 9.318181818181818,
317
+ "grad_norm": 0.171875,
318
+ "learning_rate": 5.000000000000002e-05,
319
+ "loss": 0.0071,
320
+ "step": 410
321
+ },
322
+ {
323
+ "epoch": 9.545454545454545,
324
+ "grad_norm": 0.16796875,
325
+ "learning_rate": 4.530518418775733e-05,
326
+ "loss": 0.0072,
327
+ "step": 420
328
+ },
329
+ {
330
+ "epoch": 9.772727272727273,
331
+ "grad_norm": 0.2392578125,
332
+ "learning_rate": 4.077647473350201e-05,
333
+ "loss": 0.0071,
334
+ "step": 430
335
+ },
336
+ {
337
+ "epoch": 10.0,
338
+ "grad_norm": 0.1572265625,
339
+ "learning_rate": 3.642762517900322e-05,
340
+ "loss": 0.0069,
341
+ "step": 440
342
+ },
343
+ {
344
+ "epoch": 10.227272727272727,
345
+ "grad_norm": 0.1904296875,
346
+ "learning_rate": 3.227184283742591e-05,
347
+ "loss": 0.0066,
348
+ "step": 450
349
+ },
350
+ {
351
+ "epoch": 10.454545454545455,
352
+ "grad_norm": 0.1416015625,
353
+ "learning_rate": 2.8321748683154893e-05,
354
+ "loss": 0.0066,
355
+ "step": 460
356
+ },
357
+ {
358
+ "epoch": 10.681818181818182,
359
+ "grad_norm": 0.15625,
360
+ "learning_rate": 2.4589339022310386e-05,
361
+ "loss": 0.0067,
362
+ "step": 470
363
+ },
364
+ {
365
+ "epoch": 10.909090909090908,
366
+ "grad_norm": 0.1376953125,
367
+ "learning_rate": 2.1085949060360654e-05,
368
+ "loss": 0.0062,
369
+ "step": 480
370
+ },
371
+ {
372
+ "epoch": 11.136363636363637,
373
+ "grad_norm": 0.15625,
374
+ "learning_rate": 1.7822218477475494e-05,
375
+ "loss": 0.0065,
376
+ "step": 490
377
+ },
378
+ {
379
+ "epoch": 11.363636363636363,
380
+ "grad_norm": 0.130859375,
381
+ "learning_rate": 1.4808059116167305e-05,
382
+ "loss": 0.0062,
383
+ "step": 500
384
+ },
385
+ {
386
+ "epoch": 11.590909090909092,
387
+ "grad_norm": 0.134765625,
388
+ "learning_rate": 1.2052624879351104e-05,
389
+ "loss": 0.0061,
390
+ "step": 510
391
+ },
392
+ {
393
+ "epoch": 11.818181818181818,
394
+ "grad_norm": 0.1455078125,
395
+ "learning_rate": 9.564283930242257e-06,
396
+ "loss": 0.0064,
397
+ "step": 520
398
+ },
399
+ {
400
+ "epoch": 12.045454545454545,
401
+ "grad_norm": 0.12255859375,
402
+ "learning_rate": 7.350593278519824e-06,
403
+ "loss": 0.0059,
404
+ "step": 530
405
+ },
406
+ {
407
+ "epoch": 12.272727272727273,
408
+ "grad_norm": 0.162109375,
409
+ "learning_rate": 5.418275829936537e-06,
410
+ "loss": 0.006,
411
+ "step": 540
412
+ },
413
+ {
414
+ "epoch": 12.5,
415
+ "grad_norm": 0.1318359375,
416
+ "learning_rate": 3.7731999690749585e-06,
417
+ "loss": 0.0061,
418
+ "step": 550
419
+ },
420
+ {
421
+ "epoch": 12.727272727272727,
422
+ "grad_norm": 0.11962890625,
423
+ "learning_rate": 2.420361737256438e-06,
424
+ "loss": 0.0064,
425
+ "step": 560
426
+ },
427
+ {
428
+ "epoch": 12.954545454545455,
429
+ "grad_norm": 0.1376953125,
430
+ "learning_rate": 1.3638696597277679e-06,
431
+ "loss": 0.0061,
432
+ "step": 570
433
+ },
434
+ {
435
+ "epoch": 13.181818181818182,
436
+ "grad_norm": 0.158203125,
437
+ "learning_rate": 6.069322682050516e-07,
438
+ "loss": 0.0063,
439
+ "step": 580
440
+ },
441
+ {
442
+ "epoch": 13.409090909090908,
443
+ "grad_norm": 0.11767578125,
444
+ "learning_rate": 1.518483566683826e-07,
445
+ "loss": 0.006,
446
+ "step": 590
447
+ },
448
+ {
449
+ "epoch": 13.636363636363637,
450
+ "grad_norm": 0.15625,
451
+ "learning_rate": 0.0,
452
+ "loss": 0.0063,
453
+ "step": 600
454
+ },
455
+ {
456
+ "epoch": 13.636363636363637,
457
+ "eval_loss": 4.043154239654541,
458
+ "eval_runtime": 13.032,
459
+ "eval_samples_per_second": 115.178,
460
+ "eval_steps_per_second": 5.832,
461
+ "step": 600
462
  }
463
  ],
464
  "logging_steps": 10,
 
473
  "should_evaluate": false,
474
  "should_log": false,
475
  "should_save": true,
476
+ "should_training_stop": true
477
  },
478
  "attributes": {}
479
  }
480
  },
481
+ "total_flos": 4.240152158483251e+17,
482
  "train_batch_size": 5,
483
  "trial_name": null,
484
  "trial_params": null