rakhman-llm commited on
Commit
965f01b
·
verified ·
1 Parent(s): 4015a9b

Training in progress, step 9500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2d552abe54bc8b78698f085328cba7d4ca9faffbe5c176d69ae6298a741dc1f
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e166df977f21f4e74668f730deac309df6a571f19acaa0dfa89c2e3c2819431
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23a4a556c668ac483f251fb2542a9567fdbeee24015d2afebd6c40ac1db320d9
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e99baec34b5da3301e09eb5c4f9d3ef873b304cf96feddd1cfe61690f168589
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d44945f90b9153cb2a69f9475450a2111e7f4db9372411c8f14f263baffa58d1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91f8e7f84c942f83f5d877ebb335efd09f2d70c83c135e4e031bf84b0af23af9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5115d8af133b4318a75821bef26ae5de84c140c9bd70b7ac727a7737d85cfbd6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e8e6e071a0a58044a0e6d93d2bb93dd252e4dfa6d5810853dc9dadf39da6ab9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08268450945615768,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-9000",
4
- "epoch": 1.44,
5
  "eval_steps": 500,
6
- "global_step": 9000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1411,6 +1411,84 @@
1411
  "eval_samples_per_second": 17.12,
1412
  "eval_steps_per_second": 2.14,
1413
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1414
  }
1415
  ],
1416
  "logging_steps": 50,
@@ -1430,7 +1508,7 @@
1430
  "attributes": {}
1431
  }
1432
  },
1433
- "total_flos": 4.384496812032e+16,
1434
  "train_batch_size": 8,
1435
  "trial_name": null,
1436
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08252418041229248,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-9500",
4
+ "epoch": 1.52,
5
  "eval_steps": 500,
6
+ "global_step": 9500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1411
  "eval_samples_per_second": 17.12,
1412
  "eval_steps_per_second": 2.14,
1413
  "step": 9000
1414
+ },
1415
+ {
1416
+ "epoch": 1.448,
1417
+ "grad_norm": 5681.900390625,
1418
+ "learning_rate": 1.552e-05,
1419
+ "loss": 0.0549,
1420
+ "step": 9050
1421
+ },
1422
+ {
1423
+ "epoch": 1.456,
1424
+ "grad_norm": 5498.50048828125,
1425
+ "learning_rate": 1.544e-05,
1426
+ "loss": 0.0551,
1427
+ "step": 9100
1428
+ },
1429
+ {
1430
+ "epoch": 1.464,
1431
+ "grad_norm": 7044.8017578125,
1432
+ "learning_rate": 1.5360000000000002e-05,
1433
+ "loss": 0.0557,
1434
+ "step": 9150
1435
+ },
1436
+ {
1437
+ "epoch": 1.472,
1438
+ "grad_norm": 8311.8076171875,
1439
+ "learning_rate": 1.528e-05,
1440
+ "loss": 0.0559,
1441
+ "step": 9200
1442
+ },
1443
+ {
1444
+ "epoch": 1.48,
1445
+ "grad_norm": 10259.4189453125,
1446
+ "learning_rate": 1.5200000000000002e-05,
1447
+ "loss": 0.057,
1448
+ "step": 9250
1449
+ },
1450
+ {
1451
+ "epoch": 1.488,
1452
+ "grad_norm": 7944.630859375,
1453
+ "learning_rate": 1.5120000000000001e-05,
1454
+ "loss": 0.0541,
1455
+ "step": 9300
1456
+ },
1457
+ {
1458
+ "epoch": 1.496,
1459
+ "grad_norm": 9513.1875,
1460
+ "learning_rate": 1.504e-05,
1461
+ "loss": 0.0558,
1462
+ "step": 9350
1463
+ },
1464
+ {
1465
+ "epoch": 1.504,
1466
+ "grad_norm": 6013.54296875,
1467
+ "learning_rate": 1.4959999999999999e-05,
1468
+ "loss": 0.0532,
1469
+ "step": 9400
1470
+ },
1471
+ {
1472
+ "epoch": 1.512,
1473
+ "grad_norm": 7162.22314453125,
1474
+ "learning_rate": 1.488e-05,
1475
+ "loss": 0.0553,
1476
+ "step": 9450
1477
+ },
1478
+ {
1479
+ "epoch": 1.52,
1480
+ "grad_norm": 6351.9833984375,
1481
+ "learning_rate": 1.48e-05,
1482
+ "loss": 0.0548,
1483
+ "step": 9500
1484
+ },
1485
+ {
1486
+ "epoch": 1.52,
1487
+ "eval_loss": 0.08252418041229248,
1488
+ "eval_runtime": 116.7082,
1489
+ "eval_samples_per_second": 17.137,
1490
+ "eval_steps_per_second": 2.142,
1491
+ "step": 9500
1492
  }
1493
  ],
1494
  "logging_steps": 50,
 
1508
  "attributes": {}
1509
  }
1510
  },
1511
+ "total_flos": 4.628079968256e+16,
1512
  "train_batch_size": 8,
1513
  "trial_name": null,
1514
  "trial_params": null