rakhman-llm commited on
Commit
fc5c40b
·
verified ·
1 Parent(s): 980e4b9

Training in progress, step 12500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8551f34b08110ae0409b8440d3c6aebe368ed8d3db002ee5351bf9102cc149c5
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3440c751a156d2ea034af88bf09257dd0a13e2135cfbb097e39dfddddef310fd
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfd2de65b175040f5da9800eea2d0dc2b6a4d304b142893b8d0191497fd8ada1
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1457c08b9b3534e02e583e1c8e42d69598ca892990b039f5a31fe9230fe0935
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d27d7df73326e1c0aee717403c0adf323847184edf4276ba1b5631d4a53dd69
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f0a30224aa29ff1d82dc265e86d4dfad17d2e9441b1e7410af0f89b7ac502d4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e235e460f5bdfcaf65cc8c31e1e3b2c2350fb2d33d9a604b0cf64e4a8cef95de
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b03c363c915c0dcfc4a0726bfa268bae9d4c39d40263aa6bcf80af31957091aa
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.08194975554943085,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-12000",
4
- "epoch": 1.92,
5
  "eval_steps": 500,
6
- "global_step": 12000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1879,6 +1879,84 @@
1879
  "eval_samples_per_second": 17.123,
1880
  "eval_steps_per_second": 2.14,
1881
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1882
  }
1883
  ],
1884
  "logging_steps": 50,
@@ -1898,7 +1976,7 @@
1898
  "attributes": {}
1899
  }
1900
  },
1901
- "total_flos": 5.845995749376e+16,
1902
  "train_batch_size": 8,
1903
  "trial_name": null,
1904
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.08158940076828003,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
+ "epoch": 2.0,
5
  "eval_steps": 500,
6
+ "global_step": 12500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1879
  "eval_samples_per_second": 17.123,
1880
  "eval_steps_per_second": 2.14,
1881
  "step": 12000
1882
+ },
1883
+ {
1884
+ "epoch": 1.928,
1885
+ "grad_norm": 6469.14306640625,
1886
+ "learning_rate": 1.072e-05,
1887
+ "loss": 0.0584,
1888
+ "step": 12050
1889
+ },
1890
+ {
1891
+ "epoch": 1.936,
1892
+ "grad_norm": 7579.2998046875,
1893
+ "learning_rate": 1.0640000000000001e-05,
1894
+ "loss": 0.0573,
1895
+ "step": 12100
1896
+ },
1897
+ {
1898
+ "epoch": 1.944,
1899
+ "grad_norm": 8114.94921875,
1900
+ "learning_rate": 1.0559999999999999e-05,
1901
+ "loss": 0.0523,
1902
+ "step": 12150
1903
+ },
1904
+ {
1905
+ "epoch": 1.952,
1906
+ "grad_norm": 7263.44384765625,
1907
+ "learning_rate": 1.048e-05,
1908
+ "loss": 0.0517,
1909
+ "step": 12200
1910
+ },
1911
+ {
1912
+ "epoch": 1.96,
1913
+ "grad_norm": 8325.9580078125,
1914
+ "learning_rate": 1.04e-05,
1915
+ "loss": 0.0524,
1916
+ "step": 12250
1917
+ },
1918
+ {
1919
+ "epoch": 1.968,
1920
+ "grad_norm": 6577.01318359375,
1921
+ "learning_rate": 1.032e-05,
1922
+ "loss": 0.0533,
1923
+ "step": 12300
1924
+ },
1925
+ {
1926
+ "epoch": 1.976,
1927
+ "grad_norm": 6278.1826171875,
1928
+ "learning_rate": 1.024e-05,
1929
+ "loss": 0.0532,
1930
+ "step": 12350
1931
+ },
1932
+ {
1933
+ "epoch": 1.984,
1934
+ "grad_norm": 7769.2333984375,
1935
+ "learning_rate": 1.0160000000000001e-05,
1936
+ "loss": 0.0532,
1937
+ "step": 12400
1938
+ },
1939
+ {
1940
+ "epoch": 1.992,
1941
+ "grad_norm": 10089.91796875,
1942
+ "learning_rate": 1.008e-05,
1943
+ "loss": 0.0539,
1944
+ "step": 12450
1945
+ },
1946
+ {
1947
+ "epoch": 2.0,
1948
+ "grad_norm": 9177.8115234375,
1949
+ "learning_rate": 9.999999999999999e-06,
1950
+ "loss": 0.0588,
1951
+ "step": 12500
1952
+ },
1953
+ {
1954
+ "epoch": 2.0,
1955
+ "eval_loss": 0.08158940076828003,
1956
+ "eval_runtime": 116.7903,
1957
+ "eval_samples_per_second": 17.125,
1958
+ "eval_steps_per_second": 2.141,
1959
+ "step": 12500
1960
  }
1961
  ],
1962
  "logging_steps": 50,
 
1976
  "attributes": {}
1977
  }
1978
  },
1979
+ "total_flos": 6.0895789056e+16,
1980
  "train_batch_size": 8,
1981
  "trial_name": null,
1982
  "trial_params": null