SystemAdmin123 commited on
Commit
82aecc5
·
verified ·
1 Parent(s): eed8544

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:861ef298194ce5f5a95b7fb9f3de924006083ce8f6c72fe8d11804878d22cf8d
3
  size 4874664552
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b69510103c86ab6c7d79f2705fcd3bc77def1938ff647a7523546cfaa6d7eb1
3
  size 4874664552
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37c1108f60a343761b5ac77501f9ccf49e589e8a6d041d69da488eae8b26115a
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99f48df3f5accdfbdae4f827024db4d1dabfa7cf1d3dd237e8d4c6cbab835ee5
3
  size 4932751008
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:273aefb95ae0322c1ea6456d07840a68e51e7b149e3a956ddfd8f8329c4d22f2
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:464a609182441fb88482d5a7a508cf91905e87e8d57b74fbf74ed4610d98783e
3
  size 4330865200
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0ef8a492aafc49d33deb349f2977b2c19503213b78477921a2921a786f30786
3
  size 1086998656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58e53b04f20e02489ca57de06513f0a94a759f82223847010fa4abd53f07e10b
3
  size 1086998656
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c60b475db0734986c42539b6a749e4d2824c9876d9e2050469711b4b0de5a01f
3
  size 15465450874
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c86931acc77817b6206577a3a39862adf7debd3891af1002261268d4e9e1acef
3
  size 15465450874
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f105b50093489fd740a8f6602ddfe873d388d5a0db4242d627bbb8ba93713d73
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69a04a1208f7a0d6f51f37a136b5c2e55bf3f53b3d0fd57164c5b83ca47a2645
3
+ size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ac67ce68566e593180659981e408823cfcf7642579fefd1cbac723e565bb9c8
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:080a7e72d6be938a9418e60003db90412af8a61e6434f9e9f1b598cca861dbcd
3
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3d114a75d37be476b865187eb2b3d29d9343b131614a08f42be0014f110ce6f
3
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fc5a0f78838743362c5d5378dff81ea2f7d0039da53a423f1759e861bc6b233
3
+ size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43a0c31b135429597fb4601695aee5de28b0f5fb6a2c4e45d278a9e4001d0c26
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86aa1c590799d718328ad7b7198db3fa4678198705c85eb25b7f257d9e38e2cd
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,113 +1,175 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5235602094240838,
5
- "eval_steps": 50,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.005235602094240838,
13
- "eval_loss": 2.384796619415283,
14
- "eval_runtime": 62.0394,
15
- "eval_samples_per_second": 24.194,
16
- "eval_steps_per_second": 6.061,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.05235602094240838,
21
- "grad_norm": 5.15625,
22
- "learning_rate": 3.6363636363636364e-05,
23
- "loss": 1.9242,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.10471204188481675,
28
- "grad_norm": 4.59375,
29
- "learning_rate": 7.272727272727273e-05,
30
- "loss": 1.845,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 0.15706806282722513,
35
- "grad_norm": 3.484375,
36
- "learning_rate": 0.00010909090909090909,
37
- "loss": 1.8311,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 0.2094240837696335,
42
- "grad_norm": 3.390625,
43
- "learning_rate": 0.00014545454545454546,
44
- "loss": 2.0034,
45
  "step": 40
46
  },
47
  {
48
- "epoch": 0.2617801047120419,
49
- "grad_norm": 3.59375,
50
- "learning_rate": 0.00018181818181818183,
51
- "loss": 2.2024,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 0.2617801047120419,
56
- "eval_loss": 2.1606948375701904,
57
- "eval_runtime": 60.3553,
58
- "eval_samples_per_second": 24.869,
59
- "eval_steps_per_second": 6.23,
60
  "step": 50
61
  },
62
  {
63
- "epoch": 0.31413612565445026,
64
- "grad_norm": 3.71875,
65
- "learning_rate": 0.00019998870284726968,
66
- "loss": 2.4031,
67
  "step": 60
68
  },
69
  {
70
- "epoch": 0.36649214659685864,
71
- "grad_norm": 3.359375,
72
- "learning_rate": 0.00019989834093992945,
73
- "loss": 2.5623,
74
  "step": 70
75
  },
76
  {
77
- "epoch": 0.418848167539267,
78
- "grad_norm": 3.578125,
79
- "learning_rate": 0.00019971769878721743,
80
- "loss": 2.734,
81
  "step": 80
82
  },
83
  {
84
- "epoch": 0.4712041884816754,
85
- "grad_norm": 3.21875,
86
- "learning_rate": 0.00019944693963927092,
87
- "loss": 2.7436,
88
  "step": 90
89
  },
90
  {
91
- "epoch": 0.5235602094240838,
92
- "grad_norm": 5.75,
93
- "learning_rate": 0.00019908630818686338,
94
- "loss": 2.8422,
95
  "step": 100
96
  },
97
  {
98
- "epoch": 0.5235602094240838,
99
- "eval_loss": 2.754179000854492,
100
- "eval_runtime": 57.9266,
101
- "eval_samples_per_second": 25.912,
102
- "eval_steps_per_second": 6.491,
103
- "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  }
105
  ],
106
  "logging_steps": 10,
107
- "max_steps": 1100,
108
  "num_input_tokens_seen": 0,
109
  "num_train_epochs": 6,
110
- "save_steps": 50,
111
  "stateful_callbacks": {
112
  "TrainerControl": {
113
  "args": {
@@ -120,7 +182,7 @@
120
  "attributes": {}
121
  }
122
  },
123
- "total_flos": 3.474614315384832e+16,
124
  "train_batch_size": 2,
125
  "trial_name": null,
126
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.1052631578947367,
5
+ "eval_steps": 200,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.010526315789473684,
13
+ "eval_loss": 2.373548746109009,
14
+ "eval_runtime": 30.8211,
15
+ "eval_samples_per_second": 48.7,
16
+ "eval_steps_per_second": 6.1,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.10526315789473684,
21
+ "grad_norm": 4.1875,
22
+ "learning_rate": 8e-05,
23
+ "loss": 1.916,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.21052631578947367,
28
+ "grad_norm": 3.21875,
29
+ "learning_rate": 0.00016,
30
+ "loss": 1.8779,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 0.3157894736842105,
35
+ "grad_norm": 2.515625,
36
+ "learning_rate": 0.00019994532573409262,
37
+ "loss": 2.1362,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 0.42105263157894735,
42
+ "grad_norm": 2.296875,
43
+ "learning_rate": 0.00019950829025450114,
44
+ "loss": 2.3736,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 0.5263157894736842,
49
+ "grad_norm": 3.15625,
50
+ "learning_rate": 0.00019863613034027224,
51
+ "loss": 2.4914,
 
 
 
 
 
 
 
 
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 0.631578947368421,
56
+ "grad_norm": 2.46875,
57
+ "learning_rate": 0.0001973326597248006,
58
+ "loss": 2.5562,
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 0.7368421052631579,
63
+ "grad_norm": 3.953125,
64
+ "learning_rate": 0.00019560357815343577,
65
+ "loss": 2.7778,
66
  "step": 70
67
  },
68
  {
69
+ "epoch": 0.8421052631578947,
70
+ "grad_norm": 4.5625,
71
+ "learning_rate": 0.0001934564464599461,
72
+ "loss": 2.6995,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 0.9473684210526315,
77
+ "grad_norm": 17.0,
78
+ "learning_rate": 0.00019090065350491626,
79
+ "loss": 2.619,
80
  "step": 90
81
  },
82
  {
83
+ "epoch": 1.0526315789473684,
84
+ "grad_norm": 2.65625,
85
+ "learning_rate": 0.0001879473751206489,
86
+ "loss": 1.9637,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 1.1578947368421053,
91
+ "grad_norm": 2.15625,
92
+ "learning_rate": 0.00018460952524209355,
93
+ "loss": 1.1648,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 1.263157894736842,
98
+ "grad_norm": 1.8125,
99
+ "learning_rate": 0.00018090169943749476,
100
+ "loss": 1.2698,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 1.368421052631579,
105
+ "grad_norm": 2.265625,
106
+ "learning_rate": 0.00017684011108568592,
107
+ "loss": 1.2703,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 1.4736842105263157,
112
+ "grad_norm": 2.1875,
113
+ "learning_rate": 0.00017244252047910892,
114
+ "loss": 1.3674,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 1.5789473684210527,
119
+ "grad_norm": 2.0625,
120
+ "learning_rate": 0.00016772815716257412,
121
+ "loss": 1.3329,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 1.6842105263157894,
126
+ "grad_norm": 2.90625,
127
+ "learning_rate": 0.0001627176358473537,
128
+ "loss": 1.3702,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 1.7894736842105263,
133
+ "grad_norm": 2.078125,
134
+ "learning_rate": 0.00015743286626829437,
135
+ "loss": 1.3481,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 1.8947368421052633,
140
+ "grad_norm": 2.1875,
141
+ "learning_rate": 0.00015189695737812152,
142
+ "loss": 1.4838,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 2.0,
147
+ "grad_norm": 2.046875,
148
+ "learning_rate": 0.0001461341162978688,
149
+ "loss": 1.3654,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 2.1052631578947367,
154
+ "grad_norm": 1.4765625,
155
+ "learning_rate": 0.00014016954246529696,
156
+ "loss": 0.517,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 2.1052631578947367,
161
+ "eval_loss": 3.045600652694702,
162
+ "eval_runtime": 29.0213,
163
+ "eval_samples_per_second": 51.721,
164
+ "eval_steps_per_second": 6.478,
165
+ "step": 200
166
  }
167
  ],
168
  "logging_steps": 10,
169
+ "max_steps": 500,
170
  "num_input_tokens_seen": 0,
171
  "num_train_epochs": 6,
172
+ "save_steps": 200,
173
  "stateful_callbacks": {
174
  "TrainerControl": {
175
  "args": {
 
182
  "attributes": {}
183
  }
184
  },
185
+ "total_flos": 1.3898457261539328e+17,
186
  "train_batch_size": 2,
187
  "trial_name": null,
188
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32a7a8613e5fa3317cdc198f56f6d0577b15eb3e0cf0efd4aa72ac710a8260e0
3
  size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d11551a59df892a04244070ff3f59b02e2dcaa3eb1aab844864a42113437689
3
  size 6968