SystemAdmin123 commited on
Commit
f2056e0
·
verified ·
1 Parent(s): 06def12

Training in progress, step 40, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,31 +1,31 @@
1
  {
2
- "_name_or_path": "Xenova/tiny-random-Phi3ForCausalLM",
3
  "architectures": [
4
- "Phi3ForCausalLM"
5
  ],
 
6
  "attention_dropout": 0.0,
7
- "bos_token_id": 1,
8
- "embd_pdrop": 0.0,
9
- "eos_token_id": 32000,
10
  "hidden_act": "silu",
11
- "hidden_size": 32,
12
  "initializer_range": 0.02,
13
- "intermediate_size": 64,
14
- "max_position_embeddings": 4096,
15
- "model_type": "phi3",
16
- "num_attention_heads": 4,
 
17
  "num_hidden_layers": 2,
18
- "num_key_value_heads": 4,
19
- "original_max_position_embeddings": 4096,
20
- "pad_token_id": 32000,
21
- "resid_pdrop": 0.0,
22
- "rms_norm_eps": 1e-05,
23
  "rope_scaling": null,
24
  "rope_theta": 10000.0,
25
- "sliding_window": 2047,
26
  "tie_word_embeddings": false,
27
  "torch_dtype": "bfloat16",
28
  "transformers_version": "4.48.1",
29
  "use_cache": false,
30
- "vocab_size": 32011
31
  }
 
1
  {
2
+ "_name_or_path": "JackFram/llama-68m",
3
  "architectures": [
4
+ "LlamaForCausalLM"
5
  ],
6
+ "attention_bias": false,
7
  "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
  "hidden_act": "silu",
12
+ "hidden_size": 768,
13
  "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
  "num_hidden_layers": 2,
20
+ "num_key_value_heads": 12,
21
+ "pad_token_id": 1,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
 
24
  "rope_scaling": null,
25
  "rope_theta": 10000.0,
 
26
  "tie_word_embeddings": false,
27
  "torch_dtype": "bfloat16",
28
  "transformers_version": "4.48.1",
29
  "use_cache": false,
30
+ "vocab_size": 32000
31
  }
last-checkpoint/generation_config.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "_from_model_config": true,
3
- "bos_token_id": 1,
4
  "do_sample": true,
5
- "eos_token_id": 32000,
6
- "pad_token_id": 32000,
7
  "transformers_version": "4.48.1"
8
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "bos_token_id": 0,
4
  "do_sample": true,
5
+ "eos_token_id": 2,
6
+ "pad_token_id": 1,
7
  "transformers_version": "4.48.1"
8
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1985749813398ab678fa16f868bbbedc4997e1118b6ce01f98888b06f4bc92c0
3
- size 4140280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb869319ab00023388d1e988acef0010027b3678790ccae9050ce5b80348b1f4
3
+ size 136062744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8084acbc3ed18c5405e1dae91524a8683c3de78b48f0c9f04ed0d349a7c73286
3
- size 36192998
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc3c66682da77ffab9c413076f3b17afc624d6825c4dac80307f4ad103e040b7
3
+ size 272133748
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6184eeaad9b63225dc77ef2ebed37b3393eb958bf1b0036e48dd0dd4dce8d3e8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e87af3203606e0bd212d0720508716542b16ace065167ca500c79080102e901
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b023e2a6d269c0076aa102ee08abf897f3861f92ad6f8e6c141a2add64f168fc
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a54818910295f3e603bf553aaaad45da6124c88f04cf1b2dc460c5f51f1e31aa
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cc15daee6eb8334e4ef77378d3160328ff03a12360233ca932fdc7422e74a5c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:859fab48bc4a8b0e143d55bf56530d10e2f3a05817c672c76810d5bbeff713c6
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1799d707236b96f35b0a65f363960fd5940a5fdfa4742f09a2e63a97bbd9dc57
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef1e3070a4e9003f48f81647995cb67a2bb43aeca20dae5ff1b28e862e9ece94
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b12d1bdab4d9b46cb08e585841e7243af006aa3bab548a59d48e6556aeddac40
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db80576a4a3f0a67791683c470270d702c1053ee59ca6c748f869f2a12aec02
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6bf839b2526d77010bd7aa679f8e366e5d97bb7f4d317d2d1da257b8178f8e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85269b28aa9e3df2ea51cc614858de45ed722dad5054cab73eff8aabd6ddff08
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55273bfecbe6270bcc0713428d0aef85b6f753e4a96acc6a4ba319abd3048438
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2057c4a62d7e5382a52efe2b7db7f3a010767758bccf53fbfe956f58b713059e
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:212b99b6d3d9b2cedd149518a5819f7b601ae13b9e1660a64dbe679fa815027c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99fe452d5446af1441f5b4102b5c5c984ca4eacd8c770f8cd502e07b90ab7438
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8546ece63abe39ed6d5eb52d3760f17bec10518134f71da680520ffc565f9e9f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fde4abb1ba5c270405a21e3f71ef7e0ec6bd9f941b0624669b81510eb3186ff6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.796208530805687,
5
  "eval_steps": 200,
6
- "global_step": 360,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,270 +11,38 @@
11
  {
12
  "epoch": 0.018957345971563982,
13
  "eval_loss": 4.226168632507324,
14
- "eval_runtime": 1.4697,
15
- "eval_samples_per_second": 1021.29,
16
- "eval_steps_per_second": 127.916,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.1895734597156398,
21
- "grad_norm": 1.3090990781784058,
22
  "learning_rate": 2.6666666666666667e-05,
23
- "loss": 3.5627,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.3791469194312796,
28
- "grad_norm": 0.8479055762290955,
29
  "learning_rate": 5.333333333333333e-05,
30
- "loss": 3.5438,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.5687203791469194,
35
- "grad_norm": 1.6812483072280884,
36
  "learning_rate": 8e-05,
37
- "loss": 3.3698,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.7582938388625592,
42
- "grad_norm": 0.815318763256073,
43
  "learning_rate": 0.00010666666666666667,
44
- "loss": 3.2253,
45
  "step": 40
46
- },
47
- {
48
- "epoch": 0.9478672985781991,
49
- "grad_norm": 4.510071754455566,
50
- "learning_rate": 0.00013333333333333334,
51
- "loss": 3.1158,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 1.132701421800948,
56
- "grad_norm": 2.076462507247925,
57
- "learning_rate": 0.00016,
58
- "loss": 3.0493,
59
- "step": 60
60
- },
61
- {
62
- "epoch": 1.3222748815165877,
63
- "grad_norm": 0.9293704032897949,
64
- "learning_rate": 0.0001866666666666667,
65
- "loss": 2.9581,
66
- "step": 70
67
- },
68
- {
69
- "epoch": 1.5118483412322274,
70
- "grad_norm": 2.360853910446167,
71
- "learning_rate": 0.00019999790210013988,
72
- "loss": 2.9473,
73
- "step": 80
74
- },
75
- {
76
- "epoch": 1.7014218009478674,
77
- "grad_norm": 2.270008087158203,
78
- "learning_rate": 0.0001999811194293973,
79
- "loss": 2.8436,
80
- "step": 90
81
- },
82
- {
83
- "epoch": 1.890995260663507,
84
- "grad_norm": 0.6728243231773376,
85
- "learning_rate": 0.00019994755690455152,
86
- "loss": 2.868,
87
- "step": 100
88
- },
89
- {
90
- "epoch": 2.075829383886256,
91
- "grad_norm": 1.5473170280456543,
92
- "learning_rate": 0.0001998972201584088,
93
- "loss": 2.8562,
94
- "step": 110
95
- },
96
- {
97
- "epoch": 2.265402843601896,
98
- "grad_norm": 0.5135859251022339,
99
- "learning_rate": 0.00019983011763899673,
100
- "loss": 2.8013,
101
- "step": 120
102
- },
103
- {
104
- "epoch": 2.4549763033175354,
105
- "grad_norm": 2.5838098526000977,
106
- "learning_rate": 0.00019974626060814647,
107
- "loss": 2.7669,
108
- "step": 130
109
- },
110
- {
111
- "epoch": 2.6445497630331753,
112
- "grad_norm": 0.6559956073760986,
113
- "learning_rate": 0.00019964566313960264,
114
- "loss": 2.8232,
115
- "step": 140
116
- },
117
- {
118
- "epoch": 2.834123222748815,
119
- "grad_norm": 0.6770644187927246,
120
- "learning_rate": 0.0001995283421166614,
121
- "loss": 2.7517,
122
- "step": 150
123
- },
124
- {
125
- "epoch": 3.018957345971564,
126
- "grad_norm": 0.5535929203033447,
127
- "learning_rate": 0.0001993943172293368,
128
- "loss": 2.8265,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 3.2085308056872037,
133
- "grad_norm": 0.8914756774902344,
134
- "learning_rate": 0.00019924361097105623,
135
- "loss": 2.6735,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 3.3981042654028437,
140
- "grad_norm": 2.5695221424102783,
141
- "learning_rate": 0.0001990762486348855,
142
- "loss": 2.7361,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 3.5876777251184833,
147
- "grad_norm": 2.0282135009765625,
148
- "learning_rate": 0.00019889225830928365,
149
- "loss": 2.6677,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 3.7772511848341233,
154
- "grad_norm": 0.7966509461402893,
155
- "learning_rate": 0.00019869167087338907,
156
- "loss": 2.7418,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 3.7772511848341233,
161
- "eval_loss": 2.798069953918457,
162
- "eval_runtime": 1.5133,
163
- "eval_samples_per_second": 991.865,
164
- "eval_steps_per_second": 124.231,
165
- "step": 200
166
- },
167
- {
168
- "epoch": 3.966824644549763,
169
- "grad_norm": 0.5286687612533569,
170
- "learning_rate": 0.00019847451999183694,
171
- "loss": 2.6955,
172
- "step": 210
173
- },
174
- {
175
- "epoch": 4.151658767772512,
176
- "grad_norm": 1.0543094873428345,
177
- "learning_rate": 0.00019824084210910925,
178
- "loss": 2.6716,
179
- "step": 220
180
- },
181
- {
182
- "epoch": 4.341232227488152,
183
- "grad_norm": 1.5443323850631714,
184
- "learning_rate": 0.00019799067644341844,
185
- "loss": 2.5233,
186
- "step": 230
187
- },
188
- {
189
- "epoch": 4.530805687203792,
190
- "grad_norm": 1.3000928163528442,
191
- "learning_rate": 0.0001977240649801253,
192
- "loss": 2.678,
193
- "step": 240
194
- },
195
- {
196
- "epoch": 4.720379146919432,
197
- "grad_norm": 0.7852599620819092,
198
- "learning_rate": 0.00019744105246469263,
199
- "loss": 2.5693,
200
- "step": 250
201
- },
202
- {
203
- "epoch": 4.909952606635071,
204
- "grad_norm": 0.8456063866615295,
205
- "learning_rate": 0.00019714168639517544,
206
- "loss": 2.6705,
207
- "step": 260
208
- },
209
- {
210
- "epoch": 5.0947867298578196,
211
- "grad_norm": 1.3332568407058716,
212
- "learning_rate": 0.0001968260170142496,
213
- "loss": 2.5806,
214
- "step": 270
215
- },
216
- {
217
- "epoch": 5.2843601895734595,
218
- "grad_norm": 0.7185400724411011,
219
- "learning_rate": 0.00019649409730077935,
220
- "loss": 2.5836,
221
- "step": 280
222
- },
223
- {
224
- "epoch": 5.4739336492890995,
225
- "grad_norm": 4.849304676055908,
226
- "learning_rate": 0.000196145982960926,
227
- "loss": 2.4851,
228
- "step": 290
229
- },
230
- {
231
- "epoch": 5.6635071090047395,
232
- "grad_norm": 0.7679084539413452,
233
- "learning_rate": 0.00019578173241879872,
234
- "loss": 2.5896,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 5.8530805687203795,
239
- "grad_norm": 0.6689910888671875,
240
- "learning_rate": 0.00019540140680664913,
241
- "loss": 2.5587,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 6.037914691943128,
246
- "grad_norm": 0.5706362128257751,
247
- "learning_rate": 0.0001950050699546116,
248
- "loss": 2.5373,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 6.2274881516587675,
253
- "grad_norm": 2.4172613620758057,
254
- "learning_rate": 0.00019459278837999046,
255
- "loss": 2.3316,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 6.4170616113744074,
260
- "grad_norm": 0.724682092666626,
261
- "learning_rate": 0.00019416463127609656,
262
- "loss": 2.5669,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 6.606635071090047,
267
- "grad_norm": 0.4259670674800873,
268
- "learning_rate": 0.00019372067050063438,
269
- "loss": 2.4148,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 6.796208530805687,
274
- "grad_norm": 1.5209062099456787,
275
- "learning_rate": 0.00019326098056364222,
276
- "loss": 2.4746,
277
- "step": 360
278
  }
279
  ],
280
  "logging_steps": 10,
@@ -294,7 +62,7 @@
294
  "attributes": {}
295
  }
296
  },
297
- "total_flos": 5.410483850969088e+16,
298
  "train_batch_size": 1,
299
  "trial_name": null,
300
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7582938388625592,
5
  "eval_steps": 200,
6
+ "global_step": 40,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.018957345971563982,
13
  "eval_loss": 4.226168632507324,
14
+ "eval_runtime": 1.1061,
15
+ "eval_samples_per_second": 1357.025,
16
+ "eval_steps_per_second": 169.967,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.1895734597156398,
21
+ "grad_norm": 3.125,
22
  "learning_rate": 2.6666666666666667e-05,
23
+ "loss": 3.5678,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.3791469194312796,
28
+ "grad_norm": 1.65625,
29
  "learning_rate": 5.333333333333333e-05,
30
+ "loss": 3.5539,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.5687203791469194,
35
+ "grad_norm": 3.328125,
36
  "learning_rate": 8e-05,
37
+ "loss": 3.3496,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.7582938388625592,
42
+ "grad_norm": 1.6640625,
43
  "learning_rate": 0.00010666666666666667,
44
+ "loss": 3.1688,
45
  "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  }
47
  ],
48
  "logging_steps": 10,
 
62
  "attributes": {}
63
  }
64
  },
65
+ "total_flos": 5467804752936960.0,
66
  "train_batch_size": 1,
67
  "trial_name": null,
68
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab35bb512bd0815bfb48651f4d2ed30517d244444ffa68a91ce2ff049faa81b0
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5414fad3ccf622c5fff1f84e82069b56b14de9a52dbbb9ddf1d853ff6aff2a29
3
  size 6840