chansung committed on
Commit
b645907
·
verified ·
1 Parent(s): fde9cae

Model save

Browse files
Files changed (4) hide show
  1. README.md +8 -9
  2. all_results.json +5 -10
  3. train_results.json +5 -5
  4. trainer_state.json +456 -148
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
  base_model: mistralai/Mistral-7B-v0.3
3
  datasets:
4
- - llama-duo/synth_summarize_dataset_dedup
5
  library_name: peft
6
  license: apache-2.0
7
  tags:
8
- - alignment-handbook
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # mistral-7b-0.3-gpt4o_100k_summarize-lora
21
 
22
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the llama-duo/synth_summarize_dataset_dedup dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.8227
25
 
26
  ## Model description
27
 
@@ -41,14 +40,14 @@ More information needed
41
 
42
  The following hyperparameters were used during training:
43
  - learning_rate: 0.0002
44
- - train_batch_size: 4
45
- - eval_batch_size: 4
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
  - num_devices: 8
49
  - gradient_accumulation_steps: 2
50
- - total_train_batch_size: 64
51
- - total_eval_batch_size: 32
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
@@ -58,7 +57,7 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 0.742 | 1.0 | 219 | 1.8227 |
62
 
63
 
64
  ### Framework versions
 
1
  ---
2
  base_model: mistralai/Mistral-7B-v0.3
3
  datasets:
4
+ - generator
5
  library_name: peft
6
  license: apache-2.0
7
  tags:
 
8
  - trl
9
  - sft
10
  - generated_from_trainer
 
18
 
19
  # mistral-7b-0.3-gpt4o_100k_summarize-lora
20
 
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.8165
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 0.0002
43
+ - train_batch_size: 2
44
+ - eval_batch_size: 2
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
  - num_devices: 8
48
  - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 32
50
+ - total_eval_batch_size: 16
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.7129 | 1.0 | 438 | 1.8165 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.8227200508117676,
4
- "eval_runtime": 1.5837,
5
- "eval_samples": 25,
6
- "eval_samples_per_second": 7.577,
7
- "eval_steps_per_second": 0.631,
8
- "total_flos": 6.128945867501076e+17,
9
- "train_loss": 0.8279621209183784,
10
- "train_runtime": 2699.8939,
11
  "train_samples": 115376,
12
- "train_samples_per_second": 5.187,
13
- "train_steps_per_second": 0.081
14
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 6.128945862132367e+17,
4
+ "train_loss": 0.7845394540595138,
5
+ "train_runtime": 3064.8006,
 
 
 
 
 
6
  "train_samples": 115376,
7
+ "train_samples_per_second": 4.569,
8
+ "train_steps_per_second": 0.143
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 6.128945867501076e+17,
4
- "train_loss": 0.8279621209183784,
5
- "train_runtime": 2699.8939,
6
  "train_samples": 115376,
7
- "train_samples_per_second": 5.187,
8
- "train_steps_per_second": 0.081
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 6.128945862132367e+17,
4
+ "train_loss": 0.7845394540595138,
5
+ "train_runtime": 3064.8006,
6
  "train_samples": 115376,
7
+ "train_samples_per_second": 4.569,
8
+ "train_steps_per_second": 0.143
9
  }
trainer_state.json CHANGED
@@ -3,339 +3,647 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 219,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0045662100456621,
13
- "grad_norm": 6.815864086151123,
14
- "learning_rate": 9.090909090909091e-06,
15
- "loss": 1.7729,
16
  "step": 1
17
  },
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.0228310502283105,
20
- "grad_norm": 5.818741798400879,
21
  "learning_rate": 4.545454545454546e-05,
22
- "loss": 1.7132,
23
- "step": 5
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.045662100456621,
27
- "grad_norm": 2.6188273429870605,
28
  "learning_rate": 9.090909090909092e-05,
29
- "loss": 1.4372,
30
- "step": 10
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.0684931506849315,
34
- "grad_norm": 1.4498921632766724,
35
  "learning_rate": 0.00013636363636363637,
36
- "loss": 1.2124,
37
- "step": 15
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.091324200913242,
41
- "grad_norm": 1.0618371963500977,
42
  "learning_rate": 0.00018181818181818183,
43
- "loss": 1.0648,
44
- "step": 20
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.1141552511415525,
48
- "grad_norm": 0.8222594857215881,
49
  "learning_rate": 0.00019988558131018186,
50
- "loss": 0.9718,
51
- "step": 25
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.136986301369863,
55
- "grad_norm": 0.621496856212616,
56
  "learning_rate": 0.00019918730395931649,
57
- "loss": 0.9004,
58
- "step": 30
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 0.1598173515981735,
62
- "grad_norm": 0.3935670554637909,
63
  "learning_rate": 0.00019785874696801202,
64
- "loss": 0.8609,
65
- "step": 35
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 0.182648401826484,
69
- "grad_norm": 0.37850192189216614,
70
  "learning_rate": 0.00019590835257019714,
71
- "loss": 0.8519,
72
- "step": 40
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 0.2054794520547945,
76
- "grad_norm": 0.33678698539733887,
77
  "learning_rate": 0.00019334851442746664,
78
- "loss": 0.8249,
79
- "step": 45
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 0.228310502283105,
83
- "grad_norm": 0.5497208833694458,
84
  "learning_rate": 0.00019019549887431877,
85
- "loss": 0.8155,
86
- "step": 50
 
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.2511415525114155,
90
- "grad_norm": 0.4304714798927307,
91
  "learning_rate": 0.00018646934155473022,
92
- "loss": 0.8072,
93
- "step": 55
 
 
 
 
 
 
 
94
  },
95
  {
96
  "epoch": 0.273972602739726,
97
- "grad_norm": 0.3267618417739868,
98
  "learning_rate": 0.00018219372010688515,
99
- "loss": 0.792,
100
- "step": 60
 
 
 
 
 
 
 
101
  },
102
  {
103
  "epoch": 0.2968036529680365,
104
- "grad_norm": 0.5633127689361572,
105
  "learning_rate": 0.00017739580370507532,
106
- "loss": 0.7879,
107
- "step": 65
 
 
 
 
 
 
 
108
  },
109
  {
110
  "epoch": 0.319634703196347,
111
- "grad_norm": 0.39324459433555603,
112
  "learning_rate": 0.0001721060804148482,
113
- "loss": 0.7944,
114
- "step": 70
 
 
 
 
 
 
 
115
  },
116
  {
117
  "epoch": 0.3424657534246575,
118
- "grad_norm": 0.3261910080909729,
119
  "learning_rate": 0.0001663581634584641,
120
- "loss": 0.7872,
121
- "step": 75
 
 
 
 
 
 
 
122
  },
123
  {
124
  "epoch": 0.365296803652968,
125
- "grad_norm": 0.3310278356075287,
126
  "learning_rate": 0.0001601885776217367,
127
- "loss": 0.7789,
128
- "step": 80
 
 
 
 
 
 
 
129
  },
130
  {
131
  "epoch": 0.3881278538812785,
132
- "grad_norm": 0.356475830078125,
133
  "learning_rate": 0.0001536365271595212,
134
- "loss": 0.7805,
135
- "step": 85
 
 
 
 
 
 
 
136
  },
137
  {
138
  "epoch": 0.410958904109589,
139
- "grad_norm": 0.6090440154075623,
140
  "learning_rate": 0.0001467436466746814,
141
- "loss": 0.7725,
142
- "step": 90
 
 
 
 
 
 
 
143
  },
144
  {
145
  "epoch": 0.4337899543378995,
146
- "grad_norm": 0.648769736289978,
147
  "learning_rate": 0.0001395537365535585,
148
- "loss": 0.7696,
149
- "step": 95
 
 
 
 
 
 
 
150
  },
151
  {
152
  "epoch": 0.45662100456621,
153
- "grad_norm": 0.3896843492984772,
154
  "learning_rate": 0.00013211248463910262,
155
- "loss": 0.7703,
156
- "step": 100
 
 
 
 
 
 
 
157
  },
158
  {
159
  "epoch": 0.4794520547945205,
160
- "grad_norm": 0.40256544947624207,
161
  "learning_rate": 0.00012446717591027624,
162
- "loss": 0.7577,
163
- "step": 105
 
 
 
 
 
 
 
164
  },
165
  {
166
  "epoch": 0.502283105022831,
167
- "grad_norm": 0.3406723141670227,
168
  "learning_rate": 0.00011666639201255506,
169
- "loss": 0.7572,
170
- "step": 110
 
 
 
 
 
 
 
171
  },
172
  {
173
  "epoch": 0.5251141552511416,
174
- "grad_norm": 0.4368165135383606,
175
  "learning_rate": 0.0001087597025488413,
176
- "loss": 0.7578,
177
- "step": 115
 
 
 
 
 
 
 
178
  },
179
  {
180
  "epoch": 0.547945205479452,
181
- "grad_norm": 0.45874515175819397,
182
  "learning_rate": 0.00010079735009246167,
183
- "loss": 0.7593,
184
- "step": 120
 
 
 
 
 
 
 
185
  },
186
  {
187
  "epoch": 0.5707762557077626,
188
- "grad_norm": 0.44529327750205994,
189
  "learning_rate": 9.282993092381625e-05,
190
- "loss": 0.7479,
191
- "step": 125
 
 
 
 
 
 
 
192
  },
193
  {
194
  "epoch": 0.593607305936073,
195
- "grad_norm": 0.40876489877700806,
196
  "learning_rate": 8.490807351941753e-05,
197
- "loss": 0.7486,
198
- "step": 130
 
 
 
 
 
 
 
199
  },
200
  {
201
  "epoch": 0.6164383561643836,
202
- "grad_norm": 0.3738647699356079,
203
  "learning_rate": 7.708211683634112e-05,
204
- "loss": 0.7478,
205
- "step": 135
 
 
 
 
 
 
 
206
  },
207
  {
208
  "epoch": 0.639269406392694,
209
- "grad_norm": 0.3480348587036133,
210
  "learning_rate": 6.940179043641005e-05,
211
- "loss": 0.7401,
212
- "step": 140
 
 
 
 
 
 
 
213
  },
214
  {
215
  "epoch": 0.6621004566210046,
216
- "grad_norm": 0.3515714108943939,
217
  "learning_rate": 6.191589848274368e-05,
218
- "loss": 0.7477,
219
- "step": 145
 
 
 
 
 
 
 
220
  },
221
  {
222
  "epoch": 0.684931506849315,
223
- "grad_norm": 0.4522218108177185,
224
  "learning_rate": 5.467200961669619e-05,
225
- "loss": 0.7447,
226
- "step": 150
 
 
 
 
 
 
 
227
  },
228
  {
229
  "epoch": 0.7077625570776256,
230
- "grad_norm": 0.331890344619751,
231
  "learning_rate": 4.7716154685841944e-05,
232
- "loss": 0.746,
233
- "step": 155
 
 
 
 
 
 
 
234
  },
235
  {
236
  "epoch": 0.730593607305936,
237
- "grad_norm": 0.4267323613166809,
238
  "learning_rate": 4.109253424377772e-05,
239
- "loss": 0.7525,
240
- "step": 160
 
 
 
 
 
 
 
241
  },
242
  {
243
  "epoch": 0.7534246575342466,
244
- "grad_norm": 0.34801220893859863,
245
  "learning_rate": 3.4843237680415156e-05,
246
- "loss": 0.7408,
247
- "step": 165
 
 
 
 
 
 
 
248
  },
249
  {
250
  "epoch": 0.776255707762557,
251
- "grad_norm": 0.3299520015716553,
252
  "learning_rate": 2.9007975767533714e-05,
253
- "loss": 0.7324,
254
- "step": 170
 
 
 
 
 
 
 
255
  },
256
  {
257
  "epoch": 0.7990867579908676,
258
- "grad_norm": 0.3563040494918823,
259
  "learning_rate": 2.3623828319116748e-05,
260
- "loss": 0.7487,
261
- "step": 175
 
 
 
 
 
 
 
262
  },
263
  {
264
  "epoch": 0.821917808219178,
265
- "grad_norm": 0.355241984128952,
266
  "learning_rate": 1.8725008569947365e-05,
267
- "loss": 0.7521,
268
- "step": 180
 
 
 
 
 
 
 
269
  },
270
  {
271
  "epoch": 0.8447488584474886,
272
- "grad_norm": 0.32830801606178284,
273
  "learning_rate": 1.4342645769705977e-05,
274
- "loss": 0.737,
275
- "step": 185
 
 
 
 
 
 
 
276
  },
277
  {
278
  "epoch": 0.867579908675799,
279
- "grad_norm": 0.32326367497444153,
280
  "learning_rate": 1.0504587374062391e-05,
281
- "loss": 0.7397,
282
- "step": 190
 
 
 
 
 
 
 
283
  },
284
  {
285
  "epoch": 0.8904109589041096,
286
- "grad_norm": 0.33771994709968567,
287
  "learning_rate": 7.235222089726279e-06,
288
- "loss": 0.7373,
289
- "step": 195
 
 
 
 
 
 
 
290
  },
291
  {
292
  "epoch": 0.91324200913242,
293
- "grad_norm": 0.33751899003982544,
294
  "learning_rate": 4.555324897906132e-06,
295
- "loss": 0.7426,
296
- "step": 200
 
 
 
 
 
 
 
297
  },
298
  {
299
  "epoch": 0.9360730593607306,
300
- "grad_norm": 0.34009331464767456,
301
  "learning_rate": 2.4819250409651607e-06,
302
- "loss": 0.7403,
303
- "step": 205
 
 
 
 
 
 
 
304
  },
305
  {
306
  "epoch": 0.958904109589041,
307
- "grad_norm": 0.3508203327655792,
308
  "learning_rate": 1.0281978111449375e-06,
309
- "loss": 0.7469,
310
- "step": 210
 
 
 
 
 
 
 
311
  },
312
  {
313
  "epoch": 0.9817351598173516,
314
- "grad_norm": 0.3122396469116211,
315
  "learning_rate": 2.0338082897886079e-07,
316
- "loss": 0.742,
317
- "step": 215
 
 
 
 
 
 
 
318
  },
319
  {
320
  "epoch": 1.0,
321
- "eval_loss": 1.8227200508117676,
322
- "eval_runtime": 1.5688,
323
- "eval_samples_per_second": 7.649,
324
- "eval_steps_per_second": 0.637,
325
- "step": 219
326
  },
327
  {
328
  "epoch": 1.0,
329
- "step": 219,
330
- "total_flos": 6.128945867501076e+17,
331
- "train_loss": 0.8279621209183784,
332
- "train_runtime": 2699.8939,
333
- "train_samples_per_second": 5.187,
334
- "train_steps_per_second": 0.081
335
  }
336
  ],
337
  "logging_steps": 5,
338
- "max_steps": 219,
339
  "num_input_tokens_seen": 0,
340
  "num_train_epochs": 1,
341
  "save_steps": 100,
@@ -351,8 +659,8 @@
351
  "attributes": {}
352
  }
353
  },
354
- "total_flos": 6.128945867501076e+17,
355
- "train_batch_size": 4,
356
  "trial_name": null,
357
  "trial_params": null
358
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 438,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.00228310502283105,
13
+ "grad_norm": 7.218806743621826,
14
+ "learning_rate": 4.5454545454545455e-06,
15
+ "loss": 1.798,
16
  "step": 1
17
  },
18
+ {
19
+ "epoch": 0.01141552511415525,
20
+ "grad_norm": 7.102625846862793,
21
+ "learning_rate": 2.272727272727273e-05,
22
+ "loss": 1.7228,
23
+ "step": 5
24
+ },
25
  {
26
  "epoch": 0.0228310502283105,
27
+ "grad_norm": 4.343904495239258,
28
  "learning_rate": 4.545454545454546e-05,
29
+ "loss": 1.571,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.03424657534246575,
34
+ "grad_norm": 2.9180715084075928,
35
+ "learning_rate": 6.818181818181818e-05,
36
+ "loss": 1.34,
37
+ "step": 15
38
  },
39
  {
40
  "epoch": 0.045662100456621,
41
+ "grad_norm": 1.1530470848083496,
42
  "learning_rate": 9.090909090909092e-05,
43
+ "loss": 1.1862,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.05707762557077625,
48
+ "grad_norm": 1.0645655393600464,
49
+ "learning_rate": 0.00011363636363636365,
50
+ "loss": 1.0852,
51
+ "step": 25
52
  },
53
  {
54
  "epoch": 0.0684931506849315,
55
+ "grad_norm": 1.113412618637085,
56
  "learning_rate": 0.00013636363636363637,
57
+ "loss": 1.0102,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.07990867579908675,
62
+ "grad_norm": 0.9628555178642273,
63
+ "learning_rate": 0.0001590909090909091,
64
+ "loss": 0.9345,
65
+ "step": 35
66
  },
67
  {
68
  "epoch": 0.091324200913242,
69
+ "grad_norm": 0.5371753573417664,
70
  "learning_rate": 0.00018181818181818183,
71
+ "loss": 0.8871,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.10273972602739725,
76
+ "grad_norm": 0.7333775162696838,
77
+ "learning_rate": 0.00019999682111362368,
78
+ "loss": 0.8563,
79
+ "step": 45
80
  },
81
  {
82
  "epoch": 0.1141552511415525,
83
+ "grad_norm": 0.5362383127212524,
84
  "learning_rate": 0.00019988558131018186,
85
+ "loss": 0.8434,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.12557077625570776,
90
+ "grad_norm": 0.42146092653274536,
91
+ "learning_rate": 0.0001996155992365444,
92
+ "loss": 0.8227,
93
+ "step": 55
94
  },
95
  {
96
  "epoch": 0.136986301369863,
97
+ "grad_norm": 0.42583712935447693,
98
  "learning_rate": 0.00019918730395931649,
99
+ "loss": 0.8087,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.14840182648401826,
104
+ "grad_norm": 0.4173405170440674,
105
+ "learning_rate": 0.00019860137614295168,
106
+ "loss": 0.8127,
107
+ "step": 65
108
  },
109
  {
110
  "epoch": 0.1598173515981735,
111
+ "grad_norm": 0.5237720608711243,
112
  "learning_rate": 0.00019785874696801202,
113
+ "loss": 0.7888,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.17123287671232876,
118
+ "grad_norm": 0.4207506477832794,
119
+ "learning_rate": 0.0001969605966512975,
120
+ "loss": 0.7991,
121
+ "step": 75
122
  },
123
  {
124
  "epoch": 0.182648401826484,
125
+ "grad_norm": 0.5145552158355713,
126
  "learning_rate": 0.00019590835257019714,
127
+ "loss": 0.8001,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.19406392694063926,
132
+ "grad_norm": 0.4617029130458832,
133
+ "learning_rate": 0.00019470368699424218,
134
+ "loss": 0.7777,
135
+ "step": 85
136
  },
137
  {
138
  "epoch": 0.2054794520547945,
139
+ "grad_norm": 0.48738017678260803,
140
  "learning_rate": 0.00019334851442746664,
141
+ "loss": 0.7803,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.21689497716894976,
146
+ "grad_norm": 0.4272419512271881,
147
+ "learning_rate": 0.00019184498856579868,
148
+ "loss": 0.7721,
149
+ "step": 95
150
  },
151
  {
152
  "epoch": 0.228310502283105,
153
+ "grad_norm": 0.8292232155799866,
154
  "learning_rate": 0.00019019549887431877,
155
+ "loss": 0.774,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.23972602739726026,
160
+ "grad_norm": 0.48018935322761536,
161
+ "learning_rate": 0.00018840266678982342,
162
+ "loss": 0.7785,
163
+ "step": 105
164
  },
165
  {
166
  "epoch": 0.2511415525114155,
167
+ "grad_norm": 0.5646579265594482,
168
  "learning_rate": 0.00018646934155473022,
169
+ "loss": 0.7567,
170
+ "step": 110
171
+ },
172
+ {
173
+ "epoch": 0.2625570776255708,
174
+ "grad_norm": 0.5426482558250427,
175
+ "learning_rate": 0.00018439859568894463,
176
+ "loss": 0.76,
177
+ "step": 115
178
  },
179
  {
180
  "epoch": 0.273972602739726,
181
+ "grad_norm": 0.4849863350391388,
182
  "learning_rate": 0.00018219372010688515,
183
+ "loss": 0.7513,
184
+ "step": 120
185
+ },
186
+ {
187
+ "epoch": 0.2853881278538813,
188
+ "grad_norm": 0.4911242425441742,
189
+ "learning_rate": 0.00017985821888742685,
190
+ "loss": 0.7547,
191
+ "step": 125
192
  },
193
  {
194
  "epoch": 0.2968036529680365,
195
+ "grad_norm": 0.4471426010131836,
196
  "learning_rate": 0.00017739580370507532,
197
+ "loss": 0.7517,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.3082191780821918,
202
+ "grad_norm": 0.5667721629142761,
203
+ "learning_rate": 0.00017481038793122088,
204
+ "loss": 0.7578,
205
+ "step": 135
206
  },
207
  {
208
  "epoch": 0.319634703196347,
209
+ "grad_norm": 0.5815710425376892,
210
  "learning_rate": 0.0001721060804148482,
211
+ "loss": 0.7629,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.3310502283105023,
216
+ "grad_norm": 0.4607318043708801,
217
+ "learning_rate": 0.00016928717895258437,
218
+ "loss": 0.7602,
219
+ "step": 145
220
  },
221
  {
222
  "epoch": 0.3424657534246575,
223
+ "grad_norm": 0.5995905995368958,
224
  "learning_rate": 0.0001663581634584641,
225
+ "loss": 0.75,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.3538812785388128,
230
+ "grad_norm": 0.5096511244773865,
231
+ "learning_rate": 0.00016332368884426626,
232
+ "loss": 0.7439,
233
+ "step": 155
234
  },
235
  {
236
  "epoch": 0.365296803652968,
237
+ "grad_norm": 0.8967055678367615,
238
  "learning_rate": 0.0001601885776217367,
239
+ "loss": 0.7527,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.3767123287671233,
244
+ "grad_norm": 0.5147667527198792,
245
+ "learning_rate": 0.00015695781223845441,
246
+ "loss": 0.7527,
247
+ "step": 165
248
  },
249
  {
250
  "epoch": 0.3881278538812785,
251
+ "grad_norm": 0.4564310312271118,
252
  "learning_rate": 0.0001536365271595212,
253
+ "loss": 0.7487,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.3995433789954338,
258
+ "grad_norm": 0.7292072176933289,
259
+ "learning_rate": 0.00015023000070765884,
260
+ "loss": 0.7482,
261
+ "step": 175
262
  },
263
  {
264
  "epoch": 0.410958904109589,
265
+ "grad_norm": 0.8427252769470215,
266
  "learning_rate": 0.0001467436466746814,
267
+ "loss": 0.7409,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.4223744292237443,
272
+ "grad_norm": 0.631044328212738,
273
+ "learning_rate": 0.00014318300571767513,
274
+ "loss": 0.7451,
275
+ "step": 185
276
  },
277
  {
278
  "epoch": 0.4337899543378995,
279
+ "grad_norm": 0.4628809690475464,
280
  "learning_rate": 0.0001395537365535585,
281
+ "loss": 0.7409,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.4452054794520548,
286
+ "grad_norm": 0.4346175193786621,
287
+ "learning_rate": 0.00013586160696601665,
288
+ "loss": 0.747,
289
+ "step": 195
290
  },
291
  {
292
  "epoch": 0.45662100456621,
293
+ "grad_norm": 0.4906691312789917,
294
  "learning_rate": 0.00013211248463910262,
295
+ "loss": 0.7389,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.4680365296803653,
300
+ "grad_norm": 0.4580436050891876,
301
+ "learning_rate": 0.00012831232783207277,
302
+ "loss": 0.7277,
303
+ "step": 205
304
  },
305
  {
306
  "epoch": 0.4794520547945205,
307
+ "grad_norm": 0.4663284122943878,
308
  "learning_rate": 0.00012446717591027624,
309
+ "loss": 0.7325,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.4908675799086758,
314
+ "grad_norm": 0.5113467574119568,
315
+ "learning_rate": 0.00012058313974714746,
316
+ "loss": 0.7364,
317
+ "step": 215
318
  },
319
  {
320
  "epoch": 0.502283105022831,
321
+ "grad_norm": 0.4941682517528534,
322
  "learning_rate": 0.00011666639201255506,
323
+ "loss": 0.7251,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.5136986301369864,
328
+ "grad_norm": 0.47159168124198914,
329
+ "learning_rate": 0.00011272315736294108,
330
+ "loss": 0.7278,
331
+ "step": 225
332
  },
333
  {
334
  "epoch": 0.5251141552511416,
335
+ "grad_norm": 0.44525182247161865,
336
  "learning_rate": 0.0001087597025488413,
337
+ "loss": 0.7368,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 0.5365296803652968,
342
+ "grad_norm": 0.5244260430335999,
343
+ "learning_rate": 0.00010478232645550782,
344
+ "loss": 0.7374,
345
+ "step": 235
346
  },
347
  {
348
  "epoch": 0.547945205479452,
349
+ "grad_norm": 0.4565676152706146,
350
  "learning_rate": 0.00010079735009246167,
351
+ "loss": 0.7308,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 0.5593607305936074,
356
+ "grad_norm": 0.5089186429977417,
357
+ "learning_rate": 9.681110654788482e-05,
358
+ "loss": 0.7312,
359
+ "step": 245
360
  },
361
  {
362
  "epoch": 0.5707762557077626,
363
+ "grad_norm": 0.502402663230896,
364
  "learning_rate": 9.282993092381625e-05,
365
+ "loss": 0.7128,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 0.5821917808219178,
370
+ "grad_norm": 0.4394870102405548,
371
+ "learning_rate": 8.886015026814736e-05,
372
+ "loss": 0.7235,
373
+ "step": 255
374
  },
375
  {
376
  "epoch": 0.593607305936073,
377
+ "grad_norm": 0.4642430245876312,
378
  "learning_rate": 8.490807351941753e-05,
379
+ "loss": 0.7238,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 0.6050228310502284,
384
+ "grad_norm": 0.5627766847610474,
385
+ "learning_rate": 8.097998148038985e-05,
386
+ "loss": 0.7244,
387
+ "step": 265
388
  },
389
  {
390
  "epoch": 0.6164383561643836,
391
+ "grad_norm": 0.5340884327888489,
392
  "learning_rate": 7.708211683634112e-05,
393
+ "loss": 0.7196,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 0.6278538812785388,
398
+ "grad_norm": 0.48936447501182556,
399
+ "learning_rate": 7.322067423393002e-05,
400
+ "loss": 0.7142,
401
+ "step": 275
402
  },
403
  {
404
  "epoch": 0.639269406392694,
405
+ "grad_norm": 0.6838403940200806,
406
  "learning_rate": 6.940179043641005e-05,
407
+ "loss": 0.7134,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 0.6506849315068494,
412
+ "grad_norm": 0.46401193737983704,
413
+ "learning_rate": 6.563153457083315e-05,
414
+ "loss": 0.7232,
415
+ "step": 285
416
  },
417
  {
418
  "epoch": 0.6621004566210046,
419
+ "grad_norm": 0.4747474789619446,
420
  "learning_rate": 6.191589848274368e-05,
421
+ "loss": 0.7198,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 0.6735159817351598,
426
+ "grad_norm": 0.4562244713306427,
427
+ "learning_rate": 5.82607872136913e-05,
428
+ "loss": 0.7236,
429
+ "step": 295
430
  },
431
  {
432
  "epoch": 0.684931506849315,
433
+ "grad_norm": 0.469294011592865,
434
  "learning_rate": 5.467200961669619e-05,
435
+ "loss": 0.716,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 0.6963470319634704,
440
+ "grad_norm": 0.46684587001800537,
441
+ "learning_rate": 5.115526912458113e-05,
442
+ "loss": 0.7129,
443
+ "step": 305
444
  },
445
  {
446
  "epoch": 0.7077625570776256,
447
+ "grad_norm": 0.5353609323501587,
448
  "learning_rate": 4.7716154685841944e-05,
449
+ "loss": 0.7276,
450
+ "step": 310
451
+ },
452
+ {
453
+ "epoch": 0.7191780821917808,
454
+ "grad_norm": 0.4474339783191681,
455
+ "learning_rate": 4.4360131882460555e-05,
456
+ "loss": 0.7328,
457
+ "step": 315
458
  },
459
  {
460
  "epoch": 0.730593607305936,
461
+ "grad_norm": 0.504355251789093,
462
  "learning_rate": 4.109253424377772e-05,
463
+ "loss": 0.7221,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 0.7420091324200914,
468
+ "grad_norm": 0.524207592010498,
469
+ "learning_rate": 3.791855477022903e-05,
470
+ "loss": 0.7156,
471
+ "step": 325
472
  },
473
  {
474
  "epoch": 0.7534246575342466,
475
+ "grad_norm": 0.5037496089935303,
476
  "learning_rate": 3.4843237680415156e-05,
477
+ "loss": 0.7157,
478
+ "step": 330
479
+ },
480
+ {
481
+ "epoch": 0.7648401826484018,
482
+ "grad_norm": 0.4428478479385376,
483
+ "learning_rate": 3.1871470394622404e-05,
484
+ "loss": 0.7022,
485
+ "step": 335
486
  },
487
  {
488
  "epoch": 0.776255707762557,
489
+ "grad_norm": 0.5166884660720825,
490
  "learning_rate": 2.9007975767533714e-05,
491
+ "loss": 0.7137,
492
+ "step": 340
493
+ },
494
+ {
495
+ "epoch": 0.7876712328767124,
496
+ "grad_norm": 0.4624291956424713,
497
+ "learning_rate": 2.625730458247362e-05,
498
+ "loss": 0.7221,
499
+ "step": 345
500
  },
501
  {
502
  "epoch": 0.7990867579908676,
503
+ "grad_norm": 0.4491420090198517,
504
  "learning_rate": 2.3623828319116748e-05,
505
+ "loss": 0.7237,
506
+ "step": 350
507
+ },
508
+ {
509
+ "epoch": 0.8105022831050228,
510
+ "grad_norm": 0.46221333742141724,
511
+ "learning_rate": 2.1111732206152424e-05,
512
+ "loss": 0.7258,
513
+ "step": 355
514
  },
515
  {
516
  "epoch": 0.821917808219178,
517
+ "grad_norm": 0.4614256024360657,
518
  "learning_rate": 1.8725008569947365e-05,
519
+ "loss": 0.7266,
520
+ "step": 360
521
+ },
522
+ {
523
+ "epoch": 0.8333333333333334,
524
+ "grad_norm": 0.4534947872161865,
525
+ "learning_rate": 1.6467450489776582e-05,
526
+ "loss": 0.7149,
527
+ "step": 365
528
  },
529
  {
530
  "epoch": 0.8447488584474886,
531
+ "grad_norm": 0.43620970845222473,
532
  "learning_rate": 1.4342645769705977e-05,
533
+ "loss": 0.7093,
534
+ "step": 370
535
+ },
536
+ {
537
+ "epoch": 0.8561643835616438,
538
+ "grad_norm": 0.44368976354599,
539
+ "learning_rate": 1.2353971236706564e-05,
540
+ "loss": 0.7237,
541
+ "step": 375
542
  },
543
  {
544
  "epoch": 0.867579908675799,
545
+ "grad_norm": 0.4331362247467041,
546
  "learning_rate": 1.0504587374062391e-05,
547
+ "loss": 0.7055,
548
+ "step": 380
549
+ },
550
+ {
551
+ "epoch": 0.8789954337899544,
552
+ "grad_norm": 0.5180200934410095,
553
+ "learning_rate": 8.797433298600622e-06,
554
+ "loss": 0.7083,
555
+ "step": 385
556
  },
557
  {
558
  "epoch": 0.8904109589041096,
559
+ "grad_norm": 0.4550034999847412,
560
  "learning_rate": 7.235222089726279e-06,
561
+ "loss": 0.7156,
562
+ "step": 390
563
+ },
564
+ {
565
+ "epoch": 0.9018264840182648,
566
+ "grad_norm": 0.4615817964076996,
567
+ "learning_rate": 5.82043647768502e-06,
568
+ "loss": 0.7232,
569
+ "step": 395
570
  },
571
  {
572
  "epoch": 0.91324200913242,
573
+ "grad_norm": 0.4302138388156891,
574
  "learning_rate": 4.555324897906132e-06,
575
+ "loss": 0.7115,
576
+ "step": 400
577
+ },
578
+ {
579
+ "epoch": 0.9246575342465754,
580
+ "grad_norm": 0.442900687456131,
581
+ "learning_rate": 3.441897917696679e-06,
582
+ "loss": 0.7153,
583
+ "step": 405
584
  },
585
  {
586
  "epoch": 0.9360730593607306,
587
+ "grad_norm": 0.45728132128715515,
588
  "learning_rate": 2.4819250409651607e-06,
589
+ "loss": 0.715,
590
+ "step": 410
591
+ },
592
+ {
593
+ "epoch": 0.9474885844748858,
594
+ "grad_norm": 0.4729623794555664,
595
+ "learning_rate": 1.6769318960533464e-06,
596
+ "loss": 0.7167,
597
+ "step": 415
598
  },
599
  {
600
  "epoch": 0.958904109589041,
601
+ "grad_norm": 0.43312644958496094,
602
  "learning_rate": 1.0281978111449375e-06,
603
+ "loss": 0.7266,
604
+ "step": 420
605
+ },
606
+ {
607
+ "epoch": 0.9703196347031964,
608
+ "grad_norm": 0.4580814838409424,
609
+ "learning_rate": 5.367537811046485e-07,
610
+ "loss": 0.7222,
611
+ "step": 425
612
  },
613
  {
614
  "epoch": 0.9817351598173516,
615
+ "grad_norm": 0.4317879378795624,
616
  "learning_rate": 2.0338082897886079e-07,
617
+ "loss": 0.7099,
618
+ "step": 430
619
+ },
620
+ {
621
+ "epoch": 0.9931506849315068,
622
+ "grad_norm": 0.4474068880081177,
623
+ "learning_rate": 2.8608764761639538e-08,
624
+ "loss": 0.7129,
625
+ "step": 435
626
  },
627
  {
628
  "epoch": 1.0,
629
+ "eval_loss": 1.8165334463119507,
630
+ "eval_runtime": 0.9134,
631
+ "eval_samples_per_second": 13.137,
632
+ "eval_steps_per_second": 1.095,
633
+ "step": 438
634
  },
635
  {
636
  "epoch": 1.0,
637
+ "step": 438,
638
+ "total_flos": 6.128945862132367e+17,
639
+ "train_loss": 0.7845394540595138,
640
+ "train_runtime": 3064.8006,
641
+ "train_samples_per_second": 4.569,
642
+ "train_steps_per_second": 0.143
643
  }
644
  ],
645
  "logging_steps": 5,
646
+ "max_steps": 438,
647
  "num_input_tokens_seen": 0,
648
  "num_train_epochs": 1,
649
  "save_steps": 100,
 
659
  "attributes": {}
660
  }
661
  },
662
+ "total_flos": 6.128945862132367e+17,
663
+ "train_batch_size": 2,
664
  "trial_name": null,
665
  "trial_params": null
666
  }