Applied AI committed
Commit 099b201 (verified) · parent: 6545507

Model save

Files changed (5)
  1. README.md +2 -6
  2. all_results.json +4 -4
  3. model.safetensors +1 -1
  4. train_results.json +4 -4
  5. trainer_state.json +89 -89
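
To reproduce results against this exact snapshot, downloads can be pinned to commit 099b201. A minimal sketch with huggingface_hub; the repository id appliedai-qx/gpt2-sft is a placeholder, since the actual repo name is not shown on this page:

```python
from huggingface_hub import snapshot_download

# Placeholder repo id -- substitute the repository this commit belongs to.
REPO_ID = "appliedai-qx/gpt2-sft"

# Pin to commit 099b201 so later pushes to the branch don't change what is fetched.
local_dir = snapshot_download(repo_id=REPO_ID, revision="099b201")
print(local_dir)
```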
README.md CHANGED
@@ -2,15 +2,11 @@
  license: mit
  base_model: gpt2
  tags:
- - alignment-handbook
- - trl
- - sft
- - generated_from_trainer
  - trl
  - sft
  - generated_from_trainer
  datasets:
- - appliedai-qx/sample-dataset-ah
+ - generator
  model-index:
  - name: gpt2
  results: []
@@ -21,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->

  # gpt2

- This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on the appliedai-qx/sample-dataset-ah dataset.
+ This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on the generator dataset.

  ## Model description

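The updated card describes a GPT-2 checkpoint fine-tuned with TRL's SFT trainer; the dataset is now recorded simply as "generator". A minimal usage sketch, again with the placeholder repo id and pinned to this commit:

```python
from transformers import pipeline

# Placeholder repo id; revision pins the weights saved in this commit.
generator = pipeline(
    "text-generation",
    model="appliedai-qx/gpt2-sft",
    revision="099b201",
)

print(generator("The quick brown fox", max_new_tokens=30)[0]["generated_text"])
```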
all_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 1.0,
  "total_flos": 7357983621120000.0,
- "train_loss": 1.386462221362374,
- "train_runtime": 54.2438,
+ "train_loss": 1.3830752210183577,
+ "train_runtime": 55.0309,
  "train_samples": 10000,
- "train_samples_per_second": 258.942,
- "train_steps_per_second": 4.056
+ "train_samples_per_second": 255.239,
+ "train_steps_per_second": 3.998
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:be7733f81180fdd29d3314b1b3412be39b7b836f121522629c20b72e54edbb5d
+ oid sha256:c41f0cbc1c377cca8babf7afd81c7f49cfaff8ba4c2bb995d16ba8207d5a008c
  size 248894656
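
Only the LFS pointer changed for model.safetensors: a new sha256 oid, same 248,894,656-byte size. A quick integrity check for a locally downloaded copy (the file path is an assumption):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so the ~249 MB of weights never sit in memory at once."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "c41f0cbc1c377cca8babf7afd81c7f49cfaff8ba4c2bb995d16ba8207d5a008c"
assert sha256_of("model.safetensors") == expected
```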
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 1.0,
  "total_flos": 7357983621120000.0,
- "train_loss": 1.386462221362374,
- "train_runtime": 54.2438,
+ "train_loss": 1.3830752210183577,
+ "train_runtime": 55.0309,
  "train_samples": 10000,
- "train_samples_per_second": 258.942,
- "train_steps_per_second": 4.056
+ "train_samples_per_second": 255.239,
+ "train_steps_per_second": 3.998
  }
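
The updated throughput figures are consistent with the 220 optimizer steps logged in trainer_state.json; a quick sanity check (the reading of train_samples_per_second as counting packed examples is an inference from the numbers, not stated in the files):

```python
# Cross-check train_results.json against trainer_state.json.
train_runtime = 55.0309          # seconds
steps = 220                      # final "step" in trainer_state.json

print(round(steps / train_runtime, 3))   # 3.998 -> matches train_steps_per_second

# 255.239 samples/s * 55.0309 s ~= 14,046 examples, more than the 10,000 raw
# train_samples, consistent with the Trainer counting packed "generator" examples.
print(round(255.239 * train_runtime))
```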
trainer_state.json CHANGED
@@ -17,320 +17,320 @@
  },
  {
  "epoch": 0.022727272727272728,
- "grad_norm": 3.28125,
+ "grad_norm": 3.296875,
  "learning_rate": 4.545454545454546e-05,
- "loss": 1.872,
+ "loss": 1.8721,
  "step": 5
  },
  {
  "epoch": 0.045454545454545456,
  "grad_norm": 1.7578125,
  "learning_rate": 9.090909090909092e-05,
- "loss": 1.8049,
+ "loss": 1.8044,
  "step": 10
  },
  {
  "epoch": 0.06818181818181818,
  "grad_norm": 1.4375,
  "learning_rate": 0.00013636363636363637,
- "loss": 1.74,
+ "loss": 1.7402,
  "step": 15
  },
  {
  "epoch": 0.09090909090909091,
- "grad_norm": 2.0,
+ "grad_norm": 1.9921875,
  "learning_rate": 0.00018181818181818183,
- "loss": 1.6831,
+ "loss": 1.6832,
  "step": 20
  },
  {
  "epoch": 0.11363636363636363,
- "grad_norm": 1.609375,
+ "grad_norm": 1.6640625,
  "learning_rate": 0.0001998867339183008,
- "loss": 1.607,
+ "loss": 1.6069,
  "step": 25
  },
  {
  "epoch": 0.13636363636363635,
- "grad_norm": 1.6796875,
+ "grad_norm": 1.4609375,
  "learning_rate": 0.00019919548128307954,
- "loss": 1.5824,
+ "loss": 1.582,
  "step": 30
  },
  {
  "epoch": 0.1590909090909091,
- "grad_norm": 1.25,
+ "grad_norm": 1.453125,
  "learning_rate": 0.00019788024462147788,
- "loss": 1.5335,
+ "loss": 1.5337,
  "step": 35
  },
  {
  "epoch": 0.18181818181818182,
- "grad_norm": 0.9453125,
+ "grad_norm": 0.9140625,
  "learning_rate": 0.00019594929736144976,
- "loss": 1.4989,
+ "loss": 1.4999,
  "step": 40
  },
  {
  "epoch": 0.20454545454545456,
- "grad_norm": 1.5546875,
+ "grad_norm": 0.91796875,
  "learning_rate": 0.00019341478602651069,
- "loss": 1.4829,
+ "loss": 1.486,
  "step": 45
  },
  {
  "epoch": 0.22727272727272727,
- "grad_norm": 1.3203125,
+ "grad_norm": 0.9140625,
  "learning_rate": 0.00019029265382866214,
- "loss": 1.4675,
+ "loss": 1.4694,
  "step": 50
  },
  {
  "epoch": 0.25,
- "grad_norm": 1.3828125,
+ "grad_norm": 0.8125,
  "learning_rate": 0.00018660254037844388,
- "loss": 1.4425,
+ "loss": 1.4435,
  "step": 55
  },
  {
  "epoch": 0.2727272727272727,
- "grad_norm": 1.2890625,
+ "grad_norm": 0.85546875,
  "learning_rate": 0.0001823676581429833,
- "loss": 1.4339,
+ "loss": 1.4326,
  "step": 60
  },
  {
  "epoch": 0.29545454545454547,
- "grad_norm": 0.7890625,
+ "grad_norm": 0.82421875,
  "learning_rate": 0.0001776146464291757,
- "loss": 1.4086,
+ "loss": 1.4055,
  "step": 65
  },
  {
  "epoch": 0.3181818181818182,
- "grad_norm": 0.7421875,
+ "grad_norm": 0.8046875,
  "learning_rate": 0.00017237340381050703,
- "loss": 1.3789,
+ "loss": 1.374,
  "step": 70
  },
  {
  "epoch": 0.3409090909090909,
- "grad_norm": 0.71484375,
+ "grad_norm": 0.96484375,
  "learning_rate": 0.00016667690005162916,
- "loss": 1.3825,
+ "loss": 1.3776,
  "step": 75
  },
  {
  "epoch": 0.36363636363636365,
- "grad_norm": 0.6484375,
+ "grad_norm": 0.71875,
  "learning_rate": 0.00016056096871376667,
- "loss": 1.3689,
+ "loss": 1.3626,
  "step": 80
  },
  {
  "epoch": 0.38636363636363635,
- "grad_norm": 0.671875,
+ "grad_norm": 0.6328125,
  "learning_rate": 0.00015406408174555976,
- "loss": 1.363,
+ "loss": 1.3566,
  "step": 85
  },
  {
  "epoch": 0.4090909090909091,
- "grad_norm": 0.68359375,
+ "grad_norm": 0.72265625,
  "learning_rate": 0.0001472271074772683,
- "loss": 1.3277,
+ "loss": 1.3215,
  "step": 90
  },
  {
  "epoch": 0.4318181818181818,
- "grad_norm": 0.7890625,
+ "grad_norm": 0.82421875,
  "learning_rate": 0.00014009305354066137,
- "loss": 1.3438,
+ "loss": 1.338,
  "step": 95
  },
  {
  "epoch": 0.45454545454545453,
- "grad_norm": 0.8125,
+ "grad_norm": 0.87109375,
  "learning_rate": 0.00013270679633174218,
- "loss": 1.3325,
+ "loss": 1.3268,
  "step": 100
  },
  {
  "epoch": 0.4772727272727273,
- "grad_norm": 0.9921875,
+ "grad_norm": 0.78515625,
  "learning_rate": 0.0001251147987181079,
- "loss": 1.3127,
+ "loss": 1.307,
  "step": 105
  },
  {
  "epoch": 0.5,
- "grad_norm": 0.82421875,
+ "grad_norm": 1.03125,
  "learning_rate": 0.00011736481776669306,
- "loss": 1.3142,
+ "loss": 1.3086,
  "step": 110
  },
  {
  "epoch": 0.5227272727272727,
- "grad_norm": 0.7109375,
+ "grad_norm": 0.609375,
  "learning_rate": 0.00010950560433041826,
- "loss": 1.3031,
+ "loss": 1.2977,
  "step": 115
  },
  {
  "epoch": 0.5454545454545454,
- "grad_norm": 0.67578125,
+ "grad_norm": 0.65625,
  "learning_rate": 0.00010158659638348081,
- "loss": 1.3119,
+ "loss": 1.3071,
  "step": 120
  },
  {
  "epoch": 0.5681818181818182,
  "grad_norm": 0.5859375,
  "learning_rate": 9.365760803434355e-05,
- "loss": 1.3096,
+ "loss": 1.305,
  "step": 125
  },
  {
  "epoch": 0.5909090909090909,
- "grad_norm": 0.52734375,
+ "grad_norm": 0.55859375,
  "learning_rate": 8.57685161726715e-05,
- "loss": 1.2951,
+ "loss": 1.2902,
  "step": 130
  },
  {
  "epoch": 0.6136363636363636,
- "grad_norm": 0.5234375,
+ "grad_norm": 0.51953125,
  "learning_rate": 7.796894672134594e-05,
- "loss": 1.2856,
+ "loss": 1.281,
  "step": 135
  },
  {
  "epoch": 0.6363636363636364,
- "grad_norm": 0.69921875,
+ "grad_norm": 0.66796875,
  "learning_rate": 7.030796246717255e-05,
- "loss": 1.3007,
+ "loss": 1.296,
  "step": 140
  },
  {
  "epoch": 0.6590909090909091,
- "grad_norm": 0.67578125,
+ "grad_norm": 0.6875,
  "learning_rate": 6.283375443396726e-05,
- "loss": 1.293,
+ "loss": 1.2883,
  "step": 145
  },
  {
  "epoch": 0.6818181818181818,
- "grad_norm": 0.6015625,
+ "grad_norm": 0.62890625,
  "learning_rate": 5.559333873942259e-05,
- "loss": 1.2909,
+ "loss": 1.2866,
  "step": 150
  },
  {
  "epoch": 0.7045454545454546,
- "grad_norm": 0.7265625,
+ "grad_norm": 0.74609375,
  "learning_rate": 4.8632260842659393e-05,
- "loss": 1.2965,
+ "loss": 1.2921,
  "step": 155
  },
  {
  "epoch": 0.7272727272727273,
- "grad_norm": 0.51953125,
+ "grad_norm": 0.53125,
  "learning_rate": 4.19943090428802e-05,
- "loss": 1.2818,
+ "loss": 1.2774,
  "step": 160
  },
  {
  "epoch": 0.75,
- "grad_norm": 0.451171875,
+ "grad_norm": 0.458984375,
  "learning_rate": 3.5721239031346066e-05,
- "loss": 1.2593,
+ "loss": 1.2547,
  "step": 165
  },
  {
  "epoch": 0.7727272727272727,
- "grad_norm": 0.494140625,
+ "grad_norm": 0.515625,
  "learning_rate": 2.9852511229367865e-05,
- "loss": 1.2859,
+ "loss": 1.2814,
  "step": 170
  },
  {
  "epoch": 0.7954545454545454,
- "grad_norm": 0.439453125,
+ "grad_norm": 0.453125,
  "learning_rate": 2.4425042564574184e-05,
- "loss": 1.2837,
+ "loss": 1.2796,
  "step": 175
  },
  {
  "epoch": 0.8181818181818182,
- "grad_norm": 0.44140625,
+ "grad_norm": 0.455078125,
  "learning_rate": 1.947297424689414e-05,
- "loss": 1.2852,
+ "loss": 1.281,
  "step": 180
  },
  {
  "epoch": 0.8409090909090909,
- "grad_norm": 0.447265625,
+ "grad_norm": 0.45703125,
  "learning_rate": 1.5027457005048573e-05,
- "loss": 1.2796,
+ "loss": 1.2752,
  "step": 185
  },
  {
  "epoch": 0.8636363636363636,
- "grad_norm": 0.66796875,
+ "grad_norm": 0.69140625,
  "learning_rate": 1.1116455134507664e-05,
- "loss": 1.2792,
+ "loss": 1.2744,
  "step": 190
  },
  {
  "epoch": 0.8863636363636364,
- "grad_norm": 0.44921875,
+ "grad_norm": 0.453125,
  "learning_rate": 7.764570589541875e-06,
- "loss": 1.28,
+ "loss": 1.2756,
  "step": 195
  },
  {
  "epoch": 0.9090909090909091,
- "grad_norm": 0.470703125,
+ "grad_norm": 0.47265625,
  "learning_rate": 4.992888225905468e-06,
- "loss": 1.2861,
+ "loss": 1.2816,
  "step": 200
  },
  {
  "epoch": 0.9318181818181818,
- "grad_norm": 0.546875,
+ "grad_norm": 0.55859375,
  "learning_rate": 2.818843167645835e-06,
- "loss": 1.2822,
+ "loss": 1.278,
  "step": 205
  },
  {
  "epoch": 0.9545454545454546,
- "grad_norm": 0.470703125,
+ "grad_norm": 0.478515625,
  "learning_rate": 1.2561111323605712e-06,
- "loss": 1.28,
+ "loss": 1.2754,
  "step": 210
  },
  {
  "epoch": 0.9772727272727273,
- "grad_norm": 0.462890625,
+ "grad_norm": 0.45703125,
  "learning_rate": 3.145224048057727e-07,
- "loss": 1.2803,
+ "loss": 1.2759,
  "step": 215
  },
  {
  "epoch": 1.0,
- "grad_norm": 0.47265625,
+ "grad_norm": 0.478515625,
  "learning_rate": 0.0,
- "loss": 1.2746,
+ "loss": 1.2702,
  "step": 220
  },
  {
  "epoch": 1.0,
  "step": 220,
  "total_flos": 7357983621120000.0,
- "train_loss": 1.386462221362374,
- "train_runtime": 54.2438,
- "train_samples_per_second": 258.942,
- "train_steps_per_second": 4.056
+ "train_loss": 1.3830752210183577,
+ "train_runtime": 55.0309,
+ "train_samples_per_second": 255.239,
+ "train_steps_per_second": 3.998
  }
  ],
  "logging_steps": 5,