SystemAdmin123 commited on
Commit
ef04524
·
verified ·
1 Parent(s): dcf6f1e

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5091c0a893b1e96cb02d978b9a43f47e89c90a649c41eaff9803fd9e4020d31
3
  size 4939116424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a30c35cbc8785b8b002f6322557cf314425af21ab83b7f51c23859ba004e393f
3
  size 4939116424
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d061c229e9d4d5e97f4c0501b759b727c2b1d5cdd0e2e3f75f85205d4b68b62
3
  size 4947390880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c1c4622df49ffc72bcf3fda3e087cdd3fcdd3a1d56d2ecbbbba441054dbfaf2
3
  size 4947390880
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a14a66efc4efb7405e139a6853cad6eab928cc89da1bb6f08dbf189017c7ff28
3
  size 3590619888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a848c3958efbb12660e0c66dfca39e33220fd5a19d9bef3eb49a4609fc0f8aab
3
  size 3590619888
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:225528eb4dd5c3ce51c0b08b0824156d1b66bc98683d54515581ca4149267ff2
3
  size 13688025904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f707da0832c7ea8058a9b33d5b470abf8956d8086c479507d9e0898791f31e49
3
  size 13688025904
last-checkpoint/trainer_state.json CHANGED
@@ -11,305 +11,305 @@
11
  {
12
  "epoch": 0.0001479946721918011,
13
  "eval_loss": 2.158451557159424,
14
- "eval_runtime": 161.7202,
15
- "eval_samples_per_second": 9.288,
16
- "eval_steps_per_second": 4.644,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.001479946721918011,
21
- "grad_norm": 17.375,
22
  "learning_rate": 1.6000000000000003e-05,
23
- "loss": 1.8766,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.002959893443836022,
28
- "grad_norm": 11.3125,
29
  "learning_rate": 3.2000000000000005e-05,
30
- "loss": 1.8985,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.0044398401657540325,
35
- "grad_norm": 21.5,
36
  "learning_rate": 4.8e-05,
37
- "loss": 1.7006,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.005919786887672044,
42
- "grad_norm": 21.625,
43
  "learning_rate": 6.400000000000001e-05,
44
- "loss": 1.7853,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 0.007399733609590055,
49
- "grad_norm": 26.75,
50
  "learning_rate": 8e-05,
51
- "loss": 1.3404,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.008879680331508065,
56
- "grad_norm": 7.1875,
57
  "learning_rate": 9.6e-05,
58
- "loss": 1.9642,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.010359627053426078,
63
- "grad_norm": 19.125,
64
  "learning_rate": 0.00011200000000000001,
65
- "loss": 2.0714,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.011839573775344088,
70
- "grad_norm": 22.125,
71
  "learning_rate": 0.00012800000000000002,
72
- "loss": 1.6413,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.0133195204972621,
77
- "grad_norm": 27.75,
78
  "learning_rate": 0.000144,
79
- "loss": 2.1683,
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.01479946721918011,
84
- "grad_norm": 77.0,
85
  "learning_rate": 0.00016,
86
- "loss": 2.0759,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.01627941394109812,
91
- "grad_norm": 264.0,
92
  "learning_rate": 0.00017600000000000002,
93
- "loss": 2.4559,
94
  "step": 110
95
  },
96
  {
97
  "epoch": 0.01775936066301613,
98
- "grad_norm": 12.625,
99
  "learning_rate": 0.000192,
100
- "loss": 2.4551,
101
  "step": 120
102
  },
103
  {
104
  "epoch": 0.01923930738493414,
105
- "grad_norm": 17.625,
106
  "learning_rate": 0.0001999978128380225,
107
- "loss": 2.6614,
108
  "step": 130
109
  },
110
  {
111
  "epoch": 0.020719254106852155,
112
- "grad_norm": 54.25,
113
  "learning_rate": 0.0001999803161162393,
114
- "loss": 2.7413,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.022199200828770166,
119
- "grad_norm": 120.0,
120
  "learning_rate": 0.00019994532573409262,
121
- "loss": 3.8586,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 0.023679147550688177,
126
- "grad_norm": 14.5,
127
  "learning_rate": 0.00019989284781388617,
128
- "loss": 3.0433,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 0.025159094272606188,
133
- "grad_norm": 55.5,
134
  "learning_rate": 0.00019982289153773646,
135
- "loss": 3.2773,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 0.0266390409945242,
140
- "grad_norm": 12.625,
141
  "learning_rate": 0.00019973546914596623,
142
- "loss": 3.6984,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 0.02811898771644221,
147
- "grad_norm": 83.5,
148
  "learning_rate": 0.00019963059593496268,
149
- "loss": 3.1995,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 0.02959893443836022,
154
- "grad_norm": 246.0,
155
  "learning_rate": 0.00019950829025450114,
156
- "loss": 5.2174,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.02959893443836022,
161
- "eval_loss": 7.763393402099609,
162
- "eval_runtime": 169.6503,
163
- "eval_samples_per_second": 8.854,
164
- "eval_steps_per_second": 4.427,
165
  "step": 200
166
  },
167
  {
168
  "epoch": 0.03107888116027823,
169
- "grad_norm": 20.375,
170
  "learning_rate": 0.0001993685735045343,
171
- "loss": 4.5554,
172
  "step": 210
173
  },
174
  {
175
  "epoch": 0.03255882788219624,
176
- "grad_norm": 13.25,
177
  "learning_rate": 0.0001992114701314478,
178
- "loss": 3.3164,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.03403877460411425,
183
- "grad_norm": 14.8125,
184
  "learning_rate": 0.000199037007623783,
185
- "loss": 3.6277,
186
  "step": 230
187
  },
188
  {
189
  "epoch": 0.03551872132603226,
190
- "grad_norm": 55.5,
191
  "learning_rate": 0.00019884521650742715,
192
- "loss": 3.7657,
193
  "step": 240
194
  },
195
  {
196
  "epoch": 0.036998668047950274,
197
- "grad_norm": 18.625,
198
  "learning_rate": 0.00019863613034027224,
199
- "loss": 3.7949,
200
  "step": 250
201
  },
202
  {
203
  "epoch": 0.03847861476986828,
204
- "grad_norm": 18.5,
205
  "learning_rate": 0.0001984097857063434,
206
- "loss": 3.7278,
207
  "step": 260
208
  },
209
  {
210
  "epoch": 0.039958561491786296,
211
- "grad_norm": 17.0,
212
  "learning_rate": 0.0001981662222093976,
213
- "loss": 3.439,
214
  "step": 270
215
  },
216
  {
217
  "epoch": 0.04143850821370431,
218
- "grad_norm": 38.75,
219
  "learning_rate": 0.00019790548246599447,
220
- "loss": 3.1188,
221
  "step": 280
222
  },
223
  {
224
  "epoch": 0.04291845493562232,
225
- "grad_norm": 22.625,
226
  "learning_rate": 0.00019762761209803927,
227
- "loss": 3.7141,
228
  "step": 290
229
  },
230
  {
231
  "epoch": 0.04439840165754033,
232
- "grad_norm": 159.0,
233
  "learning_rate": 0.0001973326597248006,
234
- "loss": 5.2496,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.04587834837945834,
239
- "grad_norm": 10.125,
240
  "learning_rate": 0.00019702067695440332,
241
- "loss": 4.0533,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 0.04735829510137635,
246
- "grad_norm": 15.25,
247
  "learning_rate": 0.00019669171837479873,
248
- "loss": 3.5448,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 0.04883824182329436,
253
- "grad_norm": 12.5625,
254
  "learning_rate": 0.00019634584154421317,
255
- "loss": 3.8324,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 0.050318188545212375,
260
- "grad_norm": 36.25,
261
  "learning_rate": 0.00019598310698107702,
262
- "loss": 3.5487,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 0.05179813526713038,
267
- "grad_norm": 32.0,
268
  "learning_rate": 0.00019560357815343577,
269
- "loss": 4.1912,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 0.0532780819890484,
274
- "grad_norm": 10.25,
275
  "learning_rate": 0.00019520732146784491,
276
- "loss": 4.2146,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 0.054758028710966404,
281
- "grad_norm": 21.0,
282
  "learning_rate": 0.0001947944062577507,
283
- "loss": 3.8099,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 0.05623797543288442,
288
- "grad_norm": 26.625,
289
  "learning_rate": 0.00019436490477135878,
290
- "loss": 3.92,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 0.057717922154802426,
295
- "grad_norm": 12.5625,
296
  "learning_rate": 0.00019391889215899299,
297
- "loss": 4.44,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 0.05919786887672044,
302
- "grad_norm": 29.75,
303
  "learning_rate": 0.0001934564464599461,
304
- "loss": 3.639,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 0.05919786887672044,
309
- "eval_loss": 4.839008808135986,
310
- "eval_runtime": 162.1793,
311
- "eval_samples_per_second": 9.261,
312
- "eval_steps_per_second": 4.631,
313
  "step": 400
314
  }
315
  ],
 
11
  {
12
  "epoch": 0.0001479946721918011,
13
  "eval_loss": 2.158451557159424,
14
+ "eval_runtime": 117.4597,
15
+ "eval_samples_per_second": 12.787,
16
+ "eval_steps_per_second": 6.394,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.001479946721918011,
21
+ "grad_norm": 17.875,
22
  "learning_rate": 1.6000000000000003e-05,
23
+ "loss": 1.876,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.002959893443836022,
28
+ "grad_norm": 11.9375,
29
  "learning_rate": 3.2000000000000005e-05,
30
+ "loss": 1.8982,
31
  "step": 20
32
  },
33
  {
34
  "epoch": 0.0044398401657540325,
35
+ "grad_norm": 22.125,
36
  "learning_rate": 4.8e-05,
37
+ "loss": 1.6963,
38
  "step": 30
39
  },
40
  {
41
  "epoch": 0.005919786887672044,
42
+ "grad_norm": 21.25,
43
  "learning_rate": 6.400000000000001e-05,
44
+ "loss": 1.7986,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 0.007399733609590055,
49
+ "grad_norm": 26.375,
50
  "learning_rate": 8e-05,
51
+ "loss": 1.3725,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.008879680331508065,
56
+ "grad_norm": 6.9375,
57
  "learning_rate": 9.6e-05,
58
+ "loss": 1.9612,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.010359627053426078,
63
+ "grad_norm": 19.625,
64
  "learning_rate": 0.00011200000000000001,
65
+ "loss": 2.0516,
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.011839573775344088,
70
+ "grad_norm": 16.875,
71
  "learning_rate": 0.00012800000000000002,
72
+ "loss": 1.5824,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.0133195204972621,
77
+ "grad_norm": 21.5,
78
  "learning_rate": 0.000144,
79
+ "loss": 2.2526,
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.01479946721918011,
84
+ "grad_norm": 85.5,
85
  "learning_rate": 0.00016,
86
+ "loss": 2.2656,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.01627941394109812,
91
+ "grad_norm": 58.25,
92
  "learning_rate": 0.00017600000000000002,
93
+ "loss": 2.3181,
94
  "step": 110
95
  },
96
  {
97
  "epoch": 0.01775936066301613,
98
+ "grad_norm": 14.25,
99
  "learning_rate": 0.000192,
100
+ "loss": 2.4921,
101
  "step": 120
102
  },
103
  {
104
  "epoch": 0.01923930738493414,
105
+ "grad_norm": 15.5625,
106
  "learning_rate": 0.0001999978128380225,
107
+ "loss": 2.5357,
108
  "step": 130
109
  },
110
  {
111
  "epoch": 0.020719254106852155,
112
+ "grad_norm": 23.75,
113
  "learning_rate": 0.0001999803161162393,
114
+ "loss": 2.8389,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.022199200828770166,
119
+ "grad_norm": 115.5,
120
  "learning_rate": 0.00019994532573409262,
121
+ "loss": 3.7381,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 0.023679147550688177,
126
+ "grad_norm": 17.25,
127
  "learning_rate": 0.00019989284781388617,
128
+ "loss": 3.1991,
129
  "step": 160
130
  },
131
  {
132
  "epoch": 0.025159094272606188,
133
+ "grad_norm": 19.0,
134
  "learning_rate": 0.00019982289153773646,
135
+ "loss": 3.3157,
136
  "step": 170
137
  },
138
  {
139
  "epoch": 0.0266390409945242,
140
+ "grad_norm": 11.3125,
141
  "learning_rate": 0.00019973546914596623,
142
+ "loss": 3.309,
143
  "step": 180
144
  },
145
  {
146
  "epoch": 0.02811898771644221,
147
+ "grad_norm": 105.0,
148
  "learning_rate": 0.00019963059593496268,
149
+ "loss": 3.2528,
150
  "step": 190
151
  },
152
  {
153
  "epoch": 0.02959893443836022,
154
+ "grad_norm": 137.0,
155
  "learning_rate": 0.00019950829025450114,
156
+ "loss": 4.8144,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.02959893443836022,
161
+ "eval_loss": 4.562154293060303,
162
+ "eval_runtime": 115.3725,
163
+ "eval_samples_per_second": 13.019,
164
+ "eval_steps_per_second": 6.509,
165
  "step": 200
166
  },
167
  {
168
  "epoch": 0.03107888116027823,
169
+ "grad_norm": 14.625,
170
  "learning_rate": 0.0001993685735045343,
171
+ "loss": 3.9662,
172
  "step": 210
173
  },
174
  {
175
  "epoch": 0.03255882788219624,
176
+ "grad_norm": 18.0,
177
  "learning_rate": 0.0001992114701314478,
178
+ "loss": 3.1902,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.03403877460411425,
183
+ "grad_norm": 24.625,
184
  "learning_rate": 0.000199037007623783,
185
+ "loss": 3.4792,
186
  "step": 230
187
  },
188
  {
189
  "epoch": 0.03551872132603226,
190
+ "grad_norm": 46.0,
191
  "learning_rate": 0.00019884521650742715,
192
+ "loss": 3.7192,
193
  "step": 240
194
  },
195
  {
196
  "epoch": 0.036998668047950274,
197
+ "grad_norm": 28.0,
198
  "learning_rate": 0.00019863613034027224,
199
+ "loss": 3.6487,
200
  "step": 250
201
  },
202
  {
203
  "epoch": 0.03847861476986828,
204
+ "grad_norm": 139.0,
205
  "learning_rate": 0.0001984097857063434,
206
+ "loss": 4.3462,
207
  "step": 260
208
  },
209
  {
210
  "epoch": 0.039958561491786296,
211
+ "grad_norm": 13.875,
212
  "learning_rate": 0.0001981662222093976,
213
+ "loss": 3.3132,
214
  "step": 270
215
  },
216
  {
217
  "epoch": 0.04143850821370431,
218
+ "grad_norm": 29.5,
219
  "learning_rate": 0.00019790548246599447,
220
+ "loss": 3.3523,
221
  "step": 280
222
  },
223
  {
224
  "epoch": 0.04291845493562232,
225
+ "grad_norm": 43.75,
226
  "learning_rate": 0.00019762761209803927,
227
+ "loss": 3.8529,
228
  "step": 290
229
  },
230
  {
231
  "epoch": 0.04439840165754033,
232
+ "grad_norm": 234.0,
233
  "learning_rate": 0.0001973326597248006,
234
+ "loss": 4.7306,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.04587834837945834,
239
+ "grad_norm": 17.625,
240
  "learning_rate": 0.00019702067695440332,
241
+ "loss": 4.0088,
242
  "step": 310
243
  },
244
  {
245
  "epoch": 0.04735829510137635,
246
+ "grad_norm": 10.0625,
247
  "learning_rate": 0.00019669171837479873,
248
+ "loss": 3.4104,
249
  "step": 320
250
  },
251
  {
252
  "epoch": 0.04883824182329436,
253
+ "grad_norm": 12.6875,
254
  "learning_rate": 0.00019634584154421317,
255
+ "loss": 3.6337,
256
  "step": 330
257
  },
258
  {
259
  "epoch": 0.050318188545212375,
260
+ "grad_norm": 16.75,
261
  "learning_rate": 0.00019598310698107702,
262
+ "loss": 3.6344,
263
  "step": 340
264
  },
265
  {
266
  "epoch": 0.05179813526713038,
267
+ "grad_norm": 21.625,
268
  "learning_rate": 0.00019560357815343577,
269
+ "loss": 3.8887,
270
  "step": 350
271
  },
272
  {
273
  "epoch": 0.0532780819890484,
274
+ "grad_norm": 8.5625,
275
  "learning_rate": 0.00019520732146784491,
276
+ "loss": 3.9023,
277
  "step": 360
278
  },
279
  {
280
  "epoch": 0.054758028710966404,
281
+ "grad_norm": 10.625,
282
  "learning_rate": 0.0001947944062577507,
283
+ "loss": 3.7002,
284
  "step": 370
285
  },
286
  {
287
  "epoch": 0.05623797543288442,
288
+ "grad_norm": 34.25,
289
  "learning_rate": 0.00019436490477135878,
290
+ "loss": 3.7505,
291
  "step": 380
292
  },
293
  {
294
  "epoch": 0.057717922154802426,
295
+ "grad_norm": 17.75,
296
  "learning_rate": 0.00019391889215899299,
297
+ "loss": 3.9776,
298
  "step": 390
299
  },
300
  {
301
  "epoch": 0.05919786887672044,
302
+ "grad_norm": 41.25,
303
  "learning_rate": 0.0001934564464599461,
304
+ "loss": 3.6903,
305
  "step": 400
306
  },
307
  {
308
  "epoch": 0.05919786887672044,
309
+ "eval_loss": 5.192676544189453,
310
+ "eval_runtime": 115.1721,
311
+ "eval_samples_per_second": 13.041,
312
+ "eval_steps_per_second": 6.521,
313
  "step": 400
314
  }
315
  ],
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6df7b223289e39c6c6c7e2dd31221e01709ab3598ca49f26f218999f02b35204
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8953c8f78e882c2468a8fd9123e7e01d1ba2fa70223c087509fa59b852ab047
3
  size 6840