lewtun HF Staff commited on
Commit
503f651
·
verified ·
1 Parent(s): 08cef7d

Upload eval_results/AI-MO/deepseek-math-7b-sft/aimo_v00.00/math/results_2024-04-24T13-57-53.176523.json with huggingface_hub

Browse files
eval_results/AI-MO/deepseek-math-7b-sft/aimo_v00.00/math/results_2024-04-24T13-57-53.176523.json ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 2,
6
+ "max_samples": 10,
7
+ "job_id": "",
8
+ "start_time": 3652558.79848515,
9
+ "end_time": 3652962.355642761,
10
+ "total_evaluation_time_secondes": "403.55715761100873",
11
+ "model_name": "AI-MO/deepseek-math-7b-sft",
12
+ "model_sha": "748a5f5e457052be6c9476d87222e596fedefdb7",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.93 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|math:algebra|0": {
19
+ "qem": 0.7,
20
+ "qem_stderr": 0.15275252316519466
21
+ },
22
+ "lighteval|math:counting_and_probability|0": {
23
+ "qem": 0.5,
24
+ "qem_stderr": 0.16666666666666666
25
+ },
26
+ "lighteval|math:geometry|0": {
27
+ "qem": 0.3,
28
+ "qem_stderr": 0.15275252316519466
29
+ },
30
+ "lighteval|math:intermediate_algebra|0": {
31
+ "qem": 0.1,
32
+ "qem_stderr": 0.09999999999999999
33
+ },
34
+ "lighteval|math:number_theory|0": {
35
+ "qem": 0.2,
36
+ "qem_stderr": 0.13333333333333333
37
+ },
38
+ "lighteval|math:prealgebra|0": {
39
+ "qem": 0.6,
40
+ "qem_stderr": 0.16329931618554522
41
+ },
42
+ "lighteval|math:precalculus|0": {
43
+ "qem": 0.4,
44
+ "qem_stderr": 0.1632993161855452
45
+ },
46
+ "lighteval|math:_average|0": {
47
+ "qem": 0.39999999999999997,
48
+ "qem_stderr": 0.14744338267163995
49
+ },
50
+ "all": {
51
+ "qem": 0.39999999999999997,
52
+ "qem_stderr": 0.14744338267163995
53
+ }
54
+ },
55
+ "versions": {
56
+ "lighteval|math:algebra|0": 0,
57
+ "lighteval|math:counting_and_probability|0": 0,
58
+ "lighteval|math:geometry|0": 0,
59
+ "lighteval|math:intermediate_algebra|0": 0,
60
+ "lighteval|math:number_theory|0": 0,
61
+ "lighteval|math:prealgebra|0": 0,
62
+ "lighteval|math:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "lighteval|math:algebra": {
66
+ "name": "math:algebra",
67
+ "prompt_function": "math",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": [
85
+ "\n"
86
+ ],
87
+ "output_regex": null,
88
+ "frozen": false,
89
+ "suite": [
90
+ "lighteval",
91
+ "math"
92
+ ],
93
+ "original_num_docs": 1187,
94
+ "effective_num_docs": 10,
95
+ "trust_dataset": true,
96
+ "must_remove_duplicate_docs": null
97
+ },
98
+ "lighteval|math:counting_and_probability": {
99
+ "name": "math:counting_and_probability",
100
+ "prompt_function": "math",
101
+ "hf_repo": "lighteval/MATH",
102
+ "hf_subset": "counting_and_probability",
103
+ "metric": [
104
+ "quasi_exact_match_math"
105
+ ],
106
+ "hf_avail_splits": [
107
+ "train",
108
+ "test",
109
+ "validation"
110
+ ],
111
+ "evaluation_splits": [
112
+ "test"
113
+ ],
114
+ "few_shots_split": null,
115
+ "few_shots_select": null,
116
+ "generation_size": 2048,
117
+ "stop_sequence": [
118
+ "\n"
119
+ ],
120
+ "output_regex": null,
121
+ "frozen": false,
122
+ "suite": [
123
+ "lighteval",
124
+ "math"
125
+ ],
126
+ "original_num_docs": 474,
127
+ "effective_num_docs": 10,
128
+ "trust_dataset": true,
129
+ "must_remove_duplicate_docs": null
130
+ },
131
+ "lighteval|math:geometry": {
132
+ "name": "math:geometry",
133
+ "prompt_function": "math",
134
+ "hf_repo": "lighteval/MATH",
135
+ "hf_subset": "geometry",
136
+ "metric": [
137
+ "quasi_exact_match_math"
138
+ ],
139
+ "hf_avail_splits": [
140
+ "train",
141
+ "test",
142
+ "validation"
143
+ ],
144
+ "evaluation_splits": [
145
+ "test"
146
+ ],
147
+ "few_shots_split": null,
148
+ "few_shots_select": null,
149
+ "generation_size": 2048,
150
+ "stop_sequence": [
151
+ "\n"
152
+ ],
153
+ "output_regex": null,
154
+ "frozen": false,
155
+ "suite": [
156
+ "lighteval",
157
+ "math"
158
+ ],
159
+ "original_num_docs": 479,
160
+ "effective_num_docs": 10,
161
+ "trust_dataset": true,
162
+ "must_remove_duplicate_docs": null
163
+ },
164
+ "lighteval|math:intermediate_algebra": {
165
+ "name": "math:intermediate_algebra",
166
+ "prompt_function": "math",
167
+ "hf_repo": "lighteval/MATH",
168
+ "hf_subset": "intermediate_algebra",
169
+ "metric": [
170
+ "quasi_exact_match_math"
171
+ ],
172
+ "hf_avail_splits": [
173
+ "train",
174
+ "test",
175
+ "validation"
176
+ ],
177
+ "evaluation_splits": [
178
+ "test"
179
+ ],
180
+ "few_shots_split": null,
181
+ "few_shots_select": null,
182
+ "generation_size": 2048,
183
+ "stop_sequence": [
184
+ "\n"
185
+ ],
186
+ "output_regex": null,
187
+ "frozen": false,
188
+ "suite": [
189
+ "lighteval",
190
+ "math"
191
+ ],
192
+ "original_num_docs": 903,
193
+ "effective_num_docs": 10,
194
+ "trust_dataset": true,
195
+ "must_remove_duplicate_docs": null
196
+ },
197
+ "lighteval|math:number_theory": {
198
+ "name": "math:number_theory",
199
+ "prompt_function": "math",
200
+ "hf_repo": "lighteval/MATH",
201
+ "hf_subset": "number_theory",
202
+ "metric": [
203
+ "quasi_exact_match_math"
204
+ ],
205
+ "hf_avail_splits": [
206
+ "train",
207
+ "test",
208
+ "validation"
209
+ ],
210
+ "evaluation_splits": [
211
+ "test"
212
+ ],
213
+ "few_shots_split": null,
214
+ "few_shots_select": null,
215
+ "generation_size": 2048,
216
+ "stop_sequence": [
217
+ "\n"
218
+ ],
219
+ "output_regex": null,
220
+ "frozen": false,
221
+ "suite": [
222
+ "lighteval",
223
+ "math"
224
+ ],
225
+ "original_num_docs": 540,
226
+ "effective_num_docs": 10,
227
+ "trust_dataset": true,
228
+ "must_remove_duplicate_docs": null
229
+ },
230
+ "lighteval|math:prealgebra": {
231
+ "name": "math:prealgebra",
232
+ "prompt_function": "math",
233
+ "hf_repo": "lighteval/MATH",
234
+ "hf_subset": "prealgebra",
235
+ "metric": [
236
+ "quasi_exact_match_math"
237
+ ],
238
+ "hf_avail_splits": [
239
+ "train",
240
+ "test",
241
+ "validation"
242
+ ],
243
+ "evaluation_splits": [
244
+ "test"
245
+ ],
246
+ "few_shots_split": null,
247
+ "few_shots_select": null,
248
+ "generation_size": 2048,
249
+ "stop_sequence": [
250
+ "\n"
251
+ ],
252
+ "output_regex": null,
253
+ "frozen": false,
254
+ "suite": [
255
+ "lighteval",
256
+ "math"
257
+ ],
258
+ "original_num_docs": 871,
259
+ "effective_num_docs": 10,
260
+ "trust_dataset": true,
261
+ "must_remove_duplicate_docs": null
262
+ },
263
+ "lighteval|math:precalculus": {
264
+ "name": "math:precalculus",
265
+ "prompt_function": "math",
266
+ "hf_repo": "lighteval/MATH",
267
+ "hf_subset": "precalculus",
268
+ "metric": [
269
+ "quasi_exact_match_math"
270
+ ],
271
+ "hf_avail_splits": [
272
+ "train",
273
+ "test",
274
+ "validation"
275
+ ],
276
+ "evaluation_splits": [
277
+ "test"
278
+ ],
279
+ "few_shots_split": null,
280
+ "few_shots_select": null,
281
+ "generation_size": 2048,
282
+ "stop_sequence": [
283
+ "\n"
284
+ ],
285
+ "output_regex": null,
286
+ "frozen": false,
287
+ "suite": [
288
+ "lighteval",
289
+ "math"
290
+ ],
291
+ "original_num_docs": 546,
292
+ "effective_num_docs": 10,
293
+ "trust_dataset": true,
294
+ "must_remove_duplicate_docs": null
295
+ }
296
+ },
297
+ "summary_tasks": {
298
+ "lighteval|math:algebra|0": {
299
+ "hashes": {
300
+ "hash_examples": "a13d68854ca927ce",
301
+ "hash_full_prompts": "7e0d2b25e14caad6",
302
+ "hash_input_tokens": "58c8826560827dfc",
303
+ "hash_cont_tokens": "310523472b3267fc"
304
+ },
305
+ "truncated": 10,
306
+ "non_truncated": 0,
307
+ "padded": 3,
308
+ "non_padded": 7,
309
+ "effective_few_shots": 0.0,
310
+ "num_truncated_few_shots": 0
311
+ },
312
+ "lighteval|math:counting_and_probability|0": {
313
+ "hashes": {
314
+ "hash_examples": "a8004c36a2d9cb68",
315
+ "hash_full_prompts": "2acaf205499ed79c",
316
+ "hash_input_tokens": "d76018dbed1fcc48",
317
+ "hash_cont_tokens": "733b9c4cba844ec2"
318
+ },
319
+ "truncated": 10,
320
+ "non_truncated": 0,
321
+ "padded": 1,
322
+ "non_padded": 9,
323
+ "effective_few_shots": 0.0,
324
+ "num_truncated_few_shots": 0
325
+ },
326
+ "lighteval|math:geometry|0": {
327
+ "hashes": {
328
+ "hash_examples": "5e12e37f7378cc4c",
329
+ "hash_full_prompts": "32e7c26bfc66828d",
330
+ "hash_input_tokens": "288b95f1a6a9a3d5",
331
+ "hash_cont_tokens": "8d9422af27507fe7"
332
+ },
333
+ "truncated": 10,
334
+ "non_truncated": 0,
335
+ "padded": 5,
336
+ "non_padded": 5,
337
+ "effective_few_shots": 0.0,
338
+ "num_truncated_few_shots": 0
339
+ },
340
+ "lighteval|math:intermediate_algebra|0": {
341
+ "hashes": {
342
+ "hash_examples": "71738fc49d471d6d",
343
+ "hash_full_prompts": "6779f5c079af81a6",
344
+ "hash_input_tokens": "53a5702086e49106",
345
+ "hash_cont_tokens": "6f8722f0a58ef37a"
346
+ },
347
+ "truncated": 10,
348
+ "non_truncated": 0,
349
+ "padded": 3,
350
+ "non_padded": 7,
351
+ "effective_few_shots": 0.0,
352
+ "num_truncated_few_shots": 0
353
+ },
354
+ "lighteval|math:number_theory|0": {
355
+ "hashes": {
356
+ "hash_examples": "bdb66471a0eed93a",
357
+ "hash_full_prompts": "9985b650f03b8f91",
358
+ "hash_input_tokens": "8b240f273f300e85",
359
+ "hash_cont_tokens": "ae8bd3b0e9f74ac5"
360
+ },
361
+ "truncated": 10,
362
+ "non_truncated": 0,
363
+ "padded": 5,
364
+ "non_padded": 5,
365
+ "effective_few_shots": 0.0,
366
+ "num_truncated_few_shots": 0
367
+ },
368
+ "lighteval|math:prealgebra|0": {
369
+ "hashes": {
370
+ "hash_examples": "3c59373ec7e3a94a",
371
+ "hash_full_prompts": "722cea4098cecd00",
372
+ "hash_input_tokens": "6788548239da9c91",
373
+ "hash_cont_tokens": "c2f2eaca08a1e171"
374
+ },
375
+ "truncated": 10,
376
+ "non_truncated": 0,
377
+ "padded": 1,
378
+ "non_padded": 9,
379
+ "effective_few_shots": 0.0,
380
+ "num_truncated_few_shots": 0
381
+ },
382
+ "lighteval|math:precalculus|0": {
383
+ "hashes": {
384
+ "hash_examples": "8a97d7d7bd780ca3",
385
+ "hash_full_prompts": "df2793e826f0dfcc",
386
+ "hash_input_tokens": "90b2332eeca3284b",
387
+ "hash_cont_tokens": "56aff6344490817d"
388
+ },
389
+ "truncated": 10,
390
+ "non_truncated": 0,
391
+ "padded": 4,
392
+ "non_padded": 6,
393
+ "effective_few_shots": 0.0,
394
+ "num_truncated_few_shots": 0
395
+ }
396
+ },
397
+ "summary_general": {
398
+ "hashes": {
399
+ "hash_examples": "e05a305d27ed0540",
400
+ "hash_full_prompts": "7847980860d96b71",
401
+ "hash_input_tokens": "a2515743ef01f5cf",
402
+ "hash_cont_tokens": "734c8abceb12e43b"
403
+ },
404
+ "truncated": 70,
405
+ "non_truncated": 0,
406
+ "padded": 22,
407
+ "non_padded": 48,
408
+ "num_truncated_few_shots": 0
409
+ }
410
+ }