lewtun HF Staff commited on
Commit
3ba829d
·
1 Parent(s): 74770f7

Remove false merge evals

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json +0 -174
  2. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json +0 -138
  3. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json +0 -114
  4. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json +0 -174
  5. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json +0 -138
  6. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json +0 -114
  7. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json +0 -138
  8. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json +0 -114
  9. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json +0 -174
  10. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json +0 -138
  11. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json +0 -114
  12. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json +0 -174
  13. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json +0 -138
  14. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json +0 -114
  15. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json +0 -174
  16. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json +0 -138
  17. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json +0 -114
  18. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json +0 -174
  19. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json +0 -138
  20. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json +0 -114
  21. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json +0 -174
  22. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json +0 -138
  23. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json +0 -114
  24. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json +0 -174
  25. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json +0 -138
  26. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json +0 -114
  27. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json +0 -174
  28. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json +0 -138
  29. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json +0 -114
  30. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json +0 -174
  31. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json +0 -138
  32. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json +0 -114
  33. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json +0 -174
  34. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json +0 -138
  35. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json +0 -114
  36. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json +0 -174
  37. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json +0 -138
  38. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json +0 -114
  39. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json +0 -174
  40. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json +0 -138
  41. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json +0 -114
  42. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json +0 -174
  43. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json +0 -138
  44. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json +0 -114
  45. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json +0 -174
  46. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json +0 -138
  47. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json +0 -114
  48. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json +0 -174
  49. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json +0 -138
  50. eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json +0 -114
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4229414.83378225,
8
- "end_time": 4233704.73810174,
9
- "total_evaluation_time_secondes": "4289.904319490306",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.3,
35
- "math_pass@1:1_samples_stderr": 0.0850962943396763,
36
- "math_pass@1:4_samples": 0.30833333333333335,
37
- "math_pass@1:4_samples_stderr": 0.06639672359547992,
38
- "math_pass@1:8_samples": 0.32083333333333336,
39
- "math_pass@1:8_samples_stderr": 0.0663831976271335,
40
- "math_pass@1:16_samples": 0.3375,
41
- "math_pass@1:16_samples_stderr": 0.06514517386699431,
42
- "math_pass@1:32_samples": 0.34375,
43
- "math_pass@1:32_samples_stderr": 0.06482735693283008,
44
- "math_pass@1:64_samples": 0.34010416666666665,
45
- "math_pass@1:64_samples_stderr": 0.06450050595768915
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.3,
49
- "math_pass@1:1_samples_stderr": 0.0850962943396763,
50
- "math_pass@1:4_samples": 0.30833333333333335,
51
- "math_pass@1:4_samples_stderr": 0.06639672359547992,
52
- "math_pass@1:8_samples": 0.32083333333333336,
53
- "math_pass@1:8_samples_stderr": 0.0663831976271335,
54
- "math_pass@1:16_samples": 0.3375,
55
- "math_pass@1:16_samples_stderr": 0.06514517386699431,
56
- "math_pass@1:32_samples": 0.34375,
57
- "math_pass@1:32_samples_stderr": 0.06482735693283008,
58
- "math_pass@1:64_samples": 0.34010416666666665,
59
- "math_pass@1:64_samples_stderr": 0.06450050595768915
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "a06d62ae60da2e78"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "15d38a88ca396a91"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 255139.778479921,
8
- "end_time": 257187.860357864,
9
- "total_evaluation_time_secondes": "2048.081877943012",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.42424242424242425,
35
- "gpqa_pass@1:1_samples_stderr": 0.035212249088415824,
36
- "gpqa_pass@1:4_samples": 0.4595959595959596,
37
- "gpqa_pass@1:4_samples_stderr": 0.026456334635227167,
38
- "gpqa_pass@1:8_samples": 0.4659090909090909,
39
- "gpqa_pass@1:8_samples_stderr": 0.024944118627141743
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.42424242424242425,
43
- "gpqa_pass@1:1_samples_stderr": 0.035212249088415824,
44
- "gpqa_pass@1:4_samples": 0.4595959595959596,
45
- "gpqa_pass@1:4_samples_stderr": 0.026456334635227167,
46
- "gpqa_pass@1:8_samples": 0.4659090909090909,
47
- "gpqa_pass@1:8_samples_stderr": 0.024944118627141743
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "e71088c1c1cb0672"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "82854e16d8413332"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 255139.778483932,
8
- "end_time": 260156.247424606,
9
- "total_evaluation_time_secondes": "5016.468940674007",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.1547029702970297,
35
- "codegen_pass@1:16_stderr": 0.025710721439537176
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.1547029702970297,
39
- "codegen_pass@1:16_stderr": 0.025710721439537176
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "d5c1f33cd610d8c2"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "4fd58e23485de17c"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4203294.857831319,
8
- "end_time": 4209677.501601624,
9
- "total_evaluation_time_secondes": "6382.64377030544",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.3333333333333333,
35
- "math_pass@1:1_samples_stderr": 0.0875376219064817,
36
- "math_pass@1:4_samples": 0.35,
37
- "math_pass@1:4_samples_stderr": 0.07050719373096685,
38
- "math_pass@1:8_samples": 0.3375,
39
- "math_pass@1:8_samples_stderr": 0.06486889962016494,
40
- "math_pass@1:16_samples": 0.3541666666666667,
41
- "math_pass@1:16_samples_stderr": 0.06415435021369162,
42
- "math_pass@1:32_samples": 0.33645833333333336,
43
- "math_pass@1:32_samples_stderr": 0.06365109759161156,
44
- "math_pass@1:64_samples": 0.3302083333333334,
45
- "math_pass@1:64_samples_stderr": 0.06445720210168741
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.3333333333333333,
49
- "math_pass@1:1_samples_stderr": 0.0875376219064817,
50
- "math_pass@1:4_samples": 0.35,
51
- "math_pass@1:4_samples_stderr": 0.07050719373096685,
52
- "math_pass@1:8_samples": 0.3375,
53
- "math_pass@1:8_samples_stderr": 0.06486889962016494,
54
- "math_pass@1:16_samples": 0.3541666666666667,
55
- "math_pass@1:16_samples_stderr": 0.06415435021369162,
56
- "math_pass@1:32_samples": 0.33645833333333336,
57
- "math_pass@1:32_samples_stderr": 0.06365109759161156,
58
- "math_pass@1:64_samples": 0.3302083333333334,
59
- "math_pass@1:64_samples_stderr": 0.06445720210168741
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "0e5a5f1153950a6a"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "32344497527b5d97"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4203294.857834889,
8
- "end_time": 4206332.097575001,
9
- "total_evaluation_time_secondes": "3037.2397401127964",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.42424242424242425,
35
- "gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
36
- "gpqa_pass@1:4_samples": 0.4621212121212121,
37
- "gpqa_pass@1:4_samples_stderr": 0.025425623214011414,
38
- "gpqa_pass@1:8_samples": 0.442550505050505,
39
- "gpqa_pass@1:8_samples_stderr": 0.02303993298372008
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.42424242424242425,
43
- "gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
44
- "gpqa_pass@1:4_samples": 0.4621212121212121,
45
- "gpqa_pass@1:4_samples_stderr": 0.025425623214011414,
46
- "gpqa_pass@1:8_samples": 0.442550505050505,
47
- "gpqa_pass@1:8_samples_stderr": 0.02303993298372008
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "7af1a756034c5bbc"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "6eb8ae5f37259377"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4229414.83378778,
8
- "end_time": 4236185.516046914,
9
- "total_evaluation_time_secondes": "6770.682259134017",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.19183168316831684,
35
- "codegen_pass@1:16_stderr": 0.029331683168316847
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.19183168316831684,
39
- "codegen_pass@1:16_stderr": 0.029331683168316847
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "9e98481ea25c6036"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "cbf7ec0857e47d2e"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 661654.831401038,
8
- "end_time": 662374.12054711,
9
- "total_evaluation_time_secondes": "719.2891460719984",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.29797979797979796,
35
- "gpqa_pass@1:1_samples_stderr": 0.03258630383836555,
36
- "gpqa_pass@1:4_samples": 0.2828282828282828,
37
- "gpqa_pass@1:4_samples_stderr": 0.020745116201625495,
38
- "gpqa_pass@1:8_samples": 0.29419191919191917,
39
- "gpqa_pass@1:8_samples_stderr": 0.017370591329565217
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.29797979797979796,
43
- "gpqa_pass@1:1_samples_stderr": 0.03258630383836555,
44
- "gpqa_pass@1:4_samples": 0.2828282828282828,
45
- "gpqa_pass@1:4_samples_stderr": 0.020745116201625495,
46
- "gpqa_pass@1:8_samples": 0.29419191919191917,
47
- "gpqa_pass@1:8_samples_stderr": 0.017370591329565217
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "266022d5f2e05638"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "60eaf58a791fe18e"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1727441.946810529,
8
- "end_time": 1730120.952128118,
9
- "total_evaluation_time_secondes": "2679.0053175888024",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.0024752475247524753,
35
- "codegen_pass@1:16_stderr": 0.0015030888368182086
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.0024752475247524753,
39
- "codegen_pass@1:16_stderr": 0.0015030888368182086
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "c9c2c4597e65c9d2"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "e1b3bd8b8c45b5b3"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4375790.455053379,
8
- "end_time": 4393182.469945202,
9
- "total_evaluation_time_secondes": "17392.014891822822",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.06666666666666667,
35
- "math_pass@1:1_samples_stderr": 0.046320555585310084,
36
- "math_pass@1:4_samples": 0.041666666666666664,
37
- "math_pass@1:4_samples_stderr": 0.02104774721520688,
38
- "math_pass@1:8_samples": 0.04583333333333333,
39
- "math_pass@1:8_samples_stderr": 0.022808653496633722,
40
- "math_pass@1:16_samples": 0.05,
41
- "math_pass@1:16_samples_stderr": 0.020847696198419845,
42
- "math_pass@1:32_samples": 0.053125,
43
- "math_pass@1:32_samples_stderr": 0.023507438752959405,
44
- "math_pass@1:64_samples": 0.04791666666666667,
45
- "math_pass@1:64_samples_stderr": 0.02018476822495169
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.06666666666666667,
49
- "math_pass@1:1_samples_stderr": 0.046320555585310084,
50
- "math_pass@1:4_samples": 0.041666666666666664,
51
- "math_pass@1:4_samples_stderr": 0.02104774721520688,
52
- "math_pass@1:8_samples": 0.04583333333333333,
53
- "math_pass@1:8_samples_stderr": 0.022808653496633722,
54
- "math_pass@1:16_samples": 0.05,
55
- "math_pass@1:16_samples_stderr": 0.020847696198419845,
56
- "math_pass@1:32_samples": 0.053125,
57
- "math_pass@1:32_samples_stderr": 0.023507438752959405,
58
- "math_pass@1:64_samples": 0.04791666666666667,
59
- "math_pass@1:64_samples_stderr": 0.02018476822495169
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "d19a02dc5ef5f1cc"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "75cbaf1a9fa10dae"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 662431.93175746,
8
- "end_time": 663129.029831021,
9
- "total_evaluation_time_secondes": "697.0980735609774",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.3383838383838384,
35
- "gpqa_pass@1:1_samples_stderr": 0.03371124142626305,
36
- "gpqa_pass@1:4_samples": 0.31565656565656564,
37
- "gpqa_pass@1:4_samples_stderr": 0.019948058896725632,
38
- "gpqa_pass@1:8_samples": 0.3194444444444444,
39
- "gpqa_pass@1:8_samples_stderr": 0.017458419547239812
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.3383838383838384,
43
- "gpqa_pass@1:1_samples_stderr": 0.03371124142626305,
44
- "gpqa_pass@1:4_samples": 0.31565656565656564,
45
- "gpqa_pass@1:4_samples_stderr": 0.019948058896725632,
46
- "gpqa_pass@1:8_samples": 0.3194444444444444,
47
- "gpqa_pass@1:8_samples_stderr": 0.017458419547239812
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "ec9269c9a892ac2b"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "916c86d0d9b7e51f"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1034056.23321307,
8
- "end_time": 1036357.266139834,
9
- "total_evaluation_time_secondes": "2301.03292676399",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.018564356435643563,
35
- "codegen_pass@1:16_stderr": 0.006729419168462321
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.018564356435643563,
39
- "codegen_pass@1:16_stderr": 0.006729419168462321
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "8cca433c93724a34"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "b35bed0f5fac56be"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4379509.827509476,
8
- "end_time": 4386019.321696337,
9
- "total_evaluation_time_secondes": "6509.4941868614405",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.1,
35
- "math_pass@1:1_samples_stderr": 0.055708601453115535,
36
- "math_pass@1:4_samples": 0.10833333333333334,
37
- "math_pass@1:4_samples_stderr": 0.04433955534939854,
38
- "math_pass@1:8_samples": 0.0875,
39
- "math_pass@1:8_samples_stderr": 0.036529589643742644,
40
- "math_pass@1:16_samples": 0.07916666666666666,
41
- "math_pass@1:16_samples_stderr": 0.028100377685248996,
42
- "math_pass@1:32_samples": 0.078125,
43
- "math_pass@1:32_samples_stderr": 0.028140959711881665,
44
- "math_pass@1:64_samples": 0.0838541666666667,
45
- "math_pass@1:64_samples_stderr": 0.028796300226399354
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.1,
49
- "math_pass@1:1_samples_stderr": 0.055708601453115535,
50
- "math_pass@1:4_samples": 0.10833333333333334,
51
- "math_pass@1:4_samples_stderr": 0.04433955534939854,
52
- "math_pass@1:8_samples": 0.0875,
53
- "math_pass@1:8_samples_stderr": 0.036529589643742644,
54
- "math_pass@1:16_samples": 0.07916666666666666,
55
- "math_pass@1:16_samples_stderr": 0.028100377685248996,
56
- "math_pass@1:32_samples": 0.078125,
57
- "math_pass@1:32_samples_stderr": 0.028140959711881665,
58
- "math_pass@1:64_samples": 0.0838541666666667,
59
- "math_pass@1:64_samples_stderr": 0.028796300226399354
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "002199bc178ae8cb"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "3e3c04aad02ace80"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4100452.1099866,
8
- "end_time": 4101297.327114199,
9
- "total_evaluation_time_secondes": "845.2171275988221",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.3383838383838384,
35
- "gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
36
- "gpqa_pass@1:4_samples": 0.32954545454545453,
37
- "gpqa_pass@1:4_samples_stderr": 0.021141754316903207,
38
- "gpqa_pass@1:8_samples": 0.3402777777777778,
39
- "gpqa_pass@1:8_samples_stderr": 0.019152531335331204
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.3383838383838384,
43
- "gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
44
- "gpqa_pass@1:4_samples": 0.32954545454545453,
45
- "gpqa_pass@1:4_samples_stderr": 0.021141754316903207,
46
- "gpqa_pass@1:8_samples": 0.3402777777777778,
47
- "gpqa_pass@1:8_samples_stderr": 0.019152531335331204
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "2c81421b70193adb"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "36d252e8d20884d0"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 663217.335620686,
8
- "end_time": 665135.776709884,
9
- "total_evaluation_time_secondes": "1918.4410891981097",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.04084158415841584,
35
- "codegen_pass@1:16_stderr": 0.012008777587382542
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.04084158415841584,
39
- "codegen_pass@1:16_stderr": 0.012008777587382542
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "769ba85444eb5c3a"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "f2e5e2faa5b55b56"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1786741.173175074,
8
- "end_time": 1788880.427720995,
9
- "total_evaluation_time_secondes": "2139.254545920994",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.06666666666666667,
35
- "math_pass@1:1_samples_stderr": 0.046320555585310084,
36
- "math_pass@1:4_samples": 0.10833333333333334,
37
- "math_pass@1:4_samples_stderr": 0.04433955534939852,
38
- "math_pass@1:8_samples": 0.12916666666666668,
39
- "math_pass@1:8_samples_stderr": 0.04812703695246316,
40
- "math_pass@1:16_samples": 0.11666666666666667,
41
- "math_pass@1:16_samples_stderr": 0.04039733214513608,
42
- "math_pass@1:32_samples": 0.10937499999999999,
43
- "math_pass@1:32_samples_stderr": 0.03777883642096245,
44
- "math_pass@1:64_samples": 0.11250000000000003,
45
- "math_pass@1:64_samples_stderr": 0.038334816937806986
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.06666666666666667,
49
- "math_pass@1:1_samples_stderr": 0.046320555585310084,
50
- "math_pass@1:4_samples": 0.10833333333333334,
51
- "math_pass@1:4_samples_stderr": 0.04433955534939852,
52
- "math_pass@1:8_samples": 0.12916666666666668,
53
- "math_pass@1:8_samples_stderr": 0.04812703695246316,
54
- "math_pass@1:16_samples": 0.11666666666666667,
55
- "math_pass@1:16_samples_stderr": 0.04039733214513608,
56
- "math_pass@1:32_samples": 0.10937499999999999,
57
- "math_pass@1:32_samples_stderr": 0.03777883642096245,
58
- "math_pass@1:64_samples": 0.11250000000000003,
59
- "math_pass@1:64_samples_stderr": 0.038334816937806986
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "390196d6efde02bb"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "f95079649d32b7fb"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1448505.258795178,
8
- "end_time": 1449311.597771412,
9
- "total_evaluation_time_secondes": "806.3389762339648",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.35353535353535354,
35
- "gpqa_pass@1:1_samples_stderr": 0.03406086723547153,
36
- "gpqa_pass@1:4_samples": 0.34974747474747475,
37
- "gpqa_pass@1:4_samples_stderr": 0.020933262798829153,
38
- "gpqa_pass@1:8_samples": 0.34974747474747475,
39
- "gpqa_pass@1:8_samples_stderr": 0.018901969553475017
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.35353535353535354,
43
- "gpqa_pass@1:1_samples_stderr": 0.03406086723547153,
44
- "gpqa_pass@1:4_samples": 0.34974747474747475,
45
- "gpqa_pass@1:4_samples_stderr": 0.020933262798829153,
46
- "gpqa_pass@1:8_samples": 0.34974747474747475,
47
- "gpqa_pass@1:8_samples_stderr": 0.018901969553475017
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "db8e9046047f49d2"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "9da8b4866f2c2766"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 139762.663140377,
8
- "end_time": 141889.570223524,
9
- "total_evaluation_time_secondes": "2126.9070831470017",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.05631188118811881,
35
- "codegen_pass@1:16_stderr": 0.015118778650913064
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.05631188118811881,
39
- "codegen_pass@1:16_stderr": 0.015118778650913064
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "63432947237b0029"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "65d78b27534d12ab"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 191200.869072273,
8
- "end_time": 193465.549947105,
9
- "total_evaluation_time_secondes": "2264.6808748320036",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.13333333333333333,
35
- "math_pass@1:1_samples_stderr": 0.06312427686319994,
36
- "math_pass@1:4_samples": 0.15833333333333333,
37
- "math_pass@1:4_samples_stderr": 0.04716584419087129,
38
- "math_pass@1:8_samples": 0.14583333333333331,
39
- "math_pass@1:8_samples_stderr": 0.04283561423329068,
40
- "math_pass@1:16_samples": 0.16875,
41
- "math_pass@1:16_samples_stderr": 0.04634155275230052,
42
- "math_pass@1:32_samples": 0.15729166666666664,
43
- "math_pass@1:32_samples_stderr": 0.04530991949978092,
44
- "math_pass@1:64_samples": 0.15052083333333335,
45
- "math_pass@1:64_samples_stderr": 0.04492772455875969
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.13333333333333333,
49
- "math_pass@1:1_samples_stderr": 0.06312427686319994,
50
- "math_pass@1:4_samples": 0.15833333333333333,
51
- "math_pass@1:4_samples_stderr": 0.04716584419087129,
52
- "math_pass@1:8_samples": 0.14583333333333331,
53
- "math_pass@1:8_samples_stderr": 0.04283561423329068,
54
- "math_pass@1:16_samples": 0.16875,
55
- "math_pass@1:16_samples_stderr": 0.04634155275230052,
56
- "math_pass@1:32_samples": 0.15729166666666664,
57
- "math_pass@1:32_samples_stderr": 0.04530991949978092,
58
- "math_pass@1:64_samples": 0.15052083333333335,
59
- "math_pass@1:64_samples_stderr": 0.04492772455875969
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "64bb8b614a09d95e"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "cec7cf7bfce92574"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4101347.219924198,
8
- "end_time": 4102199.893680966,
9
- "total_evaluation_time_secondes": "852.6737567679957",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.36363636363636365,
35
- "gpqa_pass@1:1_samples_stderr": 0.034273086529999344,
36
- "gpqa_pass@1:4_samples": 0.35858585858585856,
37
- "gpqa_pass@1:4_samples_stderr": 0.023017398790703834,
38
- "gpqa_pass@1:8_samples": 0.3491161616161616,
39
- "gpqa_pass@1:8_samples_stderr": 0.019574428228765074
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.36363636363636365,
43
- "gpqa_pass@1:1_samples_stderr": 0.034273086529999344,
44
- "gpqa_pass@1:4_samples": 0.35858585858585856,
45
- "gpqa_pass@1:4_samples_stderr": 0.023017398790703834,
46
- "gpqa_pass@1:8_samples": 0.3491161616161616,
47
- "gpqa_pass@1:8_samples_stderr": 0.019574428228765074
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "2d700f06d1a2b9e8"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "8165d55bdeb1c114"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1449399.391339998,
8
- "end_time": 1451425.492734413,
9
- "total_evaluation_time_secondes": "2026.1013944149017",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.07116336633663366,
35
- "codegen_pass@1:16_stderr": 0.01817453760822587
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.07116336633663366,
39
- "codegen_pass@1:16_stderr": 0.01817453760822587
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "91ac6717ee31129b"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "2405745051128cc6"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1730268.444026561,
8
- "end_time": 1732185.82558593,
9
- "total_evaluation_time_secondes": "1917.3815593689214",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.1,
35
- "math_pass@1:1_samples_stderr": 0.055708601453115555,
36
- "math_pass@1:4_samples": 0.19166666666666668,
37
- "math_pass@1:4_samples_stderr": 0.05708852438851644,
38
- "math_pass@1:8_samples": 0.17916666666666667,
39
- "math_pass@1:8_samples_stderr": 0.04894113478792151,
40
- "math_pass@1:16_samples": 0.175,
41
- "math_pass@1:16_samples_stderr": 0.05053592099036909,
42
- "math_pass@1:32_samples": 0.17291666666666664,
43
- "math_pass@1:32_samples_stderr": 0.04727993993552142,
44
- "math_pass@1:64_samples": 0.17343750000000005,
45
- "math_pass@1:64_samples_stderr": 0.0472883475530281
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.1,
49
- "math_pass@1:1_samples_stderr": 0.055708601453115555,
50
- "math_pass@1:4_samples": 0.19166666666666668,
51
- "math_pass@1:4_samples_stderr": 0.05708852438851644,
52
- "math_pass@1:8_samples": 0.17916666666666667,
53
- "math_pass@1:8_samples_stderr": 0.04894113478792151,
54
- "math_pass@1:16_samples": 0.175,
55
- "math_pass@1:16_samples_stderr": 0.05053592099036909,
56
- "math_pass@1:32_samples": 0.17291666666666664,
57
- "math_pass@1:32_samples_stderr": 0.04727993993552142,
58
- "math_pass@1:64_samples": 0.17343750000000005,
59
- "math_pass@1:64_samples_stderr": 0.0472883475530281
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "baad61c00784bbc6"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "1d2d658abf54d77a"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4102309.818978777,
8
- "end_time": 4103334.357564577,
9
- "total_evaluation_time_secondes": "1024.5385857997462",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.3383838383838384,
35
- "gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
36
- "gpqa_pass@1:4_samples": 0.37626262626262624,
37
- "gpqa_pass@1:4_samples_stderr": 0.023116332418314938,
38
- "gpqa_pass@1:8_samples": 0.37941919191919193,
39
- "gpqa_pass@1:8_samples_stderr": 0.020691066749223084
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.3383838383838384,
43
- "gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
44
- "gpqa_pass@1:4_samples": 0.37626262626262624,
45
- "gpqa_pass@1:4_samples_stderr": 0.023116332418314938,
46
- "gpqa_pass@1:8_samples": 0.37941919191919193,
47
- "gpqa_pass@1:8_samples_stderr": 0.020691066749223084
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "f67476bc22ade0f4"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "e01d5e8a161d251b"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 665261.659275858,
8
- "end_time": 667567.554631831,
9
- "total_evaluation_time_secondes": "2305.8953559730435",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.0983910891089109,
35
- "codegen_pass@1:16_stderr": 0.01992689560651361
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.0983910891089109,
39
- "codegen_pass@1:16_stderr": 0.01992689560651361
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "99f5478f5867964b"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "c6ae0af3cc4b2079"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1036513.965434865,
8
- "end_time": 1038606.671981354,
9
- "total_evaluation_time_secondes": "2092.706546489033",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.3,
35
- "math_pass@1:1_samples_stderr": 0.0850962943396763,
36
- "math_pass@1:4_samples": 0.25833333333333336,
37
- "math_pass@1:4_samples_stderr": 0.060509490721976415,
38
- "math_pass@1:8_samples": 0.23333333333333334,
39
- "math_pass@1:8_samples_stderr": 0.056136806073810275,
40
- "math_pass@1:16_samples": 0.23541666666666666,
41
- "math_pass@1:16_samples_stderr": 0.05651544435100526,
42
- "math_pass@1:32_samples": 0.23333333333333334,
43
- "math_pass@1:32_samples_stderr": 0.05619676067142738,
44
- "math_pass@1:64_samples": 0.225,
45
- "math_pass@1:64_samples_stderr": 0.05496823468797815
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.3,
49
- "math_pass@1:1_samples_stderr": 0.0850962943396763,
50
- "math_pass@1:4_samples": 0.25833333333333336,
51
- "math_pass@1:4_samples_stderr": 0.060509490721976415,
52
- "math_pass@1:8_samples": 0.23333333333333334,
53
- "math_pass@1:8_samples_stderr": 0.056136806073810275,
54
- "math_pass@1:16_samples": 0.23541666666666666,
55
- "math_pass@1:16_samples_stderr": 0.05651544435100526,
56
- "math_pass@1:32_samples": 0.23333333333333334,
57
- "math_pass@1:32_samples_stderr": 0.05619676067142738,
58
- "math_pass@1:64_samples": 0.225,
59
- "math_pass@1:64_samples_stderr": 0.05496823468797815
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "35294f76776e888f"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "1a1b4593f16e83ad"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1788957.695045191,
8
- "end_time": 1790333.486523546,
9
- "total_evaluation_time_secondes": "1375.7914783549495",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.41919191919191917,
35
- "gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
36
- "gpqa_pass@1:4_samples": 0.38762626262626265,
37
- "gpqa_pass@1:4_samples_stderr": 0.024117072654924694,
38
- "gpqa_pass@1:8_samples": 0.3952020202020202,
39
- "gpqa_pass@1:8_samples_stderr": 0.02069180007019445
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.41919191919191917,
43
- "gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
44
- "gpqa_pass@1:4_samples": 0.38762626262626265,
45
- "gpqa_pass@1:4_samples_stderr": 0.024117072654924694,
46
- "gpqa_pass@1:8_samples": 0.3952020202020202,
47
- "gpqa_pass@1:8_samples_stderr": 0.02069180007019445
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "ed5bee871908dfc5"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "9265af1b081a4052"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 142042.086601042,
8
- "end_time": 144975.10104017,
9
- "total_evaluation_time_secondes": "2933.01443912799",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.1219059405940594,
35
- "codegen_pass@1:16_stderr": 0.02220405336387271
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.1219059405940594,
39
- "codegen_pass@1:16_stderr": 0.02220405336387271
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "04c3944ff3ccf570"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "a7aea995096e87e1"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 193540.0209108,
8
- "end_time": 195561.734842471,
9
- "total_evaluation_time_secondes": "2021.7139316709945",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.3333333333333333,
35
- "math_pass@1:1_samples_stderr": 0.08753762190648169,
36
- "math_pass@1:4_samples": 0.26666666666666666,
37
- "math_pass@1:4_samples_stderr": 0.06447548466359415,
38
- "math_pass@1:8_samples": 0.3,
39
- "math_pass@1:8_samples_stderr": 0.06608945522512666,
40
- "math_pass@1:16_samples": 0.30833333333333335,
41
- "math_pass@1:16_samples_stderr": 0.06537451054006056,
42
- "math_pass@1:32_samples": 0.3,
43
- "math_pass@1:32_samples_stderr": 0.06204214478791862,
44
- "math_pass@1:64_samples": 0.28958333333333336,
45
- "math_pass@1:64_samples_stderr": 0.06107266238666994
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.3333333333333333,
49
- "math_pass@1:1_samples_stderr": 0.08753762190648169,
50
- "math_pass@1:4_samples": 0.26666666666666666,
51
- "math_pass@1:4_samples_stderr": 0.06447548466359415,
52
- "math_pass@1:8_samples": 0.3,
53
- "math_pass@1:8_samples_stderr": 0.06608945522512666,
54
- "math_pass@1:16_samples": 0.30833333333333335,
55
- "math_pass@1:16_samples_stderr": 0.06537451054006056,
56
- "math_pass@1:32_samples": 0.3,
57
- "math_pass@1:32_samples_stderr": 0.06204214478791862,
58
- "math_pass@1:64_samples": 0.28958333333333336,
59
- "math_pass@1:64_samples_stderr": 0.06107266238666994
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "9c99493d67fe2f24"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "bed1b477cfa0b1ff"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 666253.876774008,
8
- "end_time": 668195.952495732,
9
- "total_evaluation_time_secondes": "1942.0757217239588",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.42424242424242425,
35
- "gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
36
- "gpqa_pass@1:4_samples": 0.43308080808080807,
37
- "gpqa_pass@1:4_samples_stderr": 0.024044484352700964,
38
- "gpqa_pass@1:8_samples": 0.42803030303030304,
39
- "gpqa_pass@1:8_samples_stderr": 0.020899215514001906
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.42424242424242425,
43
- "gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
44
- "gpqa_pass@1:4_samples": 0.43308080808080807,
45
- "gpqa_pass@1:4_samples_stderr": 0.024044484352700964,
46
- "gpqa_pass@1:8_samples": 0.42803030303030304,
47
- "gpqa_pass@1:8_samples_stderr": 0.020899215514001906
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "0124da80a3b3148c"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "640d6ef92615cdc9"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4103397.762373666,
8
- "end_time": 4107093.315890551,
9
- "total_evaluation_time_secondes": "3695.5535168852657",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.13242574257425743,
35
- "codegen_pass@1:16_stderr": 0.023340448876772665
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.13242574257425743,
39
- "codegen_pass@1:16_stderr": 0.023340448876772665
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "bbd21ba6aa3e9ebc"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "42b830bb8babc2e2"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 3105542.202297136,
8
- "end_time": 3108662.362517458,
9
- "total_evaluation_time_secondes": "3120.1602203217335",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.23333333333333334,
35
- "math_pass@1:1_samples_stderr": 0.07854032324531728,
36
- "math_pass@1:4_samples": 0.2833333333333333,
37
- "math_pass@1:4_samples_stderr": 0.06645079993694987,
38
- "math_pass@1:8_samples": 0.325,
39
- "math_pass@1:8_samples_stderr": 0.06581714481595306,
40
- "math_pass@1:16_samples": 0.3375,
41
- "math_pass@1:16_samples_stderr": 0.06596705464738117,
42
- "math_pass@1:32_samples": 0.33333333333333326,
43
- "math_pass@1:32_samples_stderr": 0.06514402515511274,
44
- "math_pass@1:64_samples": 0.3302083333333333,
45
- "math_pass@1:64_samples_stderr": 0.06416922063509661
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.23333333333333334,
49
- "math_pass@1:1_samples_stderr": 0.07854032324531728,
50
- "math_pass@1:4_samples": 0.2833333333333333,
51
- "math_pass@1:4_samples_stderr": 0.06645079993694987,
52
- "math_pass@1:8_samples": 0.325,
53
- "math_pass@1:8_samples_stderr": 0.06581714481595306,
54
- "math_pass@1:16_samples": 0.3375,
55
- "math_pass@1:16_samples_stderr": 0.06596705464738117,
56
- "math_pass@1:32_samples": 0.33333333333333326,
57
- "math_pass@1:32_samples_stderr": 0.06514402515511274,
58
- "math_pass@1:64_samples": 0.3302083333333333,
59
- "math_pass@1:64_samples_stderr": 0.06416922063509661
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "ad2c9153cc62d80c"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "39dba3b375f55f0a"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1451567.197422397,
8
- "end_time": 1454324.521386077,
9
- "total_evaluation_time_secondes": "2757.323963679839",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.47474747474747475,
35
- "gpqa_pass@1:1_samples_stderr": 0.03557806245087314,
36
- "gpqa_pass@1:4_samples": 0.44065656565656564,
37
- "gpqa_pass@1:4_samples_stderr": 0.02492886595604172,
38
- "gpqa_pass@1:8_samples": 0.43813131313131315,
39
- "gpqa_pass@1:8_samples_stderr": 0.022425998553299985
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.47474747474747475,
43
- "gpqa_pass@1:1_samples_stderr": 0.03557806245087314,
44
- "gpqa_pass@1:4_samples": 0.44065656565656564,
45
- "gpqa_pass@1:4_samples_stderr": 0.02492886595604172,
46
- "gpqa_pass@1:8_samples": 0.43813131313131315,
47
- "gpqa_pass@1:8_samples_stderr": 0.022425998553299985
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "53e37624af23b9ca"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "a1cb53db5b9766ea"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1732255.309611542,
8
- "end_time": 1737024.368172801,
9
- "total_evaluation_time_secondes": "4769.058561259182",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.1373762376237624,
35
- "codegen_pass@1:16_stderr": 0.022884162024885638
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.1373762376237624,
39
- "codegen_pass@1:16_stderr": 0.022884162024885638
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "27c2f3670bc0b1b6"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "989470b0bec8abc3"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 666848.134926491,
8
- "end_time": 672038.016941095,
9
- "total_evaluation_time_secondes": "5189.8820146040525",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.4666666666666667,
35
- "math_pass@1:1_samples_stderr": 0.09264111117062017,
36
- "math_pass@1:4_samples": 0.31666666666666665,
37
- "math_pass@1:4_samples_stderr": 0.06447548466359414,
38
- "math_pass@1:8_samples": 0.3416666666666667,
39
- "math_pass@1:8_samples_stderr": 0.06369076392980136,
40
- "math_pass@1:16_samples": 0.36666666666666664,
41
- "math_pass@1:16_samples_stderr": 0.06269605266009247,
42
- "math_pass@1:32_samples": 0.35625,
43
- "math_pass@1:32_samples_stderr": 0.06415901581496421,
44
- "math_pass@1:64_samples": 0.3536458333333333,
45
- "math_pass@1:64_samples_stderr": 0.0623673310546343
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.4666666666666667,
49
- "math_pass@1:1_samples_stderr": 0.09264111117062017,
50
- "math_pass@1:4_samples": 0.31666666666666665,
51
- "math_pass@1:4_samples_stderr": 0.06447548466359414,
52
- "math_pass@1:8_samples": 0.3416666666666667,
53
- "math_pass@1:8_samples_stderr": 0.06369076392980136,
54
- "math_pass@1:16_samples": 0.36666666666666664,
55
- "math_pass@1:16_samples_stderr": 0.06269605266009247,
56
- "math_pass@1:32_samples": 0.35625,
57
- "math_pass@1:32_samples_stderr": 0.06415901581496421,
58
- "math_pass@1:64_samples": 0.3536458333333333,
59
- "math_pass@1:64_samples_stderr": 0.0623673310546343
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "db9f011765ce1405"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "99e409b11cfeb8fe"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 2310674.72759561,
8
- "end_time": 2314614.622140041,
9
- "total_evaluation_time_secondes": "3939.8945444310084",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.4696969696969697,
35
- "gpqa_pass@1:1_samples_stderr": 0.03555804051763929,
36
- "gpqa_pass@1:4_samples": 0.4734848484848485,
37
- "gpqa_pass@1:4_samples_stderr": 0.02495871351378464,
38
- "gpqa_pass@1:8_samples": 0.47474747474747475,
39
- "gpqa_pass@1:8_samples_stderr": 0.02386611685973234
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.4696969696969697,
43
- "gpqa_pass@1:1_samples_stderr": 0.03555804051763929,
44
- "gpqa_pass@1:4_samples": 0.4734848484848485,
45
- "gpqa_pass@1:4_samples_stderr": 0.02495871351378464,
46
- "gpqa_pass@1:8_samples": 0.47474747474747475,
47
- "gpqa_pass@1:8_samples_stderr": 0.02386611685973234
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "e267619bde58d2f4"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "25cf5bcfcac50bfe"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1790406.352589241,
8
- "end_time": 1797453.289874955,
9
- "total_evaluation_time_secondes": "7046.937285714084",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.12933168316831684,
35
- "codegen_pass@1:16_stderr": 0.021057614597480553
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.12933168316831684,
39
- "codegen_pass@1:16_stderr": 0.021057614597480553
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "5cb3b09dbd0d8421"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "3b6e90f800171033"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 667672.719953287,
8
- "end_time": 673240.688099726,
9
- "total_evaluation_time_secondes": "5567.968146438943",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.4666666666666667,
35
- "math_pass@1:1_samples_stderr": 0.09264111117062017,
36
- "math_pass@1:4_samples": 0.4,
37
- "math_pass@1:4_samples_stderr": 0.06297235299224026,
38
- "math_pass@1:8_samples": 0.3958333333333333,
39
- "math_pass@1:8_samples_stderr": 0.05874750341153335,
40
- "math_pass@1:16_samples": 0.40208333333333335,
41
- "math_pass@1:16_samples_stderr": 0.059457809263169785,
42
- "math_pass@1:32_samples": 0.3968749999999999,
43
- "math_pass@1:32_samples_stderr": 0.060234627158371676,
44
- "math_pass@1:64_samples": 0.3989583333333334,
45
- "math_pass@1:64_samples_stderr": 0.058255348631323874
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.4666666666666667,
49
- "math_pass@1:1_samples_stderr": 0.09264111117062017,
50
- "math_pass@1:4_samples": 0.4,
51
- "math_pass@1:4_samples_stderr": 0.06297235299224026,
52
- "math_pass@1:8_samples": 0.3958333333333333,
53
- "math_pass@1:8_samples_stderr": 0.05874750341153335,
54
- "math_pass@1:16_samples": 0.40208333333333335,
55
- "math_pass@1:16_samples_stderr": 0.059457809263169785,
56
- "math_pass@1:32_samples": 0.3968749999999999,
57
- "math_pass@1:32_samples_stderr": 0.060234627158371676,
58
- "math_pass@1:64_samples": 0.3989583333333334,
59
- "math_pass@1:64_samples_stderr": 0.058255348631323874
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "3c721c7a9344ad5e"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "316677ce6ea763a4"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1038687.737303237,
8
- "end_time": 1042926.270903001,
9
- "total_evaluation_time_secondes": "4238.533599763992",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.41919191919191917,
35
- "gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
36
- "gpqa_pass@1:4_samples": 0.4911616161616162,
37
- "gpqa_pass@1:4_samples_stderr": 0.026149437445442125,
38
- "gpqa_pass@1:8_samples": 0.49747474747474746,
39
- "gpqa_pass@1:8_samples_stderr": 0.02373146147193087
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.41919191919191917,
43
- "gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
44
- "gpqa_pass@1:4_samples": 0.4911616161616162,
45
- "gpqa_pass@1:4_samples_stderr": 0.026149437445442125,
46
- "gpqa_pass@1:8_samples": 0.49747474747474746,
47
- "gpqa_pass@1:8_samples_stderr": 0.02373146147193087
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "5f36388c28f23b2f"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "b4ff756ebb6cb76f"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 195649.240633843,
8
- "end_time": 203014.740776724,
9
- "total_evaluation_time_secondes": "7365.5001428810065",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.12995049504950495,
35
- "codegen_pass@1:16_stderr": 0.019894585442980322
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.12995049504950495,
39
- "codegen_pass@1:16_stderr": 0.019894585442980322
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "058943720bd4b8b5"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "a230e53cef0867b7"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 3515687.21682791,
8
- "end_time": 3535561.065907141,
9
- "total_evaluation_time_secondes": "19873.849079231266",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.26666666666666666,
35
- "math_pass@1:1_samples_stderr": 0.0821175682735253,
36
- "math_pass@1:4_samples": 0.3416666666666667,
37
- "math_pass@1:4_samples_stderr": 0.07140154113622724,
38
- "math_pass@1:8_samples": 0.32916666666666666,
39
- "math_pass@1:8_samples_stderr": 0.05947668495839227,
40
- "math_pass@1:16_samples": 0.3333333333333333,
41
- "math_pass@1:16_samples_stderr": 0.06099725984675264,
42
- "math_pass@1:32_samples": 0.3510416666666666,
43
- "math_pass@1:32_samples_stderr": 0.06140717235446427,
44
- "math_pass@1:64_samples": 0.3567708333333333,
45
- "math_pass@1:64_samples_stderr": 0.06195924170213546
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.26666666666666666,
49
- "math_pass@1:1_samples_stderr": 0.0821175682735253,
50
- "math_pass@1:4_samples": 0.3416666666666667,
51
- "math_pass@1:4_samples_stderr": 0.07140154113622724,
52
- "math_pass@1:8_samples": 0.32916666666666666,
53
- "math_pass@1:8_samples_stderr": 0.05947668495839227,
54
- "math_pass@1:16_samples": 0.3333333333333333,
55
- "math_pass@1:16_samples_stderr": 0.06099725984675264,
56
- "math_pass@1:32_samples": 0.3510416666666666,
57
- "math_pass@1:32_samples_stderr": 0.06140717235446427,
58
- "math_pass@1:64_samples": 0.3567708333333333,
59
- "math_pass@1:64_samples_stderr": 0.06195924170213546
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "1d3092217357c073"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "2c87a9a122dd4f20"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 3515687.21683147,
8
- "end_time": 3531824.923517652,
9
- "total_evaluation_time_secondes": "16137.706686181948",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.4797979797979798,
35
- "gpqa_pass@1:1_samples_stderr": 0.035594435655639196,
36
- "gpqa_pass@1:4_samples": 0.4684343434343434,
37
- "gpqa_pass@1:4_samples_stderr": 0.026426494324412967,
38
- "gpqa_pass@1:8_samples": 0.461489898989899,
39
- "gpqa_pass@1:8_samples_stderr": 0.024035774175898385
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.4797979797979798,
43
- "gpqa_pass@1:1_samples_stderr": 0.035594435655639196,
44
- "gpqa_pass@1:4_samples": 0.4684343434343434,
45
- "gpqa_pass@1:4_samples_stderr": 0.026426494324412967,
46
- "gpqa_pass@1:8_samples": 0.461489898989899,
47
- "gpqa_pass@1:8_samples_stderr": 0.024035774175898385
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "e7d8965b0240d400"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "17f24ae7e354a0f4"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 3515687.21683316,
8
- "end_time": 3535202.419424415,
9
- "total_evaluation_time_secondes": "19515.202591255307",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.14913366336633663,
35
- "codegen_pass@1:16_stderr": 0.022127176170976755
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.14913366336633663,
39
- "codegen_pass@1:16_stderr": 0.022127176170976755
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "8c430f2397c9f2d6"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "737d8b05e5405367"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1459863.759389219,
8
- "end_time": 1464656.175600588,
9
- "total_evaluation_time_secondes": "4792.416211368982",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.3333333333333333,
35
- "math_pass@1:1_samples_stderr": 0.08753762190648169,
36
- "math_pass@1:4_samples": 0.375,
37
- "math_pass@1:4_samples_stderr": 0.07266479487202247,
38
- "math_pass@1:8_samples": 0.36666666666666664,
39
- "math_pass@1:8_samples_stderr": 0.06752323284011273,
40
- "math_pass@1:16_samples": 0.3458333333333333,
41
- "math_pass@1:16_samples_stderr": 0.0651543628329568,
42
- "math_pass@1:32_samples": 0.35625,
43
- "math_pass@1:32_samples_stderr": 0.06644516903813058,
44
- "math_pass@1:64_samples": 0.3447916666666667,
45
- "math_pass@1:64_samples_stderr": 0.06668770995112147
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.3333333333333333,
49
- "math_pass@1:1_samples_stderr": 0.08753762190648169,
50
- "math_pass@1:4_samples": 0.375,
51
- "math_pass@1:4_samples_stderr": 0.07266479487202247,
52
- "math_pass@1:8_samples": 0.36666666666666664,
53
- "math_pass@1:8_samples_stderr": 0.06752323284011273,
54
- "math_pass@1:16_samples": 0.3458333333333333,
55
- "math_pass@1:16_samples_stderr": 0.0651543628329568,
56
- "math_pass@1:32_samples": 0.35625,
57
- "math_pass@1:32_samples_stderr": 0.06644516903813058,
58
- "math_pass@1:64_samples": 0.3447916666666667,
59
- "math_pass@1:64_samples_stderr": 0.06668770995112147
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "f1a35fbe9cb5d79d"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "9a1149edaf877e78"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 1459863.759393328,
8
- "end_time": 1462431.22071959,
9
- "total_evaluation_time_secondes": "2567.4613262619823",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.4494949494949495,
35
- "gpqa_pass@1:1_samples_stderr": 0.0354413249194797,
36
- "gpqa_pass@1:4_samples": 0.45580808080808083,
37
- "gpqa_pass@1:4_samples_stderr": 0.025905050629807098,
38
- "gpqa_pass@1:8_samples": 0.4659090909090909,
39
- "gpqa_pass@1:8_samples_stderr": 0.023860690969531115
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.4494949494949495,
43
- "gpqa_pass@1:1_samples_stderr": 0.0354413249194797,
44
- "gpqa_pass@1:4_samples": 0.45580808080808083,
45
- "gpqa_pass@1:4_samples_stderr": 0.025905050629807098,
46
- "gpqa_pass@1:8_samples": 0.4659090909090909,
47
- "gpqa_pass@1:8_samples_stderr": 0.023860690969531115
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "95e4a38cf316bba8"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "20add5806d35746d"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 4549324.508904906,
8
- "end_time": 4556098.186270666,
9
- "total_evaluation_time_secondes": "6773.677365760319",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.28898514851485146,
35
- "codegen_pass@1:16_stderr": 0.031818486170004524
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.28898514851485146,
39
- "codegen_pass@1:16_stderr": 0.031818486170004524
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "286a12cf9de34606"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "22a2678d6edd498b"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 2232253.173363435,
8
- "end_time": 2237347.560457066,
9
- "total_evaluation_time_secondes": "5094.387093631085",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.3333333333333333,
35
- "math_pass@1:1_samples_stderr": 0.08753762190648172,
36
- "math_pass@1:4_samples": 0.36666666666666664,
37
- "math_pass@1:4_samples_stderr": 0.07459982640473675,
38
- "math_pass@1:8_samples": 0.3625,
39
- "math_pass@1:8_samples_stderr": 0.06842596048990117,
40
- "math_pass@1:16_samples": 0.3729166666666667,
41
- "math_pass@1:16_samples_stderr": 0.06849700926155665,
42
- "math_pass@1:32_samples": 0.3625,
43
- "math_pass@1:32_samples_stderr": 0.06721777418709032,
44
- "math_pass@1:64_samples": 0.3661458333333334,
45
- "math_pass@1:64_samples_stderr": 0.06797451270875941
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.3333333333333333,
49
- "math_pass@1:1_samples_stderr": 0.08753762190648172,
50
- "math_pass@1:4_samples": 0.36666666666666664,
51
- "math_pass@1:4_samples_stderr": 0.07459982640473675,
52
- "math_pass@1:8_samples": 0.3625,
53
- "math_pass@1:8_samples_stderr": 0.06842596048990117,
54
- "math_pass@1:16_samples": 0.3729166666666667,
55
- "math_pass@1:16_samples_stderr": 0.06849700926155665,
56
- "math_pass@1:32_samples": 0.3625,
57
- "math_pass@1:32_samples_stderr": 0.06721777418709032,
58
- "math_pass@1:64_samples": 0.3661458333333334,
59
- "math_pass@1:64_samples_stderr": 0.06797451270875941
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "414e787f40171ca1"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "ddef51b0406c3982"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 2232253.173368685,
8
- "end_time": 2234788.75330911,
9
- "total_evaluation_time_secondes": "2535.579940425232",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.47474747474747475,
35
- "gpqa_pass@1:1_samples_stderr": 0.035578062450873145,
36
- "gpqa_pass@1:4_samples": 0.4671717171717172,
37
- "gpqa_pass@1:4_samples_stderr": 0.02577398329081247,
38
- "gpqa_pass@1:8_samples": 0.46275252525252525,
39
- "gpqa_pass@1:8_samples_stderr": 0.023284154380105133
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.47474747474747475,
43
- "gpqa_pass@1:1_samples_stderr": 0.035578062450873145,
44
- "gpqa_pass@1:4_samples": 0.4671717171717172,
45
- "gpqa_pass@1:4_samples_stderr": 0.02577398329081247,
46
- "gpqa_pass@1:8_samples": 0.46275252525252525,
47
- "gpqa_pass@1:8_samples_stderr": 0.023284154380105133
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "e1585b875c6c5ab7"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "931f077715c52375"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 2232253.173378256,
8
- "end_time": 2239358.756926505,
9
- "total_evaluation_time_secondes": "7105.5835482487455",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.31064356435643564,
35
- "codegen_pass@1:16_stderr": 0.03401113048493175
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.31064356435643564,
39
- "codegen_pass@1:16_stderr": 0.03401113048493175
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "53e4514580daf13c"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "181ee4063ef7c525"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json DELETED
@@ -1,174 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 45759.384835112,
8
- "end_time": 50908.194217762,
9
- "total_evaluation_time_secondes": "5148.809382649997",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|aime24|0": {
34
- "math_pass@1:1_samples": 0.4666666666666667,
35
- "math_pass@1:1_samples_stderr": 0.09264111117062017,
36
- "math_pass@1:4_samples": 0.4083333333333333,
37
- "math_pass@1:4_samples_stderr": 0.07904179504530925,
38
- "math_pass@1:8_samples": 0.4,
39
- "math_pass@1:8_samples_stderr": 0.0725163477407348,
40
- "math_pass@1:16_samples": 0.4,
41
- "math_pass@1:16_samples_stderr": 0.06999589478931119,
42
- "math_pass@1:32_samples": 0.384375,
43
- "math_pass@1:32_samples_stderr": 0.06687540283475192,
44
- "math_pass@1:64_samples": 0.3895833333333334,
45
- "math_pass@1:64_samples_stderr": 0.06801268906639514
46
- },
47
- "all": {
48
- "math_pass@1:1_samples": 0.4666666666666667,
49
- "math_pass@1:1_samples_stderr": 0.09264111117062017,
50
- "math_pass@1:4_samples": 0.4083333333333333,
51
- "math_pass@1:4_samples_stderr": 0.07904179504530925,
52
- "math_pass@1:8_samples": 0.4,
53
- "math_pass@1:8_samples_stderr": 0.0725163477407348,
54
- "math_pass@1:16_samples": 0.4,
55
- "math_pass@1:16_samples_stderr": 0.06999589478931119,
56
- "math_pass@1:32_samples": 0.384375,
57
- "math_pass@1:32_samples_stderr": 0.06687540283475192,
58
- "math_pass@1:64_samples": 0.3895833333333334,
59
- "math_pass@1:64_samples_stderr": 0.06801268906639514
60
- }
61
- },
62
- "versions": {
63
- "lighteval|aime24|0": 2
64
- },
65
- "config_tasks": {
66
- "lighteval|aime24": {
67
- "name": "aime24",
68
- "prompt_function": "aime_prompt_fn",
69
- "hf_repo": "HuggingFaceH4/aime_2024",
70
- "hf_subset": "default",
71
- "metric": [
72
- {
73
- "metric_name": "math_pass@1:1_samples",
74
- "higher_is_better": true,
75
- "category": "5",
76
- "use_case": "6",
77
- "sample_level_fn": "compute",
78
- "corpus_level_fn": "mean"
79
- },
80
- {
81
- "metric_name": "math_pass@1:4_samples",
82
- "higher_is_better": true,
83
- "category": "5",
84
- "use_case": "6",
85
- "sample_level_fn": "compute",
86
- "corpus_level_fn": "mean"
87
- },
88
- {
89
- "metric_name": "math_pass@1:8_samples",
90
- "higher_is_better": true,
91
- "category": "5",
92
- "use_case": "6",
93
- "sample_level_fn": "compute",
94
- "corpus_level_fn": "mean"
95
- },
96
- {
97
- "metric_name": "math_pass@1:16_samples",
98
- "higher_is_better": true,
99
- "category": "5",
100
- "use_case": "6",
101
- "sample_level_fn": "compute",
102
- "corpus_level_fn": "mean"
103
- },
104
- {
105
- "metric_name": "math_pass@1:32_samples",
106
- "higher_is_better": true,
107
- "category": "5",
108
- "use_case": "6",
109
- "sample_level_fn": "compute",
110
- "corpus_level_fn": "mean"
111
- },
112
- {
113
- "metric_name": "math_pass@1:64_samples",
114
- "higher_is_better": true,
115
- "category": "5",
116
- "use_case": "6",
117
- "sample_level_fn": "compute",
118
- "corpus_level_fn": "mean"
119
- }
120
- ],
121
- "hf_revision": null,
122
- "hf_filter": null,
123
- "hf_avail_splits": [
124
- "train"
125
- ],
126
- "trust_dataset": false,
127
- "evaluation_splits": [
128
- "train"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 32768,
133
- "generation_grammar": null,
134
- "stop_sequence": [],
135
- "num_samples": null,
136
- "suite": [
137
- "lighteval"
138
- ],
139
- "original_num_docs": 30,
140
- "effective_num_docs": 30,
141
- "must_remove_duplicate_docs": false,
142
- "version": 2
143
- }
144
- },
145
- "summary_tasks": {
146
- "lighteval|aime24|0": {
147
- "hashes": {
148
- "hash_examples": "ddec8fc79d0a014b",
149
- "hash_full_prompts": "d1829811f23cf34b",
150
- "hash_input_tokens": "7211f832bf7f8d79",
151
- "hash_cont_tokens": "6539c0aead2e735a"
152
- },
153
- "truncated": 0,
154
- "non_truncated": 30,
155
- "padded": 0,
156
- "non_padded": 30,
157
- "effective_few_shots": 0.0,
158
- "num_truncated_few_shots": 0
159
- }
160
- },
161
- "summary_general": {
162
- "hashes": {
163
- "hash_examples": "c903e836a519cf98",
164
- "hash_full_prompts": "09fe8694776a7143",
165
- "hash_input_tokens": "b52bc353fe82900e",
166
- "hash_cont_tokens": "42ffa955b6687226"
167
- },
168
- "truncated": 0,
169
- "non_truncated": 30,
170
- "padded": 0,
171
- "non_padded": 30,
172
- "num_truncated_few_shots": 0
173
- }
174
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json DELETED
@@ -1,138 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 45759.384842682,
8
- "end_time": 48213.678803292,
9
- "total_evaluation_time_secondes": "2454.2939606100044",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "lighteval|gpqa:diamond|0": {
34
- "gpqa_pass@1:1_samples": 0.494949494949495,
35
- "gpqa_pass@1:1_samples_stderr": 0.035621707606254015,
36
- "gpqa_pass@1:4_samples": 0.48737373737373735,
37
- "gpqa_pass@1:4_samples_stderr": 0.02574005112363341,
38
- "gpqa_pass@1:8_samples": 0.4722222222222222,
39
- "gpqa_pass@1:8_samples_stderr": 0.023598609724334477
40
- },
41
- "all": {
42
- "gpqa_pass@1:1_samples": 0.494949494949495,
43
- "gpqa_pass@1:1_samples_stderr": 0.035621707606254015,
44
- "gpqa_pass@1:4_samples": 0.48737373737373735,
45
- "gpqa_pass@1:4_samples_stderr": 0.02574005112363341,
46
- "gpqa_pass@1:8_samples": 0.4722222222222222,
47
- "gpqa_pass@1:8_samples_stderr": 0.023598609724334477
48
- }
49
- },
50
- "versions": {
51
- "lighteval|gpqa:diamond|0": 1
52
- },
53
- "config_tasks": {
54
- "lighteval|gpqa:diamond": {
55
- "name": "gpqa:diamond",
56
- "prompt_function": "gpqa_instruct",
57
- "hf_repo": "Idavidrein/gpqa",
58
- "hf_subset": "gpqa_diamond",
59
- "metric": [
60
- {
61
- "metric_name": "gpqa_pass@1:1_samples",
62
- "higher_is_better": true,
63
- "category": "5",
64
- "use_case": "6",
65
- "sample_level_fn": "compute",
66
- "corpus_level_fn": "mean"
67
- },
68
- {
69
- "metric_name": "gpqa_pass@1:4_samples",
70
- "higher_is_better": true,
71
- "category": "5",
72
- "use_case": "6",
73
- "sample_level_fn": "compute",
74
- "corpus_level_fn": "mean"
75
- },
76
- {
77
- "metric_name": "gpqa_pass@1:8_samples",
78
- "higher_is_better": true,
79
- "category": "5",
80
- "use_case": "6",
81
- "sample_level_fn": "compute",
82
- "corpus_level_fn": "mean"
83
- }
84
- ],
85
- "hf_revision": null,
86
- "hf_filter": null,
87
- "hf_avail_splits": [
88
- "train"
89
- ],
90
- "trust_dataset": true,
91
- "evaluation_splits": [
92
- "train"
93
- ],
94
- "few_shots_split": null,
95
- "few_shots_select": null,
96
- "generation_size": 32768,
97
- "generation_grammar": null,
98
- "stop_sequence": [],
99
- "num_samples": null,
100
- "suite": [
101
- "lighteval"
102
- ],
103
- "original_num_docs": 198,
104
- "effective_num_docs": 198,
105
- "must_remove_duplicate_docs": false,
106
- "version": 1
107
- }
108
- },
109
- "summary_tasks": {
110
- "lighteval|gpqa:diamond|0": {
111
- "hashes": {
112
- "hash_examples": "50ecb6f5d091bd95",
113
- "hash_full_prompts": "390ed6852fd81cbb",
114
- "hash_input_tokens": "fa33490cb78df6ed",
115
- "hash_cont_tokens": "37b107e03a2fa405"
116
- },
117
- "truncated": 0,
118
- "non_truncated": 198,
119
- "padded": 0,
120
- "non_padded": 198,
121
- "effective_few_shots": 0.0,
122
- "num_truncated_few_shots": 0
123
- }
124
- },
125
- "summary_general": {
126
- "hashes": {
127
- "hash_examples": "a9318dbdd867770b",
128
- "hash_full_prompts": "d039833a1bb27d62",
129
- "hash_input_tokens": "c464503d38a8a664",
130
- "hash_cont_tokens": "6503080bbaae38cf"
131
- },
132
- "truncated": 0,
133
- "non_truncated": 198,
134
- "padded": 0,
135
- "non_padded": 198,
136
- "num_truncated_few_shots": 0
137
- }
138
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json DELETED
@@ -1,114 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "max_samples": null,
6
- "job_id": 0,
7
- "start_time": 3757625.063178942,
8
- "end_time": 3764306.98508252,
9
- "total_evaluation_time_secondes": "6681.921903578099",
10
- "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
11
- "model_sha": "",
12
- "model_dtype": null,
13
- "model_size": null,
14
- "generation_parameters": {
15
- "early_stopping": null,
16
- "repetition_penalty": null,
17
- "frequency_penalty": null,
18
- "length_penalty": null,
19
- "presence_penalty": null,
20
- "max_new_tokens": 32768,
21
- "min_new_tokens": null,
22
- "seed": null,
23
- "stop_tokens": null,
24
- "temperature": 0.6,
25
- "top_k": null,
26
- "min_p": null,
27
- "top_p": 0.95,
28
- "truncate_prompt": null,
29
- "response_format": null
30
- }
31
- },
32
- "results": {
33
- "extended|lcb:codegeneration_v4|0": {
34
- "codegen_pass@1:16": 0.3087871287128713,
35
- "codegen_pass@1:16_stderr": 0.03410613555508327
36
- },
37
- "all": {
38
- "codegen_pass@1:16": 0.3087871287128713,
39
- "codegen_pass@1:16_stderr": 0.03410613555508327
40
- }
41
- },
42
- "versions": {
43
- "extended|lcb:codegeneration_v4|0": 0
44
- },
45
- "config_tasks": {
46
- "extended|lcb:codegeneration_v4": {
47
- "name": "lcb:codegeneration_v4",
48
- "prompt_function": "lcb_codegeneration_prompt_fn",
49
- "hf_repo": "livecodebench/code_generation_lite",
50
- "hf_subset": "v4",
51
- "metric": [
52
- {
53
- "metric_name": "codegen_pass@1:16",
54
- "higher_is_better": true,
55
- "category": "5",
56
- "use_case": "6",
57
- "sample_level_fn": "codegen_metric",
58
- "corpus_level_fn": "mean"
59
- }
60
- ],
61
- "hf_revision": null,
62
- "hf_filter": null,
63
- "hf_avail_splits": [
64
- "test"
65
- ],
66
- "trust_dataset": true,
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 32768,
73
- "generation_grammar": null,
74
- "stop_sequence": [],
75
- "num_samples": null,
76
- "suite": [
77
- "extended"
78
- ],
79
- "original_num_docs": 101,
80
- "effective_num_docs": 101,
81
- "must_remove_duplicate_docs": false,
82
- "version": 0
83
- }
84
- },
85
- "summary_tasks": {
86
- "extended|lcb:codegeneration_v4|0": {
87
- "hashes": {
88
- "hash_examples": "e7a7c74abad0ab2c",
89
- "hash_full_prompts": "94b75660a771bba7",
90
- "hash_input_tokens": "e3b779035e44a0d6",
91
- "hash_cont_tokens": "fd6fcb680afd951f"
92
- },
93
- "truncated": 0,
94
- "non_truncated": 101,
95
- "padded": 0,
96
- "non_padded": 101,
97
- "effective_few_shots": 0.0,
98
- "num_truncated_few_shots": 0
99
- }
100
- },
101
- "summary_general": {
102
- "hashes": {
103
- "hash_examples": "1e6aa5adc63aa1b2",
104
- "hash_full_prompts": "3f758d56940d66a4",
105
- "hash_input_tokens": "dcc90dd35f661f20",
106
- "hash_cont_tokens": "4f8fb4bb608ae2ab"
107
- },
108
- "truncated": 0,
109
- "non_truncated": 101,
110
- "padded": 0,
111
- "non_padded": 101,
112
- "num_truncated_few_shots": 0
113
- }
114
- }