Remove false merge evals
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json +0 -114
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json +0 -174
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json +0 -138
- eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json +0 -114
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4229414.83378225,
|
8 |
-
"end_time": 4233704.73810174,
|
9 |
-
"total_evaluation_time_secondes": "4289.904319490306",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.3,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.0850962943396763,
|
36 |
-
"math_pass@1:4_samples": 0.30833333333333335,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.06639672359547992,
|
38 |
-
"math_pass@1:8_samples": 0.32083333333333336,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.0663831976271335,
|
40 |
-
"math_pass@1:16_samples": 0.3375,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06514517386699431,
|
42 |
-
"math_pass@1:32_samples": 0.34375,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06482735693283008,
|
44 |
-
"math_pass@1:64_samples": 0.34010416666666665,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06450050595768915
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.3,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.0850962943396763,
|
50 |
-
"math_pass@1:4_samples": 0.30833333333333335,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.06639672359547992,
|
52 |
-
"math_pass@1:8_samples": 0.32083333333333336,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.0663831976271335,
|
54 |
-
"math_pass@1:16_samples": 0.3375,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06514517386699431,
|
56 |
-
"math_pass@1:32_samples": 0.34375,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06482735693283008,
|
58 |
-
"math_pass@1:64_samples": 0.34010416666666665,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06450050595768915
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "a06d62ae60da2e78"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "15d38a88ca396a91"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 255139.778479921,
|
8 |
-
"end_time": 257187.860357864,
|
9 |
-
"total_evaluation_time_secondes": "2048.081877943012",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.035212249088415824,
|
36 |
-
"gpqa_pass@1:4_samples": 0.4595959595959596,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.026456334635227167,
|
38 |
-
"gpqa_pass@1:8_samples": 0.4659090909090909,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.024944118627141743
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.035212249088415824,
|
44 |
-
"gpqa_pass@1:4_samples": 0.4595959595959596,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.026456334635227167,
|
46 |
-
"gpqa_pass@1:8_samples": 0.4659090909090909,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.024944118627141743
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "e71088c1c1cb0672"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "82854e16d8413332"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 255139.778483932,
|
8 |
-
"end_time": 260156.247424606,
|
9 |
-
"total_evaluation_time_secondes": "5016.468940674007",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.1547029702970297,
|
35 |
-
"codegen_pass@1:16_stderr": 0.025710721439537176
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.1547029702970297,
|
39 |
-
"codegen_pass@1:16_stderr": 0.025710721439537176
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "d5c1f33cd610d8c2"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "4fd58e23485de17c"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4203294.857831319,
|
8 |
-
"end_time": 4209677.501601624,
|
9 |
-
"total_evaluation_time_secondes": "6382.64377030544",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.0875376219064817,
|
36 |
-
"math_pass@1:4_samples": 0.35,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.07050719373096685,
|
38 |
-
"math_pass@1:8_samples": 0.3375,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.06486889962016494,
|
40 |
-
"math_pass@1:16_samples": 0.3541666666666667,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06415435021369162,
|
42 |
-
"math_pass@1:32_samples": 0.33645833333333336,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06365109759161156,
|
44 |
-
"math_pass@1:64_samples": 0.3302083333333334,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06445720210168741
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.0875376219064817,
|
50 |
-
"math_pass@1:4_samples": 0.35,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.07050719373096685,
|
52 |
-
"math_pass@1:8_samples": 0.3375,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.06486889962016494,
|
54 |
-
"math_pass@1:16_samples": 0.3541666666666667,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06415435021369162,
|
56 |
-
"math_pass@1:32_samples": 0.33645833333333336,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06365109759161156,
|
58 |
-
"math_pass@1:64_samples": 0.3302083333333334,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06445720210168741
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "0e5a5f1153950a6a"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "32344497527b5d97"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4203294.857834889,
|
8 |
-
"end_time": 4206332.097575001,
|
9 |
-
"total_evaluation_time_secondes": "3037.2397401127964",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
|
36 |
-
"gpqa_pass@1:4_samples": 0.4621212121212121,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.025425623214011414,
|
38 |
-
"gpqa_pass@1:8_samples": 0.442550505050505,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.02303993298372008
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
|
44 |
-
"gpqa_pass@1:4_samples": 0.4621212121212121,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.025425623214011414,
|
46 |
-
"gpqa_pass@1:8_samples": 0.442550505050505,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.02303993298372008
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "7af1a756034c5bbc"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "6eb8ae5f37259377"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4229414.83378778,
|
8 |
-
"end_time": 4236185.516046914,
|
9 |
-
"total_evaluation_time_secondes": "6770.682259134017",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.19183168316831684,
|
35 |
-
"codegen_pass@1:16_stderr": 0.029331683168316847
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.19183168316831684,
|
39 |
-
"codegen_pass@1:16_stderr": 0.029331683168316847
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "9e98481ea25c6036"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "cbf7ec0857e47d2e"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 661654.831401038,
|
8 |
-
"end_time": 662374.12054711,
|
9 |
-
"total_evaluation_time_secondes": "719.2891460719984",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.29797979797979796,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03258630383836555,
|
36 |
-
"gpqa_pass@1:4_samples": 0.2828282828282828,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.020745116201625495,
|
38 |
-
"gpqa_pass@1:8_samples": 0.29419191919191917,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.017370591329565217
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.29797979797979796,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03258630383836555,
|
44 |
-
"gpqa_pass@1:4_samples": 0.2828282828282828,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.020745116201625495,
|
46 |
-
"gpqa_pass@1:8_samples": 0.29419191919191917,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.017370591329565217
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "266022d5f2e05638"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "60eaf58a791fe18e"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1727441.946810529,
|
8 |
-
"end_time": 1730120.952128118,
|
9 |
-
"total_evaluation_time_secondes": "2679.0053175888024",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.0024752475247524753,
|
35 |
-
"codegen_pass@1:16_stderr": 0.0015030888368182086
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.0024752475247524753,
|
39 |
-
"codegen_pass@1:16_stderr": 0.0015030888368182086
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "c9c2c4597e65c9d2"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "e1b3bd8b8c45b5b3"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4375790.455053379,
|
8 |
-
"end_time": 4393182.469945202,
|
9 |
-
"total_evaluation_time_secondes": "17392.014891822822",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.06666666666666667,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.046320555585310084,
|
36 |
-
"math_pass@1:4_samples": 0.041666666666666664,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.02104774721520688,
|
38 |
-
"math_pass@1:8_samples": 0.04583333333333333,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.022808653496633722,
|
40 |
-
"math_pass@1:16_samples": 0.05,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.020847696198419845,
|
42 |
-
"math_pass@1:32_samples": 0.053125,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.023507438752959405,
|
44 |
-
"math_pass@1:64_samples": 0.04791666666666667,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.02018476822495169
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.06666666666666667,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.046320555585310084,
|
50 |
-
"math_pass@1:4_samples": 0.041666666666666664,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.02104774721520688,
|
52 |
-
"math_pass@1:8_samples": 0.04583333333333333,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.022808653496633722,
|
54 |
-
"math_pass@1:16_samples": 0.05,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.020847696198419845,
|
56 |
-
"math_pass@1:32_samples": 0.053125,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.023507438752959405,
|
58 |
-
"math_pass@1:64_samples": 0.04791666666666667,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.02018476822495169
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "d19a02dc5ef5f1cc"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "75cbaf1a9fa10dae"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 662431.93175746,
|
8 |
-
"end_time": 663129.029831021,
|
9 |
-
"total_evaluation_time_secondes": "697.0980735609774",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.3383838383838384,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03371124142626305,
|
36 |
-
"gpqa_pass@1:4_samples": 0.31565656565656564,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.019948058896725632,
|
38 |
-
"gpqa_pass@1:8_samples": 0.3194444444444444,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.017458419547239812
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.3383838383838384,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03371124142626305,
|
44 |
-
"gpqa_pass@1:4_samples": 0.31565656565656564,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.019948058896725632,
|
46 |
-
"gpqa_pass@1:8_samples": 0.3194444444444444,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.017458419547239812
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "ec9269c9a892ac2b"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "916c86d0d9b7e51f"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1034056.23321307,
|
8 |
-
"end_time": 1036357.266139834,
|
9 |
-
"total_evaluation_time_secondes": "2301.03292676399",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.018564356435643563,
|
35 |
-
"codegen_pass@1:16_stderr": 0.006729419168462321
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.018564356435643563,
|
39 |
-
"codegen_pass@1:16_stderr": 0.006729419168462321
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "8cca433c93724a34"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "b35bed0f5fac56be"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4379509.827509476,
|
8 |
-
"end_time": 4386019.321696337,
|
9 |
-
"total_evaluation_time_secondes": "6509.4941868614405",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.1,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.055708601453115535,
|
36 |
-
"math_pass@1:4_samples": 0.10833333333333334,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.04433955534939854,
|
38 |
-
"math_pass@1:8_samples": 0.0875,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.036529589643742644,
|
40 |
-
"math_pass@1:16_samples": 0.07916666666666666,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.028100377685248996,
|
42 |
-
"math_pass@1:32_samples": 0.078125,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.028140959711881665,
|
44 |
-
"math_pass@1:64_samples": 0.0838541666666667,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.028796300226399354
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.1,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.055708601453115535,
|
50 |
-
"math_pass@1:4_samples": 0.10833333333333334,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.04433955534939854,
|
52 |
-
"math_pass@1:8_samples": 0.0875,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.036529589643742644,
|
54 |
-
"math_pass@1:16_samples": 0.07916666666666666,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.028100377685248996,
|
56 |
-
"math_pass@1:32_samples": 0.078125,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.028140959711881665,
|
58 |
-
"math_pass@1:64_samples": 0.0838541666666667,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.028796300226399354
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "002199bc178ae8cb"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "3e3c04aad02ace80"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4100452.1099866,
|
8 |
-
"end_time": 4101297.327114199,
|
9 |
-
"total_evaluation_time_secondes": "845.2171275988221",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.3383838383838384,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
|
36 |
-
"gpqa_pass@1:4_samples": 0.32954545454545453,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.021141754316903207,
|
38 |
-
"gpqa_pass@1:8_samples": 0.3402777777777778,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.019152531335331204
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.3383838383838384,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
|
44 |
-
"gpqa_pass@1:4_samples": 0.32954545454545453,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.021141754316903207,
|
46 |
-
"gpqa_pass@1:8_samples": 0.3402777777777778,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.019152531335331204
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "2c81421b70193adb"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "36d252e8d20884d0"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 663217.335620686,
|
8 |
-
"end_time": 665135.776709884,
|
9 |
-
"total_evaluation_time_secondes": "1918.4410891981097",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.04084158415841584,
|
35 |
-
"codegen_pass@1:16_stderr": 0.012008777587382542
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.04084158415841584,
|
39 |
-
"codegen_pass@1:16_stderr": 0.012008777587382542
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "769ba85444eb5c3a"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "f2e5e2faa5b55b56"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1786741.173175074,
|
8 |
-
"end_time": 1788880.427720995,
|
9 |
-
"total_evaluation_time_secondes": "2139.254545920994",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.06666666666666667,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.046320555585310084,
|
36 |
-
"math_pass@1:4_samples": 0.10833333333333334,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.04433955534939852,
|
38 |
-
"math_pass@1:8_samples": 0.12916666666666668,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.04812703695246316,
|
40 |
-
"math_pass@1:16_samples": 0.11666666666666667,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.04039733214513608,
|
42 |
-
"math_pass@1:32_samples": 0.10937499999999999,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.03777883642096245,
|
44 |
-
"math_pass@1:64_samples": 0.11250000000000003,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.038334816937806986
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.06666666666666667,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.046320555585310084,
|
50 |
-
"math_pass@1:4_samples": 0.10833333333333334,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.04433955534939852,
|
52 |
-
"math_pass@1:8_samples": 0.12916666666666668,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.04812703695246316,
|
54 |
-
"math_pass@1:16_samples": 0.11666666666666667,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.04039733214513608,
|
56 |
-
"math_pass@1:32_samples": 0.10937499999999999,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.03777883642096245,
|
58 |
-
"math_pass@1:64_samples": 0.11250000000000003,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.038334816937806986
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "390196d6efde02bb"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "f95079649d32b7fb"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1448505.258795178,
|
8 |
-
"end_time": 1449311.597771412,
|
9 |
-
"total_evaluation_time_secondes": "806.3389762339648",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.35353535353535354,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03406086723547153,
|
36 |
-
"gpqa_pass@1:4_samples": 0.34974747474747475,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.020933262798829153,
|
38 |
-
"gpqa_pass@1:8_samples": 0.34974747474747475,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.018901969553475017
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.35353535353535354,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03406086723547153,
|
44 |
-
"gpqa_pass@1:4_samples": 0.34974747474747475,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.020933262798829153,
|
46 |
-
"gpqa_pass@1:8_samples": 0.34974747474747475,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.018901969553475017
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "db8e9046047f49d2"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "9da8b4866f2c2766"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 139762.663140377,
|
8 |
-
"end_time": 141889.570223524,
|
9 |
-
"total_evaluation_time_secondes": "2126.9070831470017",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.05631188118811881,
|
35 |
-
"codegen_pass@1:16_stderr": 0.015118778650913064
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.05631188118811881,
|
39 |
-
"codegen_pass@1:16_stderr": 0.015118778650913064
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "63432947237b0029"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "65d78b27534d12ab"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 191200.869072273,
|
8 |
-
"end_time": 193465.549947105,
|
9 |
-
"total_evaluation_time_secondes": "2264.6808748320036",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.13333333333333333,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.06312427686319994,
|
36 |
-
"math_pass@1:4_samples": 0.15833333333333333,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.04716584419087129,
|
38 |
-
"math_pass@1:8_samples": 0.14583333333333331,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.04283561423329068,
|
40 |
-
"math_pass@1:16_samples": 0.16875,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.04634155275230052,
|
42 |
-
"math_pass@1:32_samples": 0.15729166666666664,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.04530991949978092,
|
44 |
-
"math_pass@1:64_samples": 0.15052083333333335,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.04492772455875969
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.13333333333333333,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.06312427686319994,
|
50 |
-
"math_pass@1:4_samples": 0.15833333333333333,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.04716584419087129,
|
52 |
-
"math_pass@1:8_samples": 0.14583333333333331,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.04283561423329068,
|
54 |
-
"math_pass@1:16_samples": 0.16875,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.04634155275230052,
|
56 |
-
"math_pass@1:32_samples": 0.15729166666666664,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.04530991949978092,
|
58 |
-
"math_pass@1:64_samples": 0.15052083333333335,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.04492772455875969
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "64bb8b614a09d95e"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "cec7cf7bfce92574"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4101347.219924198,
|
8 |
-
"end_time": 4102199.893680966,
|
9 |
-
"total_evaluation_time_secondes": "852.6737567679957",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.36363636363636365,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.034273086529999344,
|
36 |
-
"gpqa_pass@1:4_samples": 0.35858585858585856,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.023017398790703834,
|
38 |
-
"gpqa_pass@1:8_samples": 0.3491161616161616,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.019574428228765074
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.36363636363636365,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.034273086529999344,
|
44 |
-
"gpqa_pass@1:4_samples": 0.35858585858585856,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.023017398790703834,
|
46 |
-
"gpqa_pass@1:8_samples": 0.3491161616161616,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.019574428228765074
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "2d700f06d1a2b9e8"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "8165d55bdeb1c114"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1449399.391339998,
|
8 |
-
"end_time": 1451425.492734413,
|
9 |
-
"total_evaluation_time_secondes": "2026.1013944149017",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.07116336633663366,
|
35 |
-
"codegen_pass@1:16_stderr": 0.01817453760822587
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.07116336633663366,
|
39 |
-
"codegen_pass@1:16_stderr": 0.01817453760822587
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "91ac6717ee31129b"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "2405745051128cc6"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1730268.444026561,
|
8 |
-
"end_time": 1732185.82558593,
|
9 |
-
"total_evaluation_time_secondes": "1917.3815593689214",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.1,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.055708601453115555,
|
36 |
-
"math_pass@1:4_samples": 0.19166666666666668,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.05708852438851644,
|
38 |
-
"math_pass@1:8_samples": 0.17916666666666667,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.04894113478792151,
|
40 |
-
"math_pass@1:16_samples": 0.175,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.05053592099036909,
|
42 |
-
"math_pass@1:32_samples": 0.17291666666666664,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.04727993993552142,
|
44 |
-
"math_pass@1:64_samples": 0.17343750000000005,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.0472883475530281
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.1,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.055708601453115555,
|
50 |
-
"math_pass@1:4_samples": 0.19166666666666668,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.05708852438851644,
|
52 |
-
"math_pass@1:8_samples": 0.17916666666666667,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.04894113478792151,
|
54 |
-
"math_pass@1:16_samples": 0.175,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.05053592099036909,
|
56 |
-
"math_pass@1:32_samples": 0.17291666666666664,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.04727993993552142,
|
58 |
-
"math_pass@1:64_samples": 0.17343750000000005,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.0472883475530281
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "baad61c00784bbc6"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "1d2d658abf54d77a"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4102309.818978777,
|
8 |
-
"end_time": 4103334.357564577,
|
9 |
-
"total_evaluation_time_secondes": "1024.5385857997462",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.3383838383838384,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
|
36 |
-
"gpqa_pass@1:4_samples": 0.37626262626262624,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.023116332418314938,
|
38 |
-
"gpqa_pass@1:8_samples": 0.37941919191919193,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.020691066749223084
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.3383838383838384,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.033711241426263035,
|
44 |
-
"gpqa_pass@1:4_samples": 0.37626262626262624,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.023116332418314938,
|
46 |
-
"gpqa_pass@1:8_samples": 0.37941919191919193,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.020691066749223084
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "f67476bc22ade0f4"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "e01d5e8a161d251b"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 665261.659275858,
|
8 |
-
"end_time": 667567.554631831,
|
9 |
-
"total_evaluation_time_secondes": "2305.8953559730435",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.0983910891089109,
|
35 |
-
"codegen_pass@1:16_stderr": 0.01992689560651361
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.0983910891089109,
|
39 |
-
"codegen_pass@1:16_stderr": 0.01992689560651361
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "99f5478f5867964b"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "c6ae0af3cc4b2079"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1036513.965434865,
|
8 |
-
"end_time": 1038606.671981354,
|
9 |
-
"total_evaluation_time_secondes": "2092.706546489033",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.3,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.0850962943396763,
|
36 |
-
"math_pass@1:4_samples": 0.25833333333333336,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.060509490721976415,
|
38 |
-
"math_pass@1:8_samples": 0.23333333333333334,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.056136806073810275,
|
40 |
-
"math_pass@1:16_samples": 0.23541666666666666,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.05651544435100526,
|
42 |
-
"math_pass@1:32_samples": 0.23333333333333334,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.05619676067142738,
|
44 |
-
"math_pass@1:64_samples": 0.225,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.05496823468797815
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.3,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.0850962943396763,
|
50 |
-
"math_pass@1:4_samples": 0.25833333333333336,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.060509490721976415,
|
52 |
-
"math_pass@1:8_samples": 0.23333333333333334,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.056136806073810275,
|
54 |
-
"math_pass@1:16_samples": 0.23541666666666666,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.05651544435100526,
|
56 |
-
"math_pass@1:32_samples": 0.23333333333333334,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.05619676067142738,
|
58 |
-
"math_pass@1:64_samples": 0.225,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.05496823468797815
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "35294f76776e888f"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "1a1b4593f16e83ad"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1788957.695045191,
|
8 |
-
"end_time": 1790333.486523546,
|
9 |
-
"total_evaluation_time_secondes": "1375.7914783549495",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.41919191919191917,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
|
36 |
-
"gpqa_pass@1:4_samples": 0.38762626262626265,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.024117072654924694,
|
38 |
-
"gpqa_pass@1:8_samples": 0.3952020202020202,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.02069180007019445
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.41919191919191917,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
|
44 |
-
"gpqa_pass@1:4_samples": 0.38762626262626265,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.024117072654924694,
|
46 |
-
"gpqa_pass@1:8_samples": 0.3952020202020202,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.02069180007019445
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "ed5bee871908dfc5"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "9265af1b081a4052"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 142042.086601042,
|
8 |
-
"end_time": 144975.10104017,
|
9 |
-
"total_evaluation_time_secondes": "2933.01443912799",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.1219059405940594,
|
35 |
-
"codegen_pass@1:16_stderr": 0.02220405336387271
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.1219059405940594,
|
39 |
-
"codegen_pass@1:16_stderr": 0.02220405336387271
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "04c3944ff3ccf570"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "a7aea995096e87e1"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 193540.0209108,
|
8 |
-
"end_time": 195561.734842471,
|
9 |
-
"total_evaluation_time_secondes": "2021.7139316709945",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.08753762190648169,
|
36 |
-
"math_pass@1:4_samples": 0.26666666666666666,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.06447548466359415,
|
38 |
-
"math_pass@1:8_samples": 0.3,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.06608945522512666,
|
40 |
-
"math_pass@1:16_samples": 0.30833333333333335,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06537451054006056,
|
42 |
-
"math_pass@1:32_samples": 0.3,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06204214478791862,
|
44 |
-
"math_pass@1:64_samples": 0.28958333333333336,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06107266238666994
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.08753762190648169,
|
50 |
-
"math_pass@1:4_samples": 0.26666666666666666,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.06447548466359415,
|
52 |
-
"math_pass@1:8_samples": 0.3,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.06608945522512666,
|
54 |
-
"math_pass@1:16_samples": 0.30833333333333335,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06537451054006056,
|
56 |
-
"math_pass@1:32_samples": 0.3,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06204214478791862,
|
58 |
-
"math_pass@1:64_samples": 0.28958333333333336,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06107266238666994
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "9c99493d67fe2f24"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "bed1b477cfa0b1ff"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 666253.876774008,
|
8 |
-
"end_time": 668195.952495732,
|
9 |
-
"total_evaluation_time_secondes": "1942.0757217239588",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
|
36 |
-
"gpqa_pass@1:4_samples": 0.43308080808080807,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.024044484352700964,
|
38 |
-
"gpqa_pass@1:8_samples": 0.42803030303030304,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.020899215514001906
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03521224908841583,
|
44 |
-
"gpqa_pass@1:4_samples": 0.43308080808080807,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.024044484352700964,
|
46 |
-
"gpqa_pass@1:8_samples": 0.42803030303030304,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.020899215514001906
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "0124da80a3b3148c"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "640d6ef92615cdc9"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4103397.762373666,
|
8 |
-
"end_time": 4107093.315890551,
|
9 |
-
"total_evaluation_time_secondes": "3695.5535168852657",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.13242574257425743,
|
35 |
-
"codegen_pass@1:16_stderr": 0.023340448876772665
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.13242574257425743,
|
39 |
-
"codegen_pass@1:16_stderr": 0.023340448876772665
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "bbd21ba6aa3e9ebc"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "42b830bb8babc2e2"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 3105542.202297136,
|
8 |
-
"end_time": 3108662.362517458,
|
9 |
-
"total_evaluation_time_secondes": "3120.1602203217335",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.23333333333333334,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.07854032324531728,
|
36 |
-
"math_pass@1:4_samples": 0.2833333333333333,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.06645079993694987,
|
38 |
-
"math_pass@1:8_samples": 0.325,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.06581714481595306,
|
40 |
-
"math_pass@1:16_samples": 0.3375,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06596705464738117,
|
42 |
-
"math_pass@1:32_samples": 0.33333333333333326,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06514402515511274,
|
44 |
-
"math_pass@1:64_samples": 0.3302083333333333,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06416922063509661
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.23333333333333334,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.07854032324531728,
|
50 |
-
"math_pass@1:4_samples": 0.2833333333333333,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.06645079993694987,
|
52 |
-
"math_pass@1:8_samples": 0.325,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.06581714481595306,
|
54 |
-
"math_pass@1:16_samples": 0.3375,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06596705464738117,
|
56 |
-
"math_pass@1:32_samples": 0.33333333333333326,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06514402515511274,
|
58 |
-
"math_pass@1:64_samples": 0.3302083333333333,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06416922063509661
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "ad2c9153cc62d80c"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "39dba3b375f55f0a"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1451567.197422397,
|
8 |
-
"end_time": 1454324.521386077,
|
9 |
-
"total_evaluation_time_secondes": "2757.323963679839",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.47474747474747475,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03557806245087314,
|
36 |
-
"gpqa_pass@1:4_samples": 0.44065656565656564,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.02492886595604172,
|
38 |
-
"gpqa_pass@1:8_samples": 0.43813131313131315,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.022425998553299985
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.47474747474747475,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03557806245087314,
|
44 |
-
"gpqa_pass@1:4_samples": 0.44065656565656564,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.02492886595604172,
|
46 |
-
"gpqa_pass@1:8_samples": 0.43813131313131315,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.022425998553299985
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "53e37624af23b9ca"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "a1cb53db5b9766ea"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1732255.309611542,
|
8 |
-
"end_time": 1737024.368172801,
|
9 |
-
"total_evaluation_time_secondes": "4769.058561259182",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.1373762376237624,
|
35 |
-
"codegen_pass@1:16_stderr": 0.022884162024885638
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.1373762376237624,
|
39 |
-
"codegen_pass@1:16_stderr": 0.022884162024885638
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "27c2f3670bc0b1b6"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "989470b0bec8abc3"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 666848.134926491,
|
8 |
-
"end_time": 672038.016941095,
|
9 |
-
"total_evaluation_time_secondes": "5189.8820146040525",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.4666666666666667,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.09264111117062017,
|
36 |
-
"math_pass@1:4_samples": 0.31666666666666665,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.06447548466359414,
|
38 |
-
"math_pass@1:8_samples": 0.3416666666666667,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.06369076392980136,
|
40 |
-
"math_pass@1:16_samples": 0.36666666666666664,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06269605266009247,
|
42 |
-
"math_pass@1:32_samples": 0.35625,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06415901581496421,
|
44 |
-
"math_pass@1:64_samples": 0.3536458333333333,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.0623673310546343
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.4666666666666667,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.09264111117062017,
|
50 |
-
"math_pass@1:4_samples": 0.31666666666666665,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.06447548466359414,
|
52 |
-
"math_pass@1:8_samples": 0.3416666666666667,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.06369076392980136,
|
54 |
-
"math_pass@1:16_samples": 0.36666666666666664,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06269605266009247,
|
56 |
-
"math_pass@1:32_samples": 0.35625,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06415901581496421,
|
58 |
-
"math_pass@1:64_samples": 0.3536458333333333,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.0623673310546343
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "db9f011765ce1405"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "99e409b11cfeb8fe"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 2310674.72759561,
|
8 |
-
"end_time": 2314614.622140041,
|
9 |
-
"total_evaluation_time_secondes": "3939.8945444310084",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.4696969696969697,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.03555804051763929,
|
36 |
-
"gpqa_pass@1:4_samples": 0.4734848484848485,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.02495871351378464,
|
38 |
-
"gpqa_pass@1:8_samples": 0.47474747474747475,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.02386611685973234
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.4696969696969697,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.03555804051763929,
|
44 |
-
"gpqa_pass@1:4_samples": 0.4734848484848485,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.02495871351378464,
|
46 |
-
"gpqa_pass@1:8_samples": 0.47474747474747475,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.02386611685973234
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "e267619bde58d2f4"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "25cf5bcfcac50bfe"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1790406.352589241,
|
8 |
-
"end_time": 1797453.289874955,
|
9 |
-
"total_evaluation_time_secondes": "7046.937285714084",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.12933168316831684,
|
35 |
-
"codegen_pass@1:16_stderr": 0.021057614597480553
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.12933168316831684,
|
39 |
-
"codegen_pass@1:16_stderr": 0.021057614597480553
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "5cb3b09dbd0d8421"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "3b6e90f800171033"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 667672.719953287,
|
8 |
-
"end_time": 673240.688099726,
|
9 |
-
"total_evaluation_time_secondes": "5567.968146438943",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.4666666666666667,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.09264111117062017,
|
36 |
-
"math_pass@1:4_samples": 0.4,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.06297235299224026,
|
38 |
-
"math_pass@1:8_samples": 0.3958333333333333,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.05874750341153335,
|
40 |
-
"math_pass@1:16_samples": 0.40208333333333335,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.059457809263169785,
|
42 |
-
"math_pass@1:32_samples": 0.3968749999999999,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.060234627158371676,
|
44 |
-
"math_pass@1:64_samples": 0.3989583333333334,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.058255348631323874
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.4666666666666667,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.09264111117062017,
|
50 |
-
"math_pass@1:4_samples": 0.4,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.06297235299224026,
|
52 |
-
"math_pass@1:8_samples": 0.3958333333333333,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.05874750341153335,
|
54 |
-
"math_pass@1:16_samples": 0.40208333333333335,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.059457809263169785,
|
56 |
-
"math_pass@1:32_samples": 0.3968749999999999,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.060234627158371676,
|
58 |
-
"math_pass@1:64_samples": 0.3989583333333334,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.058255348631323874
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "3c721c7a9344ad5e"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "316677ce6ea763a4"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1038687.737303237,
|
8 |
-
"end_time": 1042926.270903001,
|
9 |
-
"total_evaluation_time_secondes": "4238.533599763992",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.41919191919191917,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
|
36 |
-
"gpqa_pass@1:4_samples": 0.4911616161616162,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.026149437445442125,
|
38 |
-
"gpqa_pass@1:8_samples": 0.49747474747474746,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.02373146147193087
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.41919191919191917,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.035155207286704175,
|
44 |
-
"gpqa_pass@1:4_samples": 0.4911616161616162,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.026149437445442125,
|
46 |
-
"gpqa_pass@1:8_samples": 0.49747474747474746,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.02373146147193087
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "5f36388c28f23b2f"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "b4ff756ebb6cb76f"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 195649.240633843,
|
8 |
-
"end_time": 203014.740776724,
|
9 |
-
"total_evaluation_time_secondes": "7365.5001428810065",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.12995049504950495,
|
35 |
-
"codegen_pass@1:16_stderr": 0.019894585442980322
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.12995049504950495,
|
39 |
-
"codegen_pass@1:16_stderr": 0.019894585442980322
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "058943720bd4b8b5"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "a230e53cef0867b7"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 3515687.21682791,
|
8 |
-
"end_time": 3535561.065907141,
|
9 |
-
"total_evaluation_time_secondes": "19873.849079231266",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.26666666666666666,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.0821175682735253,
|
36 |
-
"math_pass@1:4_samples": 0.3416666666666667,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.07140154113622724,
|
38 |
-
"math_pass@1:8_samples": 0.32916666666666666,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.05947668495839227,
|
40 |
-
"math_pass@1:16_samples": 0.3333333333333333,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06099725984675264,
|
42 |
-
"math_pass@1:32_samples": 0.3510416666666666,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06140717235446427,
|
44 |
-
"math_pass@1:64_samples": 0.3567708333333333,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06195924170213546
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.26666666666666666,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.0821175682735253,
|
50 |
-
"math_pass@1:4_samples": 0.3416666666666667,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.07140154113622724,
|
52 |
-
"math_pass@1:8_samples": 0.32916666666666666,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.05947668495839227,
|
54 |
-
"math_pass@1:16_samples": 0.3333333333333333,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06099725984675264,
|
56 |
-
"math_pass@1:32_samples": 0.3510416666666666,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06140717235446427,
|
58 |
-
"math_pass@1:64_samples": 0.3567708333333333,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06195924170213546
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "1d3092217357c073"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "2c87a9a122dd4f20"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 3515687.21683147,
|
8 |
-
"end_time": 3531824.923517652,
|
9 |
-
"total_evaluation_time_secondes": "16137.706686181948",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.4797979797979798,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.035594435655639196,
|
36 |
-
"gpqa_pass@1:4_samples": 0.4684343434343434,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.026426494324412967,
|
38 |
-
"gpqa_pass@1:8_samples": 0.461489898989899,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.024035774175898385
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.4797979797979798,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.035594435655639196,
|
44 |
-
"gpqa_pass@1:4_samples": 0.4684343434343434,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.026426494324412967,
|
46 |
-
"gpqa_pass@1:8_samples": 0.461489898989899,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.024035774175898385
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "e7d8965b0240d400"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "17f24ae7e354a0f4"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 3515687.21683316,
|
8 |
-
"end_time": 3535202.419424415,
|
9 |
-
"total_evaluation_time_secondes": "19515.202591255307",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.14913366336633663,
|
35 |
-
"codegen_pass@1:16_stderr": 0.022127176170976755
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.14913366336633663,
|
39 |
-
"codegen_pass@1:16_stderr": 0.022127176170976755
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "8c430f2397c9f2d6"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "737d8b05e5405367"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1459863.759389219,
|
8 |
-
"end_time": 1464656.175600588,
|
9 |
-
"total_evaluation_time_secondes": "4792.416211368982",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.08753762190648169,
|
36 |
-
"math_pass@1:4_samples": 0.375,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.07266479487202247,
|
38 |
-
"math_pass@1:8_samples": 0.36666666666666664,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.06752323284011273,
|
40 |
-
"math_pass@1:16_samples": 0.3458333333333333,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.0651543628329568,
|
42 |
-
"math_pass@1:32_samples": 0.35625,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06644516903813058,
|
44 |
-
"math_pass@1:64_samples": 0.3447916666666667,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06668770995112147
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.08753762190648169,
|
50 |
-
"math_pass@1:4_samples": 0.375,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.07266479487202247,
|
52 |
-
"math_pass@1:8_samples": 0.36666666666666664,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.06752323284011273,
|
54 |
-
"math_pass@1:16_samples": 0.3458333333333333,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.0651543628329568,
|
56 |
-
"math_pass@1:32_samples": 0.35625,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06644516903813058,
|
58 |
-
"math_pass@1:64_samples": 0.3447916666666667,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06668770995112147
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "f1a35fbe9cb5d79d"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "9a1149edaf877e78"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 1459863.759393328,
|
8 |
-
"end_time": 1462431.22071959,
|
9 |
-
"total_evaluation_time_secondes": "2567.4613262619823",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.4494949494949495,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.0354413249194797,
|
36 |
-
"gpqa_pass@1:4_samples": 0.45580808080808083,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.025905050629807098,
|
38 |
-
"gpqa_pass@1:8_samples": 0.4659090909090909,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.023860690969531115
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.4494949494949495,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.0354413249194797,
|
44 |
-
"gpqa_pass@1:4_samples": 0.45580808080808083,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.025905050629807098,
|
46 |
-
"gpqa_pass@1:8_samples": 0.4659090909090909,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.023860690969531115
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "95e4a38cf316bba8"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "20add5806d35746d"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 4549324.508904906,
|
8 |
-
"end_time": 4556098.186270666,
|
9 |
-
"total_evaluation_time_secondes": "6773.677365760319",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.28898514851485146,
|
35 |
-
"codegen_pass@1:16_stderr": 0.031818486170004524
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.28898514851485146,
|
39 |
-
"codegen_pass@1:16_stderr": 0.031818486170004524
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "286a12cf9de34606"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "22a2678d6edd498b"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 2232253.173363435,
|
8 |
-
"end_time": 2237347.560457066,
|
9 |
-
"total_evaluation_time_secondes": "5094.387093631085",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.08753762190648172,
|
36 |
-
"math_pass@1:4_samples": 0.36666666666666664,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.07459982640473675,
|
38 |
-
"math_pass@1:8_samples": 0.3625,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.06842596048990117,
|
40 |
-
"math_pass@1:16_samples": 0.3729166666666667,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06849700926155665,
|
42 |
-
"math_pass@1:32_samples": 0.3625,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06721777418709032,
|
44 |
-
"math_pass@1:64_samples": 0.3661458333333334,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06797451270875941
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.3333333333333333,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.08753762190648172,
|
50 |
-
"math_pass@1:4_samples": 0.36666666666666664,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.07459982640473675,
|
52 |
-
"math_pass@1:8_samples": 0.3625,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.06842596048990117,
|
54 |
-
"math_pass@1:16_samples": 0.3729166666666667,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06849700926155665,
|
56 |
-
"math_pass@1:32_samples": 0.3625,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06721777418709032,
|
58 |
-
"math_pass@1:64_samples": 0.3661458333333334,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06797451270875941
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "414e787f40171ca1"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "ddef51b0406c3982"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 2232253.173368685,
|
8 |
-
"end_time": 2234788.75330911,
|
9 |
-
"total_evaluation_time_secondes": "2535.579940425232",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.47474747474747475,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.035578062450873145,
|
36 |
-
"gpqa_pass@1:4_samples": 0.4671717171717172,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.02577398329081247,
|
38 |
-
"gpqa_pass@1:8_samples": 0.46275252525252525,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.023284154380105133
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.47474747474747475,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.035578062450873145,
|
44 |
-
"gpqa_pass@1:4_samples": 0.4671717171717172,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.02577398329081247,
|
46 |
-
"gpqa_pass@1:8_samples": 0.46275252525252525,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.023284154380105133
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "e1585b875c6c5ab7"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "931f077715c52375"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 2232253.173378256,
|
8 |
-
"end_time": 2239358.756926505,
|
9 |
-
"total_evaluation_time_secondes": "7105.5835482487455",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.31064356435643564,
|
35 |
-
"codegen_pass@1:16_stderr": 0.03401113048493175
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.31064356435643564,
|
39 |
-
"codegen_pass@1:16_stderr": 0.03401113048493175
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "53e4514580daf13c"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "181ee4063ef7c525"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 45759.384835112,
|
8 |
-
"end_time": 50908.194217762,
|
9 |
-
"total_evaluation_time_secondes": "5148.809382649997",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|aime24|0": {
|
34 |
-
"math_pass@1:1_samples": 0.4666666666666667,
|
35 |
-
"math_pass@1:1_samples_stderr": 0.09264111117062017,
|
36 |
-
"math_pass@1:4_samples": 0.4083333333333333,
|
37 |
-
"math_pass@1:4_samples_stderr": 0.07904179504530925,
|
38 |
-
"math_pass@1:8_samples": 0.4,
|
39 |
-
"math_pass@1:8_samples_stderr": 0.0725163477407348,
|
40 |
-
"math_pass@1:16_samples": 0.4,
|
41 |
-
"math_pass@1:16_samples_stderr": 0.06999589478931119,
|
42 |
-
"math_pass@1:32_samples": 0.384375,
|
43 |
-
"math_pass@1:32_samples_stderr": 0.06687540283475192,
|
44 |
-
"math_pass@1:64_samples": 0.3895833333333334,
|
45 |
-
"math_pass@1:64_samples_stderr": 0.06801268906639514
|
46 |
-
},
|
47 |
-
"all": {
|
48 |
-
"math_pass@1:1_samples": 0.4666666666666667,
|
49 |
-
"math_pass@1:1_samples_stderr": 0.09264111117062017,
|
50 |
-
"math_pass@1:4_samples": 0.4083333333333333,
|
51 |
-
"math_pass@1:4_samples_stderr": 0.07904179504530925,
|
52 |
-
"math_pass@1:8_samples": 0.4,
|
53 |
-
"math_pass@1:8_samples_stderr": 0.0725163477407348,
|
54 |
-
"math_pass@1:16_samples": 0.4,
|
55 |
-
"math_pass@1:16_samples_stderr": 0.06999589478931119,
|
56 |
-
"math_pass@1:32_samples": 0.384375,
|
57 |
-
"math_pass@1:32_samples_stderr": 0.06687540283475192,
|
58 |
-
"math_pass@1:64_samples": 0.3895833333333334,
|
59 |
-
"math_pass@1:64_samples_stderr": 0.06801268906639514
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"versions": {
|
63 |
-
"lighteval|aime24|0": 2
|
64 |
-
},
|
65 |
-
"config_tasks": {
|
66 |
-
"lighteval|aime24": {
|
67 |
-
"name": "aime24",
|
68 |
-
"prompt_function": "aime_prompt_fn",
|
69 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
70 |
-
"hf_subset": "default",
|
71 |
-
"metric": [
|
72 |
-
{
|
73 |
-
"metric_name": "math_pass@1:1_samples",
|
74 |
-
"higher_is_better": true,
|
75 |
-
"category": "5",
|
76 |
-
"use_case": "6",
|
77 |
-
"sample_level_fn": "compute",
|
78 |
-
"corpus_level_fn": "mean"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"metric_name": "math_pass@1:4_samples",
|
82 |
-
"higher_is_better": true,
|
83 |
-
"category": "5",
|
84 |
-
"use_case": "6",
|
85 |
-
"sample_level_fn": "compute",
|
86 |
-
"corpus_level_fn": "mean"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"metric_name": "math_pass@1:8_samples",
|
90 |
-
"higher_is_better": true,
|
91 |
-
"category": "5",
|
92 |
-
"use_case": "6",
|
93 |
-
"sample_level_fn": "compute",
|
94 |
-
"corpus_level_fn": "mean"
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"metric_name": "math_pass@1:16_samples",
|
98 |
-
"higher_is_better": true,
|
99 |
-
"category": "5",
|
100 |
-
"use_case": "6",
|
101 |
-
"sample_level_fn": "compute",
|
102 |
-
"corpus_level_fn": "mean"
|
103 |
-
},
|
104 |
-
{
|
105 |
-
"metric_name": "math_pass@1:32_samples",
|
106 |
-
"higher_is_better": true,
|
107 |
-
"category": "5",
|
108 |
-
"use_case": "6",
|
109 |
-
"sample_level_fn": "compute",
|
110 |
-
"corpus_level_fn": "mean"
|
111 |
-
},
|
112 |
-
{
|
113 |
-
"metric_name": "math_pass@1:64_samples",
|
114 |
-
"higher_is_better": true,
|
115 |
-
"category": "5",
|
116 |
-
"use_case": "6",
|
117 |
-
"sample_level_fn": "compute",
|
118 |
-
"corpus_level_fn": "mean"
|
119 |
-
}
|
120 |
-
],
|
121 |
-
"hf_revision": null,
|
122 |
-
"hf_filter": null,
|
123 |
-
"hf_avail_splits": [
|
124 |
-
"train"
|
125 |
-
],
|
126 |
-
"trust_dataset": false,
|
127 |
-
"evaluation_splits": [
|
128 |
-
"train"
|
129 |
-
],
|
130 |
-
"few_shots_split": null,
|
131 |
-
"few_shots_select": null,
|
132 |
-
"generation_size": 32768,
|
133 |
-
"generation_grammar": null,
|
134 |
-
"stop_sequence": [],
|
135 |
-
"num_samples": null,
|
136 |
-
"suite": [
|
137 |
-
"lighteval"
|
138 |
-
],
|
139 |
-
"original_num_docs": 30,
|
140 |
-
"effective_num_docs": 30,
|
141 |
-
"must_remove_duplicate_docs": false,
|
142 |
-
"version": 2
|
143 |
-
}
|
144 |
-
},
|
145 |
-
"summary_tasks": {
|
146 |
-
"lighteval|aime24|0": {
|
147 |
-
"hashes": {
|
148 |
-
"hash_examples": "ddec8fc79d0a014b",
|
149 |
-
"hash_full_prompts": "d1829811f23cf34b",
|
150 |
-
"hash_input_tokens": "7211f832bf7f8d79",
|
151 |
-
"hash_cont_tokens": "6539c0aead2e735a"
|
152 |
-
},
|
153 |
-
"truncated": 0,
|
154 |
-
"non_truncated": 30,
|
155 |
-
"padded": 0,
|
156 |
-
"non_padded": 30,
|
157 |
-
"effective_few_shots": 0.0,
|
158 |
-
"num_truncated_few_shots": 0
|
159 |
-
}
|
160 |
-
},
|
161 |
-
"summary_general": {
|
162 |
-
"hashes": {
|
163 |
-
"hash_examples": "c903e836a519cf98",
|
164 |
-
"hash_full_prompts": "09fe8694776a7143",
|
165 |
-
"hash_input_tokens": "b52bc353fe82900e",
|
166 |
-
"hash_cont_tokens": "42ffa955b6687226"
|
167 |
-
},
|
168 |
-
"truncated": 0,
|
169 |
-
"non_truncated": 30,
|
170 |
-
"padded": 0,
|
171 |
-
"non_padded": 30,
|
172 |
-
"num_truncated_few_shots": 0
|
173 |
-
}
|
174 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 45759.384842682,
|
8 |
-
"end_time": 48213.678803292,
|
9 |
-
"total_evaluation_time_secondes": "2454.2939606100044",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"lighteval|gpqa:diamond|0": {
|
34 |
-
"gpqa_pass@1:1_samples": 0.494949494949495,
|
35 |
-
"gpqa_pass@1:1_samples_stderr": 0.035621707606254015,
|
36 |
-
"gpqa_pass@1:4_samples": 0.48737373737373735,
|
37 |
-
"gpqa_pass@1:4_samples_stderr": 0.02574005112363341,
|
38 |
-
"gpqa_pass@1:8_samples": 0.4722222222222222,
|
39 |
-
"gpqa_pass@1:8_samples_stderr": 0.023598609724334477
|
40 |
-
},
|
41 |
-
"all": {
|
42 |
-
"gpqa_pass@1:1_samples": 0.494949494949495,
|
43 |
-
"gpqa_pass@1:1_samples_stderr": 0.035621707606254015,
|
44 |
-
"gpqa_pass@1:4_samples": 0.48737373737373735,
|
45 |
-
"gpqa_pass@1:4_samples_stderr": 0.02574005112363341,
|
46 |
-
"gpqa_pass@1:8_samples": 0.4722222222222222,
|
47 |
-
"gpqa_pass@1:8_samples_stderr": 0.023598609724334477
|
48 |
-
}
|
49 |
-
},
|
50 |
-
"versions": {
|
51 |
-
"lighteval|gpqa:diamond|0": 1
|
52 |
-
},
|
53 |
-
"config_tasks": {
|
54 |
-
"lighteval|gpqa:diamond": {
|
55 |
-
"name": "gpqa:diamond",
|
56 |
-
"prompt_function": "gpqa_instruct",
|
57 |
-
"hf_repo": "Idavidrein/gpqa",
|
58 |
-
"hf_subset": "gpqa_diamond",
|
59 |
-
"metric": [
|
60 |
-
{
|
61 |
-
"metric_name": "gpqa_pass@1:1_samples",
|
62 |
-
"higher_is_better": true,
|
63 |
-
"category": "5",
|
64 |
-
"use_case": "6",
|
65 |
-
"sample_level_fn": "compute",
|
66 |
-
"corpus_level_fn": "mean"
|
67 |
-
},
|
68 |
-
{
|
69 |
-
"metric_name": "gpqa_pass@1:4_samples",
|
70 |
-
"higher_is_better": true,
|
71 |
-
"category": "5",
|
72 |
-
"use_case": "6",
|
73 |
-
"sample_level_fn": "compute",
|
74 |
-
"corpus_level_fn": "mean"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"metric_name": "gpqa_pass@1:8_samples",
|
78 |
-
"higher_is_better": true,
|
79 |
-
"category": "5",
|
80 |
-
"use_case": "6",
|
81 |
-
"sample_level_fn": "compute",
|
82 |
-
"corpus_level_fn": "mean"
|
83 |
-
}
|
84 |
-
],
|
85 |
-
"hf_revision": null,
|
86 |
-
"hf_filter": null,
|
87 |
-
"hf_avail_splits": [
|
88 |
-
"train"
|
89 |
-
],
|
90 |
-
"trust_dataset": true,
|
91 |
-
"evaluation_splits": [
|
92 |
-
"train"
|
93 |
-
],
|
94 |
-
"few_shots_split": null,
|
95 |
-
"few_shots_select": null,
|
96 |
-
"generation_size": 32768,
|
97 |
-
"generation_grammar": null,
|
98 |
-
"stop_sequence": [],
|
99 |
-
"num_samples": null,
|
100 |
-
"suite": [
|
101 |
-
"lighteval"
|
102 |
-
],
|
103 |
-
"original_num_docs": 198,
|
104 |
-
"effective_num_docs": 198,
|
105 |
-
"must_remove_duplicate_docs": false,
|
106 |
-
"version": 1
|
107 |
-
}
|
108 |
-
},
|
109 |
-
"summary_tasks": {
|
110 |
-
"lighteval|gpqa:diamond|0": {
|
111 |
-
"hashes": {
|
112 |
-
"hash_examples": "50ecb6f5d091bd95",
|
113 |
-
"hash_full_prompts": "390ed6852fd81cbb",
|
114 |
-
"hash_input_tokens": "fa33490cb78df6ed",
|
115 |
-
"hash_cont_tokens": "37b107e03a2fa405"
|
116 |
-
},
|
117 |
-
"truncated": 0,
|
118 |
-
"non_truncated": 198,
|
119 |
-
"padded": 0,
|
120 |
-
"non_padded": 198,
|
121 |
-
"effective_few_shots": 0.0,
|
122 |
-
"num_truncated_few_shots": 0
|
123 |
-
}
|
124 |
-
},
|
125 |
-
"summary_general": {
|
126 |
-
"hashes": {
|
127 |
-
"hash_examples": "a9318dbdd867770b",
|
128 |
-
"hash_full_prompts": "d039833a1bb27d62",
|
129 |
-
"hash_input_tokens": "c464503d38a8a664",
|
130 |
-
"hash_cont_tokens": "6503080bbaae38cf"
|
131 |
-
},
|
132 |
-
"truncated": 0,
|
133 |
-
"non_truncated": 198,
|
134 |
-
"padded": 0,
|
135 |
-
"non_padded": 198,
|
136 |
-
"num_truncated_few_shots": 0
|
137 |
-
}
|
138 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"max_samples": null,
|
6 |
-
"job_id": 0,
|
7 |
-
"start_time": 3757625.063178942,
|
8 |
-
"end_time": 3764306.98508252,
|
9 |
-
"total_evaluation_time_secondes": "6681.921903578099",
|
10 |
-
"model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges",
|
11 |
-
"model_sha": "",
|
12 |
-
"model_dtype": null,
|
13 |
-
"model_size": null,
|
14 |
-
"generation_parameters": {
|
15 |
-
"early_stopping": null,
|
16 |
-
"repetition_penalty": null,
|
17 |
-
"frequency_penalty": null,
|
18 |
-
"length_penalty": null,
|
19 |
-
"presence_penalty": null,
|
20 |
-
"max_new_tokens": 32768,
|
21 |
-
"min_new_tokens": null,
|
22 |
-
"seed": null,
|
23 |
-
"stop_tokens": null,
|
24 |
-
"temperature": 0.6,
|
25 |
-
"top_k": null,
|
26 |
-
"min_p": null,
|
27 |
-
"top_p": 0.95,
|
28 |
-
"truncate_prompt": null,
|
29 |
-
"response_format": null
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"results": {
|
33 |
-
"extended|lcb:codegeneration_v4|0": {
|
34 |
-
"codegen_pass@1:16": 0.3087871287128713,
|
35 |
-
"codegen_pass@1:16_stderr": 0.03410613555508327
|
36 |
-
},
|
37 |
-
"all": {
|
38 |
-
"codegen_pass@1:16": 0.3087871287128713,
|
39 |
-
"codegen_pass@1:16_stderr": 0.03410613555508327
|
40 |
-
}
|
41 |
-
},
|
42 |
-
"versions": {
|
43 |
-
"extended|lcb:codegeneration_v4|0": 0
|
44 |
-
},
|
45 |
-
"config_tasks": {
|
46 |
-
"extended|lcb:codegeneration_v4": {
|
47 |
-
"name": "lcb:codegeneration_v4",
|
48 |
-
"prompt_function": "lcb_codegeneration_prompt_fn",
|
49 |
-
"hf_repo": "livecodebench/code_generation_lite",
|
50 |
-
"hf_subset": "v4",
|
51 |
-
"metric": [
|
52 |
-
{
|
53 |
-
"metric_name": "codegen_pass@1:16",
|
54 |
-
"higher_is_better": true,
|
55 |
-
"category": "5",
|
56 |
-
"use_case": "6",
|
57 |
-
"sample_level_fn": "codegen_metric",
|
58 |
-
"corpus_level_fn": "mean"
|
59 |
-
}
|
60 |
-
],
|
61 |
-
"hf_revision": null,
|
62 |
-
"hf_filter": null,
|
63 |
-
"hf_avail_splits": [
|
64 |
-
"test"
|
65 |
-
],
|
66 |
-
"trust_dataset": true,
|
67 |
-
"evaluation_splits": [
|
68 |
-
"test"
|
69 |
-
],
|
70 |
-
"few_shots_split": null,
|
71 |
-
"few_shots_select": null,
|
72 |
-
"generation_size": 32768,
|
73 |
-
"generation_grammar": null,
|
74 |
-
"stop_sequence": [],
|
75 |
-
"num_samples": null,
|
76 |
-
"suite": [
|
77 |
-
"extended"
|
78 |
-
],
|
79 |
-
"original_num_docs": 101,
|
80 |
-
"effective_num_docs": 101,
|
81 |
-
"must_remove_duplicate_docs": false,
|
82 |
-
"version": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_tasks": {
|
86 |
-
"extended|lcb:codegeneration_v4|0": {
|
87 |
-
"hashes": {
|
88 |
-
"hash_examples": "e7a7c74abad0ab2c",
|
89 |
-
"hash_full_prompts": "94b75660a771bba7",
|
90 |
-
"hash_input_tokens": "e3b779035e44a0d6",
|
91 |
-
"hash_cont_tokens": "fd6fcb680afd951f"
|
92 |
-
},
|
93 |
-
"truncated": 0,
|
94 |
-
"non_truncated": 101,
|
95 |
-
"padded": 0,
|
96 |
-
"non_padded": 101,
|
97 |
-
"effective_few_shots": 0.0,
|
98 |
-
"num_truncated_few_shots": 0
|
99 |
-
}
|
100 |
-
},
|
101 |
-
"summary_general": {
|
102 |
-
"hashes": {
|
103 |
-
"hash_examples": "1e6aa5adc63aa1b2",
|
104 |
-
"hash_full_prompts": "3f758d56940d66a4",
|
105 |
-
"hash_input_tokens": "dcc90dd35f661f20",
|
106 |
-
"hash_cont_tokens": "4f8fb4bb608ae2ab"
|
107 |
-
},
|
108 |
-
"truncated": 0,
|
109 |
-
"non_truncated": 101,
|
110 |
-
"padded": 0,
|
111 |
-
"non_padded": 101,
|
112 |
-
"num_truncated_few_shots": 0
|
113 |
-
}
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|