diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json deleted file mode 100644 index 3bc3c6a0d28d215bd561c6ed01c560037c793cca..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/aime24/results_2025-05-09T17-54-59.392443.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4229414.83378225, - "end_time": 4233704.73810174, - "total_evaluation_time_secondes": "4289.904319490306", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.3, - "math_pass@1:1_samples_stderr": 0.0850962943396763, - "math_pass@1:4_samples": 0.30833333333333335, - "math_pass@1:4_samples_stderr": 0.06639672359547992, - "math_pass@1:8_samples": 0.32083333333333336, - "math_pass@1:8_samples_stderr": 0.0663831976271335, - "math_pass@1:16_samples": 0.3375, - "math_pass@1:16_samples_stderr": 0.06514517386699431, - "math_pass@1:32_samples": 0.34375, - "math_pass@1:32_samples_stderr": 0.06482735693283008, - "math_pass@1:64_samples": 0.34010416666666665, - "math_pass@1:64_samples_stderr": 0.06450050595768915 - }, - "all": { - "math_pass@1:1_samples": 0.3, - "math_pass@1:1_samples_stderr": 0.0850962943396763, - "math_pass@1:4_samples": 0.30833333333333335, - "math_pass@1:4_samples_stderr": 0.06639672359547992, - "math_pass@1:8_samples": 0.32083333333333336, - "math_pass@1:8_samples_stderr": 0.0663831976271335, - "math_pass@1:16_samples": 0.3375, - "math_pass@1:16_samples_stderr": 0.06514517386699431, - "math_pass@1:32_samples": 0.34375, - "math_pass@1:32_samples_stderr": 0.06482735693283008, - "math_pass@1:64_samples": 0.34010416666666665, - "math_pass@1:64_samples_stderr": 0.06450050595768915 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "a06d62ae60da2e78" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "15d38a88ca396a91" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json deleted file mode 100644 index ba35f842d77a659fa66cd2b942bd183ec88a3bfb..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/gpqa/results_2025-05-09T17-17-43.158034.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 255139.778479921, - "end_time": 257187.860357864, - "total_evaluation_time_secondes": "2048.081877943012", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.035212249088415824, - "gpqa_pass@1:4_samples": 0.4595959595959596, - "gpqa_pass@1:4_samples_stderr": 0.026456334635227167, - "gpqa_pass@1:8_samples": 0.4659090909090909, - "gpqa_pass@1:8_samples_stderr": 0.024944118627141743 - }, - "all": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.035212249088415824, - "gpqa_pass@1:4_samples": 0.4595959595959596, - "gpqa_pass@1:4_samples_stderr": 0.026456334635227167, - "gpqa_pass@1:8_samples": 0.4659090909090909, - "gpqa_pass@1:8_samples_stderr": 0.024944118627141743 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "e71088c1c1cb0672" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "82854e16d8413332" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json deleted file mode 100644 index 862f6bde206c9b6a0722a8c9d01d388ab8d38ae9..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_v02.00-step-000006735_weights-0.33-0.33-0.33/lcb_v4/results_2025-05-09T18-07-11.529902.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 255139.778483932, - "end_time": 260156.247424606, - "total_evaluation_time_secondes": "5016.468940674007", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.1547029702970297, - "codegen_pass@1:16_stderr": 0.025710721439537176 - }, - "all": { - "codegen_pass@1:16": 0.1547029702970297, - "codegen_pass@1:16_stderr": 0.025710721439537176 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "d5c1f33cd610d8c2" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "4fd58e23485de17c" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json deleted file mode 100644 index c50a3ee188ab5120340f4cbbd5b1f86b35e4afbd..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/aime24/results_2025-05-09T18-29-57.523561.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4203294.857831319, - "end_time": 4209677.501601624, - "total_evaluation_time_secondes": "6382.64377030544", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.0875376219064817, - "math_pass@1:4_samples": 0.35, - "math_pass@1:4_samples_stderr": 0.07050719373096685, - "math_pass@1:8_samples": 0.3375, - "math_pass@1:8_samples_stderr": 0.06486889962016494, - "math_pass@1:16_samples": 0.3541666666666667, - "math_pass@1:16_samples_stderr": 0.06415435021369162, - "math_pass@1:32_samples": 0.33645833333333336, - "math_pass@1:32_samples_stderr": 0.06365109759161156, - "math_pass@1:64_samples": 0.3302083333333334, - "math_pass@1:64_samples_stderr": 0.06445720210168741 - }, - "all": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.0875376219064817, - "math_pass@1:4_samples": 0.35, - "math_pass@1:4_samples_stderr": 0.07050719373096685, - "math_pass@1:8_samples": 0.3375, - "math_pass@1:8_samples_stderr": 0.06486889962016494, - "math_pass@1:16_samples": 0.3541666666666667, - "math_pass@1:16_samples_stderr": 0.06415435021369162, - "math_pass@1:32_samples": 0.33645833333333336, - "math_pass@1:32_samples_stderr": 0.06365109759161156, - "math_pass@1:64_samples": 0.3302083333333334, - "math_pass@1:64_samples_stderr": 0.06445720210168741 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "0e5a5f1153950a6a" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "32344497527b5d97" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json deleted file mode 100644 index 00f74766c160c79a1e7fcf50c77937256177a280..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/gpqa/results_2025-05-09T17-34-12.167880.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4203294.857834889, - "end_time": 4206332.097575001, - "total_evaluation_time_secondes": "3037.2397401127964", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.03521224908841583, - "gpqa_pass@1:4_samples": 0.4621212121212121, - "gpqa_pass@1:4_samples_stderr": 0.025425623214011414, - "gpqa_pass@1:8_samples": 0.442550505050505, - "gpqa_pass@1:8_samples_stderr": 0.02303993298372008 - }, - "all": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.03521224908841583, - "gpqa_pass@1:4_samples": 0.4621212121212121, - "gpqa_pass@1:4_samples_stderr": 0.025425623214011414, - "gpqa_pass@1:8_samples": 0.442550505050505, - "gpqa_pass@1:8_samples_stderr": 0.02303993298372008 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "7af1a756034c5bbc" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "6eb8ae5f37259377" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json deleted file mode 100644 index 9cd9631ebc1ee3a19e71fecb8dadc75868e71a05..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50/lcb_v4/results_2025-05-09T18-36-20.217086.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4229414.83378778, - "end_time": 4236185.516046914, - "total_evaluation_time_secondes": "6770.682259134017", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.19183168316831684, - "codegen_pass@1:16_stderr": 0.029331683168316847 - }, - "all": { - "codegen_pass@1:16": 0.19183168316831684, - "codegen_pass@1:16_stderr": 0.029331683168316847 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "9e98481ea25c6036" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "cbf7ec0857e47d2e" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json deleted file mode 100644 index 511698a8c29d7d7769609cce13500a37fbab2d84..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/gpqa/results_2025-05-14T09-50-49.390939.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 661654.831401038, - "end_time": 662374.12054711, - "total_evaluation_time_secondes": "719.2891460719984", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.29797979797979796, - "gpqa_pass@1:1_samples_stderr": 0.03258630383836555, - "gpqa_pass@1:4_samples": 0.2828282828282828, - "gpqa_pass@1:4_samples_stderr": 0.020745116201625495, - "gpqa_pass@1:8_samples": 0.29419191919191917, - "gpqa_pass@1:8_samples_stderr": 0.017370591329565217 - }, - "all": { - "gpqa_pass@1:1_samples": 0.29797979797979796, - "gpqa_pass@1:1_samples_stderr": 0.03258630383836555, - "gpqa_pass@1:4_samples": 0.2828282828282828, - "gpqa_pass@1:4_samples_stderr": 0.020745116201625495, - "gpqa_pass@1:8_samples": 0.29419191919191917, - "gpqa_pass@1:8_samples_stderr": 0.017370591329565217 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "266022d5f2e05638" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "60eaf58a791fe18e" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json deleted file mode 100644 index c92d19bd881cea86c2c0bd1ecf952708b736c8a7..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.0/lcb_v4/results_2025-05-14T10-23-28.414124.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1727441.946810529, - "end_time": 1730120.952128118, - "total_evaluation_time_secondes": "2679.0053175888024", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.0024752475247524753, - "codegen_pass@1:16_stderr": 0.0015030888368182086 - }, - "all": { - "codegen_pass@1:16": 0.0024752475247524753, - "codegen_pass@1:16_stderr": 0.0015030888368182086 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "c9c2c4597e65c9d2" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "e1b3bd8b8c45b5b3" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json deleted file mode 100644 index 95e09da75dea6f556125c46ff0cbfa8333d24f9a..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/aime24/results_2025-05-14T14-29-42.293814.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4375790.455053379, - "end_time": 4393182.469945202, - "total_evaluation_time_secondes": "17392.014891822822", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.06666666666666667, - "math_pass@1:1_samples_stderr": 0.046320555585310084, - "math_pass@1:4_samples": 0.041666666666666664, - "math_pass@1:4_samples_stderr": 0.02104774721520688, - "math_pass@1:8_samples": 0.04583333333333333, - "math_pass@1:8_samples_stderr": 0.022808653496633722, - "math_pass@1:16_samples": 0.05, - "math_pass@1:16_samples_stderr": 0.020847696198419845, - "math_pass@1:32_samples": 0.053125, - "math_pass@1:32_samples_stderr": 0.023507438752959405, - "math_pass@1:64_samples": 0.04791666666666667, - "math_pass@1:64_samples_stderr": 0.02018476822495169 - }, - "all": { - "math_pass@1:1_samples": 0.06666666666666667, - "math_pass@1:1_samples_stderr": 0.046320555585310084, - "math_pass@1:4_samples": 0.041666666666666664, - "math_pass@1:4_samples_stderr": 0.02104774721520688, - "math_pass@1:8_samples": 0.04583333333333333, - "math_pass@1:8_samples_stderr": 0.022808653496633722, - "math_pass@1:16_samples": 0.05, - "math_pass@1:16_samples_stderr": 0.020847696198419845, - "math_pass@1:32_samples": 0.053125, - "math_pass@1:32_samples_stderr": 0.023507438752959405, - "math_pass@1:64_samples": 0.04791666666666667, - "math_pass@1:64_samples_stderr": 0.02018476822495169 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "d19a02dc5ef5f1cc" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "75cbaf1a9fa10dae" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json deleted file mode 100644 index 63bf160fde2eca7c9b843cbc0e7064da72de4759..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/gpqa/results_2025-05-14T10-03-24.312664.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 662431.93175746, - "end_time": 663129.029831021, - "total_evaluation_time_secondes": "697.0980735609774", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.3383838383838384, - "gpqa_pass@1:1_samples_stderr": 0.03371124142626305, - "gpqa_pass@1:4_samples": 0.31565656565656564, - "gpqa_pass@1:4_samples_stderr": 0.019948058896725632, - "gpqa_pass@1:8_samples": 0.3194444444444444, - "gpqa_pass@1:8_samples_stderr": 0.017458419547239812 - }, - "all": { - "gpqa_pass@1:1_samples": 0.3383838383838384, - "gpqa_pass@1:1_samples_stderr": 0.03371124142626305, - "gpqa_pass@1:4_samples": 0.31565656565656564, - "gpqa_pass@1:4_samples_stderr": 0.019948058896725632, - "gpqa_pass@1:8_samples": 0.3194444444444444, - "gpqa_pass@1:8_samples_stderr": 0.017458419547239812 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "ec9269c9a892ac2b" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "916c86d0d9b7e51f" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json deleted file mode 100644 index 31224c9db3ffb26f17d0e6cb49875c0fb42ede91..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.1/lcb_v4/results_2025-05-14T10-41-21.354575.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1034056.23321307, - "end_time": 1036357.266139834, - "total_evaluation_time_secondes": "2301.03292676399", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.018564356435643563, - "codegen_pass@1:16_stderr": 0.006729419168462321 - }, - "all": { - "codegen_pass@1:16": 0.018564356435643563, - "codegen_pass@1:16_stderr": 0.006729419168462321 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "8cca433c93724a34" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "b35bed0f5fac56be" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json deleted file mode 100644 index afe3f7c4c62c9186176a795761a5d1e5873ba08b..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/aime24/results_2025-05-14T11-51-30.307308.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4379509.827509476, - "end_time": 4386019.321696337, - "total_evaluation_time_secondes": "6509.4941868614405", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.1, - "math_pass@1:1_samples_stderr": 0.055708601453115535, - "math_pass@1:4_samples": 0.10833333333333334, - "math_pass@1:4_samples_stderr": 0.04433955534939854, - "math_pass@1:8_samples": 0.0875, - "math_pass@1:8_samples_stderr": 0.036529589643742644, - "math_pass@1:16_samples": 0.07916666666666666, - "math_pass@1:16_samples_stderr": 0.028100377685248996, - "math_pass@1:32_samples": 0.078125, - "math_pass@1:32_samples_stderr": 0.028140959711881665, - "math_pass@1:64_samples": 0.0838541666666667, - "math_pass@1:64_samples_stderr": 0.028796300226399354 - }, - "all": { - "math_pass@1:1_samples": 0.1, - "math_pass@1:1_samples_stderr": 0.055708601453115535, - "math_pass@1:4_samples": 0.10833333333333334, - "math_pass@1:4_samples_stderr": 0.04433955534939854, - "math_pass@1:8_samples": 0.0875, - "math_pass@1:8_samples_stderr": 0.036529589643742644, - "math_pass@1:16_samples": 0.07916666666666666, - "math_pass@1:16_samples_stderr": 0.028100377685248996, - "math_pass@1:32_samples": 0.078125, - "math_pass@1:32_samples_stderr": 0.028140959711881665, - "math_pass@1:64_samples": 0.0838541666666667, - "math_pass@1:64_samples_stderr": 0.028796300226399354 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "002199bc178ae8cb" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "3e3c04aad02ace80" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json deleted file mode 100644 index 4b6dc13ce9471b479d2090f55ec1d058bb8ad16a..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/gpqa/results_2025-05-14T10-17-05.154786.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4100452.1099866, - "end_time": 4101297.327114199, - "total_evaluation_time_secondes": "845.2171275988221", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.3383838383838384, - "gpqa_pass@1:1_samples_stderr": 0.033711241426263035, - "gpqa_pass@1:4_samples": 0.32954545454545453, - "gpqa_pass@1:4_samples_stderr": 0.021141754316903207, - "gpqa_pass@1:8_samples": 0.3402777777777778, - "gpqa_pass@1:8_samples_stderr": 0.019152531335331204 - }, - "all": { - "gpqa_pass@1:1_samples": 0.3383838383838384, - "gpqa_pass@1:1_samples_stderr": 0.033711241426263035, - "gpqa_pass@1:4_samples": 0.32954545454545453, - "gpqa_pass@1:4_samples_stderr": 0.021141754316903207, - "gpqa_pass@1:8_samples": 0.3402777777777778, - "gpqa_pass@1:8_samples_stderr": 0.019152531335331204 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "2c81421b70193adb" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "36d252e8d20884d0" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json deleted file mode 100644 index aba31b346020801658dc63c8d7da1431e05cabec..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.2/lcb_v4/results_2025-05-14T10-36-51.019112.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 663217.335620686, - "end_time": 665135.776709884, - "total_evaluation_time_secondes": "1918.4410891981097", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.04084158415841584, - "codegen_pass@1:16_stderr": 0.012008777587382542 - }, - "all": { - "codegen_pass@1:16": 0.04084158415841584, - "codegen_pass@1:16_stderr": 0.012008777587382542 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "769ba85444eb5c3a" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "f2e5e2faa5b55b56" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json deleted file mode 100644 index e9dec9cddc41efcad73d54ddf512f03fa36a76ce..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/aime24/results_2025-05-14T10-43-40.634343.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1786741.173175074, - "end_time": 1788880.427720995, - "total_evaluation_time_secondes": "2139.254545920994", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.06666666666666667, - "math_pass@1:1_samples_stderr": 0.046320555585310084, - "math_pass@1:4_samples": 0.10833333333333334, - "math_pass@1:4_samples_stderr": 0.04433955534939852, - "math_pass@1:8_samples": 0.12916666666666668, - "math_pass@1:8_samples_stderr": 0.04812703695246316, - "math_pass@1:16_samples": 0.11666666666666667, - "math_pass@1:16_samples_stderr": 0.04039733214513608, - "math_pass@1:32_samples": 0.10937499999999999, - "math_pass@1:32_samples_stderr": 0.03777883642096245, - "math_pass@1:64_samples": 0.11250000000000003, - "math_pass@1:64_samples_stderr": 0.038334816937806986 - }, - "all": { - "math_pass@1:1_samples": 0.06666666666666667, - "math_pass@1:1_samples_stderr": 0.046320555585310084, - "math_pass@1:4_samples": 0.10833333333333334, - "math_pass@1:4_samples_stderr": 0.04433955534939852, - "math_pass@1:8_samples": 0.12916666666666668, - "math_pass@1:8_samples_stderr": 0.04812703695246316, - "math_pass@1:16_samples": 0.11666666666666667, - "math_pass@1:16_samples_stderr": 0.04039733214513608, - "math_pass@1:32_samples": 0.10937499999999999, - "math_pass@1:32_samples_stderr": 0.03777883642096245, - "math_pass@1:64_samples": 0.11250000000000003, - "math_pass@1:64_samples_stderr": 0.038334816937806986 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "390196d6efde02bb" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "f95079649d32b7fb" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json deleted file mode 100644 index 64e6d81e235b4d093b6741627fb8ac3641e5a7a0..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/gpqa/results_2025-05-14T10-21-30.046686.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1448505.258795178, - "end_time": 1449311.597771412, - "total_evaluation_time_secondes": "806.3389762339648", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.35353535353535354, - "gpqa_pass@1:1_samples_stderr": 0.03406086723547153, - "gpqa_pass@1:4_samples": 0.34974747474747475, - "gpqa_pass@1:4_samples_stderr": 0.020933262798829153, - "gpqa_pass@1:8_samples": 0.34974747474747475, - "gpqa_pass@1:8_samples_stderr": 0.018901969553475017 - }, - "all": { - "gpqa_pass@1:1_samples": 0.35353535353535354, - "gpqa_pass@1:1_samples_stderr": 0.03406086723547153, - "gpqa_pass@1:4_samples": 0.34974747474747475, - "gpqa_pass@1:4_samples_stderr": 0.020933262798829153, - "gpqa_pass@1:8_samples": 0.34974747474747475, - "gpqa_pass@1:8_samples_stderr": 0.018901969553475017 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "db8e9046047f49d2" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "9da8b4866f2c2766" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json deleted file mode 100644 index 9a3f64a5a29992a589a69c5ef595cae875908491..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.3/lcb_v4/results_2025-05-14T10-43-30.571087.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 139762.663140377, - "end_time": 141889.570223524, - "total_evaluation_time_secondes": "2126.9070831470017", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.05631188118811881, - "codegen_pass@1:16_stderr": 0.015118778650913064 - }, - "all": { - "codegen_pass@1:16": 0.05631188118811881, - "codegen_pass@1:16_stderr": 0.015118778650913064 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "63432947237b0029" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "65d78b27534d12ab" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json deleted file mode 100644 index 231cea0984abf90c36cf5fe2c3fdc60b3179e5b1..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/aime24/results_2025-05-14T10-45-46.548256.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 191200.869072273, - "end_time": 193465.549947105, - "total_evaluation_time_secondes": "2264.6808748320036", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.13333333333333333, - "math_pass@1:1_samples_stderr": 0.06312427686319994, - "math_pass@1:4_samples": 0.15833333333333333, - "math_pass@1:4_samples_stderr": 0.04716584419087129, - "math_pass@1:8_samples": 0.14583333333333331, - "math_pass@1:8_samples_stderr": 0.04283561423329068, - "math_pass@1:16_samples": 0.16875, - "math_pass@1:16_samples_stderr": 0.04634155275230052, - "math_pass@1:32_samples": 0.15729166666666664, - "math_pass@1:32_samples_stderr": 0.04530991949978092, - "math_pass@1:64_samples": 0.15052083333333335, - "math_pass@1:64_samples_stderr": 0.04492772455875969 - }, - "all": { - "math_pass@1:1_samples": 0.13333333333333333, - "math_pass@1:1_samples_stderr": 0.06312427686319994, - "math_pass@1:4_samples": 0.15833333333333333, - "math_pass@1:4_samples_stderr": 0.04716584419087129, - "math_pass@1:8_samples": 0.14583333333333331, - "math_pass@1:8_samples_stderr": 0.04283561423329068, - "math_pass@1:16_samples": 0.16875, - "math_pass@1:16_samples_stderr": 0.04634155275230052, - "math_pass@1:32_samples": 0.15729166666666664, - "math_pass@1:32_samples_stderr": 0.04530991949978092, - "math_pass@1:64_samples": 0.15052083333333335, - "math_pass@1:64_samples_stderr": 0.04492772455875969 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "64bb8b614a09d95e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "cec7cf7bfce92574" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json deleted file mode 100644 index 2fdfa88b9fc0dd864151e8344c2c8ebf7c464bc6..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/gpqa/results_2025-05-14T10-32-07.714971.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4101347.219924198, - "end_time": 4102199.893680966, - "total_evaluation_time_secondes": "852.6737567679957", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.36363636363636365, - "gpqa_pass@1:1_samples_stderr": 0.034273086529999344, - "gpqa_pass@1:4_samples": 0.35858585858585856, - "gpqa_pass@1:4_samples_stderr": 0.023017398790703834, - "gpqa_pass@1:8_samples": 0.3491161616161616, - "gpqa_pass@1:8_samples_stderr": 0.019574428228765074 - }, - "all": { - "gpqa_pass@1:1_samples": 0.36363636363636365, - "gpqa_pass@1:1_samples_stderr": 0.034273086529999344, - "gpqa_pass@1:4_samples": 0.35858585858585856, - "gpqa_pass@1:4_samples_stderr": 0.023017398790703834, - "gpqa_pass@1:8_samples": 0.3491161616161616, - "gpqa_pass@1:8_samples_stderr": 0.019574428228765074 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "2d700f06d1a2b9e8" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "8165d55bdeb1c114" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json deleted file mode 100644 index ebf8a52c744f47dc80d39524dd63d92882287fd9..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.4/lcb_v4/results_2025-05-14T10-56-43.916534.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1449399.391339998, - "end_time": 1451425.492734413, - "total_evaluation_time_secondes": "2026.1013944149017", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.07116336633663366, - "codegen_pass@1:16_stderr": 0.01817453760822587 - }, - "all": { - "codegen_pass@1:16": 0.07116336633663366, - "codegen_pass@1:16_stderr": 0.01817453760822587 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "91ac6717ee31129b" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "2405745051128cc6" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json deleted file mode 100644 index 5d4b6295b815afbf5a04abcd9bdbb95dd2f2d526..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/aime24/results_2025-05-14T10-57-53.248259.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1730268.444026561, - "end_time": 1732185.82558593, - "total_evaluation_time_secondes": "1917.3815593689214", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.1, - "math_pass@1:1_samples_stderr": 0.055708601453115555, - "math_pass@1:4_samples": 0.19166666666666668, - "math_pass@1:4_samples_stderr": 0.05708852438851644, - "math_pass@1:8_samples": 0.17916666666666667, - "math_pass@1:8_samples_stderr": 0.04894113478792151, - "math_pass@1:16_samples": 0.175, - "math_pass@1:16_samples_stderr": 0.05053592099036909, - "math_pass@1:32_samples": 0.17291666666666664, - "math_pass@1:32_samples_stderr": 0.04727993993552142, - "math_pass@1:64_samples": 0.17343750000000005, - "math_pass@1:64_samples_stderr": 0.0472883475530281 - }, - "all": { - "math_pass@1:1_samples": 0.1, - "math_pass@1:1_samples_stderr": 0.055708601453115555, - "math_pass@1:4_samples": 0.19166666666666668, - "math_pass@1:4_samples_stderr": 0.05708852438851644, - "math_pass@1:8_samples": 0.17916666666666667, - "math_pass@1:8_samples_stderr": 0.04894113478792151, - "math_pass@1:16_samples": 0.175, - "math_pass@1:16_samples_stderr": 0.05053592099036909, - "math_pass@1:32_samples": 0.17291666666666664, - "math_pass@1:32_samples_stderr": 0.04727993993552142, - "math_pass@1:64_samples": 0.17343750000000005, - "math_pass@1:64_samples_stderr": 0.0472883475530281 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "baad61c00784bbc6" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "1d2d658abf54d77a" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json deleted file mode 100644 index 9a7b2d1d45b02fcbe60873f79c87c2e384f39531..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/gpqa/results_2025-05-14T10-51-02.205828.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4102309.818978777, - "end_time": 4103334.357564577, - "total_evaluation_time_secondes": "1024.5385857997462", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.3383838383838384, - "gpqa_pass@1:1_samples_stderr": 0.033711241426263035, - "gpqa_pass@1:4_samples": 0.37626262626262624, - "gpqa_pass@1:4_samples_stderr": 0.023116332418314938, - "gpqa_pass@1:8_samples": 0.37941919191919193, - "gpqa_pass@1:8_samples_stderr": 0.020691066749223084 - }, - "all": { - "gpqa_pass@1:1_samples": 0.3383838383838384, - "gpqa_pass@1:1_samples_stderr": 0.033711241426263035, - "gpqa_pass@1:4_samples": 0.37626262626262624, - "gpqa_pass@1:4_samples_stderr": 0.023116332418314938, - "gpqa_pass@1:8_samples": 0.37941919191919193, - "gpqa_pass@1:8_samples_stderr": 0.020691066749223084 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "f67476bc22ade0f4" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "e01d5e8a161d251b" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json deleted file mode 100644 index 5fb789ba8573d071fb1137ffabb25816c830c060..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.5/lcb_v4/results_2025-05-14T11-17-22.812297.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 665261.659275858, - "end_time": 667567.554631831, - "total_evaluation_time_secondes": "2305.8953559730435", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.0983910891089109, - "codegen_pass@1:16_stderr": 0.01992689560651361 - }, - "all": { - "codegen_pass@1:16": 0.0983910891089109, - "codegen_pass@1:16_stderr": 0.01992689560651361 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "99f5478f5867964b" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "c6ae0af3cc4b2079" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json deleted file mode 100644 index 378a618e3881cfc52974c5a8ad009344d3cb207b..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/aime24/results_2025-05-14T11-18-50.745764.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1036513.965434865, - "end_time": 1038606.671981354, - "total_evaluation_time_secondes": "2092.706546489033", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.3, - "math_pass@1:1_samples_stderr": 0.0850962943396763, - "math_pass@1:4_samples": 0.25833333333333336, - "math_pass@1:4_samples_stderr": 0.060509490721976415, - "math_pass@1:8_samples": 0.23333333333333334, - "math_pass@1:8_samples_stderr": 0.056136806073810275, - "math_pass@1:16_samples": 0.23541666666666666, - "math_pass@1:16_samples_stderr": 0.05651544435100526, - "math_pass@1:32_samples": 0.23333333333333334, - "math_pass@1:32_samples_stderr": 0.05619676067142738, - "math_pass@1:64_samples": 0.225, - "math_pass@1:64_samples_stderr": 0.05496823468797815 - }, - "all": { - "math_pass@1:1_samples": 0.3, - "math_pass@1:1_samples_stderr": 0.0850962943396763, - "math_pass@1:4_samples": 0.25833333333333336, - "math_pass@1:4_samples_stderr": 0.060509490721976415, - "math_pass@1:8_samples": 0.23333333333333334, - "math_pass@1:8_samples_stderr": 0.056136806073810275, - "math_pass@1:16_samples": 0.23541666666666666, - "math_pass@1:16_samples_stderr": 0.05651544435100526, - "math_pass@1:32_samples": 0.23333333333333334, - "math_pass@1:32_samples_stderr": 0.05619676067142738, - "math_pass@1:64_samples": 0.225, - "math_pass@1:64_samples_stderr": 0.05496823468797815 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "35294f76776e888f" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "1a1b4593f16e83ad" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json deleted file mode 100644 index 80a1bf9b3b73d86ced1294112fed5eba73013a2b..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/gpqa/results_2025-05-14T11-07-53.758948.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1788957.695045191, - "end_time": 1790333.486523546, - "total_evaluation_time_secondes": "1375.7914783549495", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.41919191919191917, - "gpqa_pass@1:1_samples_stderr": 0.035155207286704175, - "gpqa_pass@1:4_samples": 0.38762626262626265, - "gpqa_pass@1:4_samples_stderr": 0.024117072654924694, - "gpqa_pass@1:8_samples": 0.3952020202020202, - "gpqa_pass@1:8_samples_stderr": 0.02069180007019445 - }, - "all": { - "gpqa_pass@1:1_samples": 0.41919191919191917, - "gpqa_pass@1:1_samples_stderr": 0.035155207286704175, - "gpqa_pass@1:4_samples": 0.38762626262626265, - "gpqa_pass@1:4_samples_stderr": 0.024117072654924694, - "gpqa_pass@1:8_samples": 0.3952020202020202, - "gpqa_pass@1:8_samples_stderr": 0.02069180007019445 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "ed5bee871908dfc5" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "9265af1b081a4052" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json deleted file mode 100644 index 272947ee0efaab97b1d218da65bb97bb822a5fb6..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.6/lcb_v4/results_2025-05-14T11-34-56.129181.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 142042.086601042, - "end_time": 144975.10104017, - "total_evaluation_time_secondes": "2933.01443912799", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.1219059405940594, - "codegen_pass@1:16_stderr": 0.02220405336387271 - }, - "all": { - "codegen_pass@1:16": 0.1219059405940594, - "codegen_pass@1:16_stderr": 0.02220405336387271 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "04c3944ff3ccf570" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "a7aea995096e87e1" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json deleted file mode 100644 index ad59d13ccda0e848dbc1e1ef2ec1e2c24fdab0fb..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/aime24/results_2025-05-14T11-20-42.747692.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 193540.0209108, - "end_time": 195561.734842471, - "total_evaluation_time_secondes": "2021.7139316709945", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.08753762190648169, - "math_pass@1:4_samples": 0.26666666666666666, - "math_pass@1:4_samples_stderr": 0.06447548466359415, - "math_pass@1:8_samples": 0.3, - "math_pass@1:8_samples_stderr": 0.06608945522512666, - "math_pass@1:16_samples": 0.30833333333333335, - "math_pass@1:16_samples_stderr": 0.06537451054006056, - "math_pass@1:32_samples": 0.3, - "math_pass@1:32_samples_stderr": 0.06204214478791862, - "math_pass@1:64_samples": 0.28958333333333336, - "math_pass@1:64_samples_stderr": 0.06107266238666994 - }, - "all": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.08753762190648169, - "math_pass@1:4_samples": 0.26666666666666666, - "math_pass@1:4_samples_stderr": 0.06447548466359415, - "math_pass@1:8_samples": 0.3, - "math_pass@1:8_samples_stderr": 0.06608945522512666, - "math_pass@1:16_samples": 0.30833333333333335, - "math_pass@1:16_samples_stderr": 0.06537451054006056, - "math_pass@1:32_samples": 0.3, - "math_pass@1:32_samples_stderr": 0.06204214478791862, - "math_pass@1:64_samples": 0.28958333333333336, - "math_pass@1:64_samples_stderr": 0.06107266238666994 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "9c99493d67fe2f24" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "bed1b477cfa0b1ff" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json deleted file mode 100644 index 169b8af2be2f775a726b8b3803a234898293623b..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/gpqa/results_2025-05-14T11-24-34.212351.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 666253.876774008, - "end_time": 668195.952495732, - "total_evaluation_time_secondes": "1942.0757217239588", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.03521224908841583, - "gpqa_pass@1:4_samples": 0.43308080808080807, - "gpqa_pass@1:4_samples_stderr": 0.024044484352700964, - "gpqa_pass@1:8_samples": 0.42803030303030304, - "gpqa_pass@1:8_samples_stderr": 0.020899215514001906 - }, - "all": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.03521224908841583, - "gpqa_pass@1:4_samples": 0.43308080808080807, - "gpqa_pass@1:4_samples_stderr": 0.024044484352700964, - "gpqa_pass@1:8_samples": 0.42803030303030304, - "gpqa_pass@1:8_samples_stderr": 0.020899215514001906 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "0124da80a3b3148c" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "640d6ef92615cdc9" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json deleted file mode 100644 index d87e4a8d81a95eeeb9dfc508768891111810df70..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.7/lcb_v4/results_2025-05-14T11-53-41.171704.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4103397.762373666, - "end_time": 4107093.315890551, - "total_evaluation_time_secondes": "3695.5535168852657", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.13242574257425743, - "codegen_pass@1:16_stderr": 0.023340448876772665 - }, - "all": { - "codegen_pass@1:16": 0.13242574257425743, - "codegen_pass@1:16_stderr": 0.023340448876772665 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "bbd21ba6aa3e9ebc" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "42b830bb8babc2e2" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json deleted file mode 100644 index 4c7d45e253ec6e21ac4adaf6464a4f0129818315..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/aime24/results_2025-05-14T11-45-10.614790.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3105542.202297136, - "end_time": 3108662.362517458, - "total_evaluation_time_secondes": "3120.1602203217335", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.23333333333333334, - "math_pass@1:1_samples_stderr": 0.07854032324531728, - "math_pass@1:4_samples": 0.2833333333333333, - "math_pass@1:4_samples_stderr": 0.06645079993694987, - "math_pass@1:8_samples": 0.325, - "math_pass@1:8_samples_stderr": 0.06581714481595306, - "math_pass@1:16_samples": 0.3375, - "math_pass@1:16_samples_stderr": 0.06596705464738117, - "math_pass@1:32_samples": 0.33333333333333326, - "math_pass@1:32_samples_stderr": 0.06514402515511274, - "math_pass@1:64_samples": 0.3302083333333333, - "math_pass@1:64_samples_stderr": 0.06416922063509661 - }, - "all": { - "math_pass@1:1_samples": 0.23333333333333334, - "math_pass@1:1_samples_stderr": 0.07854032324531728, - "math_pass@1:4_samples": 0.2833333333333333, - "math_pass@1:4_samples_stderr": 0.06645079993694987, - "math_pass@1:8_samples": 0.325, - "math_pass@1:8_samples_stderr": 0.06581714481595306, - "math_pass@1:16_samples": 0.3375, - "math_pass@1:16_samples_stderr": 0.06596705464738117, - "math_pass@1:32_samples": 0.33333333333333326, - "math_pass@1:32_samples_stderr": 0.06514402515511274, - "math_pass@1:64_samples": 0.3302083333333333, - "math_pass@1:64_samples_stderr": 0.06416922063509661 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "ad2c9153cc62d80c" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "39dba3b375f55f0a" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json deleted file mode 100644 index 945efa5104d850d5dae39bcfe677968c4fa0a816..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/gpqa/results_2025-05-14T11-45-03.177351.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1451567.197422397, - "end_time": 1454324.521386077, - "total_evaluation_time_secondes": "2757.323963679839", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.47474747474747475, - "gpqa_pass@1:1_samples_stderr": 0.03557806245087314, - "gpqa_pass@1:4_samples": 0.44065656565656564, - "gpqa_pass@1:4_samples_stderr": 0.02492886595604172, - "gpqa_pass@1:8_samples": 0.43813131313131315, - "gpqa_pass@1:8_samples_stderr": 0.022425998553299985 - }, - "all": { - "gpqa_pass@1:1_samples": 0.47474747474747475, - "gpqa_pass@1:1_samples_stderr": 0.03557806245087314, - "gpqa_pass@1:4_samples": 0.44065656565656564, - "gpqa_pass@1:4_samples_stderr": 0.02492886595604172, - "gpqa_pass@1:8_samples": 0.43813131313131315, - "gpqa_pass@1:8_samples_stderr": 0.022425998553299985 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "53e37624af23b9ca" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "a1cb53db5b9766ea" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json deleted file mode 100644 index 14524e841865df15fb6854975871c7263e269a0e..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.8/lcb_v4/results_2025-05-14T12-18-31.894813.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1732255.309611542, - "end_time": 1737024.368172801, - "total_evaluation_time_secondes": "4769.058561259182", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.1373762376237624, - "codegen_pass@1:16_stderr": 0.022884162024885638 - }, - "all": { - "codegen_pass@1:16": 0.1373762376237624, - "codegen_pass@1:16_stderr": 0.022884162024885638 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "27c2f3670bc0b1b6" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "989470b0bec8abc3" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json deleted file mode 100644 index 83f74b511e80909b07896bfd1328c052698b86e0..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/aime24/results_2025-05-14T12-28-41.332650.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 666848.134926491, - "end_time": 672038.016941095, - "total_evaluation_time_secondes": "5189.8820146040525", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.31666666666666665, - "math_pass@1:4_samples_stderr": 0.06447548466359414, - "math_pass@1:8_samples": 0.3416666666666667, - "math_pass@1:8_samples_stderr": 0.06369076392980136, - "math_pass@1:16_samples": 0.36666666666666664, - "math_pass@1:16_samples_stderr": 0.06269605266009247, - "math_pass@1:32_samples": 0.35625, - "math_pass@1:32_samples_stderr": 0.06415901581496421, - "math_pass@1:64_samples": 0.3536458333333333, - "math_pass@1:64_samples_stderr": 0.0623673310546343 - }, - "all": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.31666666666666665, - "math_pass@1:4_samples_stderr": 0.06447548466359414, - "math_pass@1:8_samples": 0.3416666666666667, - "math_pass@1:8_samples_stderr": 0.06369076392980136, - "math_pass@1:16_samples": 0.36666666666666664, - "math_pass@1:16_samples_stderr": 0.06269605266009247, - "math_pass@1:32_samples": 0.35625, - "math_pass@1:32_samples_stderr": 0.06415901581496421, - "math_pass@1:64_samples": 0.3536458333333333, - "math_pass@1:64_samples_stderr": 0.0623673310546343 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "db9f011765ce1405" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "99e409b11cfeb8fe" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json deleted file mode 100644 index 14f7e80acff497df95bf6ea2c0b4e9651cbe9d99..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/gpqa/results_2025-05-14T12-08-54.197133.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 2310674.72759561, - "end_time": 2314614.622140041, - "total_evaluation_time_secondes": "3939.8945444310084", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.4696969696969697, - "gpqa_pass@1:1_samples_stderr": 0.03555804051763929, - "gpqa_pass@1:4_samples": 0.4734848484848485, - "gpqa_pass@1:4_samples_stderr": 0.02495871351378464, - "gpqa_pass@1:8_samples": 0.47474747474747475, - "gpqa_pass@1:8_samples_stderr": 0.02386611685973234 - }, - "all": { - "gpqa_pass@1:1_samples": 0.4696969696969697, - "gpqa_pass@1:1_samples_stderr": 0.03555804051763929, - "gpqa_pass@1:4_samples": 0.4734848484848485, - "gpqa_pass@1:4_samples_stderr": 0.02495871351378464, - "gpqa_pass@1:8_samples": 0.47474747474747475, - "gpqa_pass@1:8_samples_stderr": 0.02386611685973234 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "e267619bde58d2f4" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "25cf5bcfcac50bfe" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json deleted file mode 100644 index b1f8530dd7366bdf7ba9aac6c50f8b9eab21ccf2..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-0.9/lcb_v4/results_2025-05-14T13-06-33.737278.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1790406.352589241, - "end_time": 1797453.289874955, - "total_evaluation_time_secondes": "7046.937285714084", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.12933168316831684, - "codegen_pass@1:16_stderr": 0.021057614597480553 - }, - "all": { - "codegen_pass@1:16": 0.12933168316831684, - "codegen_pass@1:16_stderr": 0.021057614597480553 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "5cb3b09dbd0d8421" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "3b6e90f800171033" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json deleted file mode 100644 index b0ba5b780016ef93a8e105632ca86f4f64a9e302..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/aime24/results_2025-05-14T12-51-56.018021.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 667672.719953287, - "end_time": 673240.688099726, - "total_evaluation_time_secondes": "5567.968146438943", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.4, - "math_pass@1:4_samples_stderr": 0.06297235299224026, - "math_pass@1:8_samples": 0.3958333333333333, - "math_pass@1:8_samples_stderr": 0.05874750341153335, - "math_pass@1:16_samples": 0.40208333333333335, - "math_pass@1:16_samples_stderr": 0.059457809263169785, - "math_pass@1:32_samples": 0.3968749999999999, - "math_pass@1:32_samples_stderr": 0.060234627158371676, - "math_pass@1:64_samples": 0.3989583333333334, - "math_pass@1:64_samples_stderr": 0.058255348631323874 - }, - "all": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.4, - "math_pass@1:4_samples_stderr": 0.06297235299224026, - "math_pass@1:8_samples": 0.3958333333333333, - "math_pass@1:8_samples_stderr": 0.05874750341153335, - "math_pass@1:16_samples": 0.40208333333333335, - "math_pass@1:16_samples_stderr": 0.059457809263169785, - "math_pass@1:32_samples": 0.3968749999999999, - "math_pass@1:32_samples_stderr": 0.060234627158371676, - "math_pass@1:64_samples": 0.3989583333333334, - "math_pass@1:64_samples_stderr": 0.058255348631323874 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "3c721c7a9344ad5e" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "316677ce6ea763a4" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json deleted file mode 100644 index bdb4c50fb6c3e224ebee51840f577bf4c407d297..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/gpqa/results_2025-05-14T12-30-50.790715.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1038687.737303237, - "end_time": 1042926.270903001, - "total_evaluation_time_secondes": "4238.533599763992", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.41919191919191917, - "gpqa_pass@1:1_samples_stderr": 0.035155207286704175, - "gpqa_pass@1:4_samples": 0.4911616161616162, - "gpqa_pass@1:4_samples_stderr": 0.026149437445442125, - "gpqa_pass@1:8_samples": 0.49747474747474746, - "gpqa_pass@1:8_samples_stderr": 0.02373146147193087 - }, - "all": { - "gpqa_pass@1:1_samples": 0.41919191919191917, - "gpqa_pass@1:1_samples_stderr": 0.035155207286704175, - "gpqa_pass@1:4_samples": 0.4911616161616162, - "gpqa_pass@1:4_samples_stderr": 0.026149437445442125, - "gpqa_pass@1:8_samples": 0.49747474747474746, - "gpqa_pass@1:8_samples_stderr": 0.02373146147193087 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "5f36388c28f23b2f" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "b4ff756ebb6cb76f" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json deleted file mode 100644 index 8940c03082a2fb0c316697e5786f0da5cc38ad26..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_task-arithmetic_weights-0.5-0.5_lambda-1.0/lcb_v4/results_2025-05-14T13-24-55.996930.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 195649.240633843, - "end_time": 203014.740776724, - "total_evaluation_time_secondes": "7365.5001428810065", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.12995049504950495, - "codegen_pass@1:16_stderr": 0.019894585442980322 - }, - "all": { - "codegen_pass@1:16": 0.12995049504950495, - "codegen_pass@1:16_stderr": 0.019894585442980322 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "058943720bd4b8b5" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "a230e53cef0867b7" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json deleted file mode 100644 index b4af3a24525e1d3d70c841ca6e7c600bed3e0297..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/aime24/results_2025-05-14T10-22-19.943965.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3515687.21682791, - "end_time": 3535561.065907141, - "total_evaluation_time_secondes": "19873.849079231266", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.26666666666666666, - "math_pass@1:1_samples_stderr": 0.0821175682735253, - "math_pass@1:4_samples": 0.3416666666666667, - "math_pass@1:4_samples_stderr": 0.07140154113622724, - "math_pass@1:8_samples": 0.32916666666666666, - "math_pass@1:8_samples_stderr": 0.05947668495839227, - "math_pass@1:16_samples": 0.3333333333333333, - "math_pass@1:16_samples_stderr": 0.06099725984675264, - "math_pass@1:32_samples": 0.3510416666666666, - "math_pass@1:32_samples_stderr": 0.06140717235446427, - "math_pass@1:64_samples": 0.3567708333333333, - "math_pass@1:64_samples_stderr": 0.06195924170213546 - }, - "all": { - "math_pass@1:1_samples": 0.26666666666666666, - "math_pass@1:1_samples_stderr": 0.0821175682735253, - "math_pass@1:4_samples": 0.3416666666666667, - "math_pass@1:4_samples_stderr": 0.07140154113622724, - "math_pass@1:8_samples": 0.32916666666666666, - "math_pass@1:8_samples_stderr": 0.05947668495839227, - "math_pass@1:16_samples": 0.3333333333333333, - "math_pass@1:16_samples_stderr": 0.06099725984675264, - "math_pass@1:32_samples": 0.3510416666666666, - "math_pass@1:32_samples_stderr": 0.06140717235446427, - "math_pass@1:64_samples": 0.3567708333333333, - "math_pass@1:64_samples_stderr": 0.06195924170213546 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "1d3092217357c073" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "2c87a9a122dd4f20" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json deleted file mode 100644 index b782d89bf6dd6a2fce5828e87ab4f37932d34686..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/gpqa/results_2025-05-14T09-20-04.118804.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3515687.21683147, - "end_time": 3531824.923517652, - "total_evaluation_time_secondes": "16137.706686181948", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.4797979797979798, - "gpqa_pass@1:1_samples_stderr": 0.035594435655639196, - "gpqa_pass@1:4_samples": 0.4684343434343434, - "gpqa_pass@1:4_samples_stderr": 0.026426494324412967, - "gpqa_pass@1:8_samples": 0.461489898989899, - "gpqa_pass@1:8_samples_stderr": 0.024035774175898385 - }, - "all": { - "gpqa_pass@1:1_samples": 0.4797979797979798, - "gpqa_pass@1:1_samples_stderr": 0.035594435655639196, - "gpqa_pass@1:4_samples": 0.4684343434343434, - "gpqa_pass@1:4_samples_stderr": 0.026426494324412967, - "gpqa_pass@1:8_samples": 0.461489898989899, - "gpqa_pass@1:8_samples_stderr": 0.024035774175898385 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "e7d8965b0240d400" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "17f24ae7e354a0f4" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json deleted file mode 100644 index bda62e01dac7de55f3e748a1e0865786a78a8716..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_ties_densities-0.2-0.2_lambda-1.0/lcb_v4/results_2025-05-14T10-16-21.504316.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3515687.21683316, - "end_time": 3535202.419424415, - "total_evaluation_time_secondes": "19515.202591255307", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.14913366336633663, - "codegen_pass@1:16_stderr": 0.022127176170976755 - }, - "all": { - "codegen_pass@1:16": 0.14913366336633663, - "codegen_pass@1:16_stderr": 0.022127176170976755 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "8c430f2397c9f2d6" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "737d8b05e5405367" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json deleted file mode 100644 index 6414e0e18936d2175ed5fe61d983e7c135df427c..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/aime24/results_2025-05-13T14-36-03.651861.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1459863.759389219, - "end_time": 1464656.175600588, - "total_evaluation_time_secondes": "4792.416211368982", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.08753762190648169, - "math_pass@1:4_samples": 0.375, - "math_pass@1:4_samples_stderr": 0.07266479487202247, - "math_pass@1:8_samples": 0.36666666666666664, - "math_pass@1:8_samples_stderr": 0.06752323284011273, - "math_pass@1:16_samples": 0.3458333333333333, - "math_pass@1:16_samples_stderr": 0.0651543628329568, - "math_pass@1:32_samples": 0.35625, - "math_pass@1:32_samples_stderr": 0.06644516903813058, - "math_pass@1:64_samples": 0.3447916666666667, - "math_pass@1:64_samples_stderr": 0.06668770995112147 - }, - "all": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.08753762190648169, - "math_pass@1:4_samples": 0.375, - "math_pass@1:4_samples_stderr": 0.07266479487202247, - "math_pass@1:8_samples": 0.36666666666666664, - "math_pass@1:8_samples_stderr": 0.06752323284011273, - "math_pass@1:16_samples": 0.3458333333333333, - "math_pass@1:16_samples_stderr": 0.0651543628329568, - "math_pass@1:32_samples": 0.35625, - "math_pass@1:32_samples_stderr": 0.06644516903813058, - "math_pass@1:64_samples": 0.3447916666666667, - "math_pass@1:64_samples_stderr": 0.06668770995112147 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "f1a35fbe9cb5d79d" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "9a1149edaf877e78" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json deleted file mode 100644 index ddea0335fcb306e38229469a219ca2362d263495..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/gpqa/results_2025-05-13T13-58-58.774126.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1459863.759393328, - "end_time": 1462431.22071959, - "total_evaluation_time_secondes": "2567.4613262619823", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.4494949494949495, - "gpqa_pass@1:1_samples_stderr": 0.0354413249194797, - "gpqa_pass@1:4_samples": 0.45580808080808083, - "gpqa_pass@1:4_samples_stderr": 0.025905050629807098, - "gpqa_pass@1:8_samples": 0.4659090909090909, - "gpqa_pass@1:8_samples_stderr": 0.023860690969531115 - }, - "all": { - "gpqa_pass@1:1_samples": 0.4494949494949495, - "gpqa_pass@1:1_samples_stderr": 0.0354413249194797, - "gpqa_pass@1:4_samples": 0.45580808080808083, - "gpqa_pass@1:4_samples_stderr": 0.025905050629807098, - "gpqa_pass@1:8_samples": 0.4659090909090909, - "gpqa_pass@1:8_samples_stderr": 0.023860690969531115 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "95e4a38cf316bba8" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "20add5806d35746d" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json deleted file mode 100644 index 5d36bb2fbd1639d20f5ffe6a647036b56f757fcf..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.1-0.9/lcb_v4/results_2025-05-13T15-09-04.153215.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4549324.508904906, - "end_time": 4556098.186270666, - "total_evaluation_time_secondes": "6773.677365760319", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.28898514851485146, - "codegen_pass@1:16_stderr": 0.031818486170004524 - }, - "all": { - "codegen_pass@1:16": 0.28898514851485146, - "codegen_pass@1:16_stderr": 0.031818486170004524 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "286a12cf9de34606" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "22a2678d6edd498b" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json deleted file mode 100644 index e348c97b9ce33353b4a957001006890b07195cac..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/aime24/results_2025-05-13T14-41-06.734989.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 2232253.173363435, - "end_time": 2237347.560457066, - "total_evaluation_time_secondes": "5094.387093631085", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.08753762190648172, - "math_pass@1:4_samples": 0.36666666666666664, - "math_pass@1:4_samples_stderr": 0.07459982640473675, - "math_pass@1:8_samples": 0.3625, - "math_pass@1:8_samples_stderr": 0.06842596048990117, - "math_pass@1:16_samples": 0.3729166666666667, - "math_pass@1:16_samples_stderr": 0.06849700926155665, - "math_pass@1:32_samples": 0.3625, - "math_pass@1:32_samples_stderr": 0.06721777418709032, - "math_pass@1:64_samples": 0.3661458333333334, - "math_pass@1:64_samples_stderr": 0.06797451270875941 - }, - "all": { - "math_pass@1:1_samples": 0.3333333333333333, - "math_pass@1:1_samples_stderr": 0.08753762190648172, - "math_pass@1:4_samples": 0.36666666666666664, - "math_pass@1:4_samples_stderr": 0.07459982640473675, - "math_pass@1:8_samples": 0.3625, - "math_pass@1:8_samples_stderr": 0.06842596048990117, - "math_pass@1:16_samples": 0.3729166666666667, - "math_pass@1:16_samples_stderr": 0.06849700926155665, - "math_pass@1:32_samples": 0.3625, - "math_pass@1:32_samples_stderr": 0.06721777418709032, - "math_pass@1:64_samples": 0.3661458333333334, - "math_pass@1:64_samples_stderr": 0.06797451270875941 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "414e787f40171ca1" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "ddef51b0406c3982" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json deleted file mode 100644 index 859a0b9aee295fe2b89d5b79037a943bf0634527..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/gpqa/results_2025-05-13T13-58-27.983347.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 2232253.173368685, - "end_time": 2234788.75330911, - "total_evaluation_time_secondes": "2535.579940425232", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.47474747474747475, - "gpqa_pass@1:1_samples_stderr": 0.035578062450873145, - "gpqa_pass@1:4_samples": 0.4671717171717172, - "gpqa_pass@1:4_samples_stderr": 0.02577398329081247, - "gpqa_pass@1:8_samples": 0.46275252525252525, - "gpqa_pass@1:8_samples_stderr": 0.023284154380105133 - }, - "all": { - "gpqa_pass@1:1_samples": 0.47474747474747475, - "gpqa_pass@1:1_samples_stderr": 0.035578062450873145, - "gpqa_pass@1:4_samples": 0.4671717171717172, - "gpqa_pass@1:4_samples_stderr": 0.02577398329081247, - "gpqa_pass@1:8_samples": 0.46275252525252525, - "gpqa_pass@1:8_samples_stderr": 0.023284154380105133 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "e1585b875c6c5ab7" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "931f077715c52375" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json deleted file mode 100644 index 4ace43f485df685bc2a7111ddb7ec279ffb6b280..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.2-0.8/lcb_v4/results_2025-05-13T15-14-37.971063.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 2232253.173378256, - "end_time": 2239358.756926505, - "total_evaluation_time_secondes": "7105.5835482487455", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.31064356435643564, - "codegen_pass@1:16_stderr": 0.03401113048493175 - }, - "all": { - "codegen_pass@1:16": 0.31064356435643564, - "codegen_pass@1:16_stderr": 0.03401113048493175 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "53e4514580daf13c" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "181ee4063ef7c525" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json deleted file mode 100644 index 4fe00b5c7b8a62da0d6cffe0f847dcec5e0897f3..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/aime24/results_2025-05-13T14-41-59.359219.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 45759.384835112, - "end_time": 50908.194217762, - "total_evaluation_time_secondes": "5148.809382649997", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.4083333333333333, - "math_pass@1:4_samples_stderr": 0.07904179504530925, - "math_pass@1:8_samples": 0.4, - "math_pass@1:8_samples_stderr": 0.0725163477407348, - "math_pass@1:16_samples": 0.4, - "math_pass@1:16_samples_stderr": 0.06999589478931119, - "math_pass@1:32_samples": 0.384375, - "math_pass@1:32_samples_stderr": 0.06687540283475192, - "math_pass@1:64_samples": 0.3895833333333334, - "math_pass@1:64_samples_stderr": 0.06801268906639514 - }, - "all": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.4083333333333333, - "math_pass@1:4_samples_stderr": 0.07904179504530925, - "math_pass@1:8_samples": 0.4, - "math_pass@1:8_samples_stderr": 0.0725163477407348, - "math_pass@1:16_samples": 0.4, - "math_pass@1:16_samples_stderr": 0.06999589478931119, - "math_pass@1:32_samples": 0.384375, - "math_pass@1:32_samples_stderr": 0.06687540283475192, - "math_pass@1:64_samples": 0.3895833333333334, - "math_pass@1:64_samples_stderr": 0.06801268906639514 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "6539c0aead2e735a" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "42ffa955b6687226" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json deleted file mode 100644 index d66eecda0242347282a2fca771bbddae3b93ad4e..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/gpqa/results_2025-05-13T13-57-04.900333.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 45759.384842682, - "end_time": 48213.678803292, - "total_evaluation_time_secondes": "2454.2939606100044", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.494949494949495, - "gpqa_pass@1:1_samples_stderr": 0.035621707606254015, - "gpqa_pass@1:4_samples": 0.48737373737373735, - "gpqa_pass@1:4_samples_stderr": 0.02574005112363341, - "gpqa_pass@1:8_samples": 0.4722222222222222, - "gpqa_pass@1:8_samples_stderr": 0.023598609724334477 - }, - "all": { - "gpqa_pass@1:1_samples": 0.494949494949495, - "gpqa_pass@1:1_samples_stderr": 0.035621707606254015, - "gpqa_pass@1:4_samples": 0.48737373737373735, - "gpqa_pass@1:4_samples_stderr": 0.02574005112363341, - "gpqa_pass@1:8_samples": 0.4722222222222222, - "gpqa_pass@1:8_samples_stderr": 0.023598609724334477 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "37b107e03a2fa405" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "6503080bbaae38cf" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json deleted file mode 100644 index af9bebdebcd2137bf154fcae123d050bf788b7fe..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.3-0.7/lcb_v4/results_2025-05-13T15-07-31.803299.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3757625.063178942, - "end_time": 3764306.98508252, - "total_evaluation_time_secondes": "6681.921903578099", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.3087871287128713, - "codegen_pass@1:16_stderr": 0.03410613555508327 - }, - "all": { - "codegen_pass@1:16": 0.3087871287128713, - "codegen_pass@1:16_stderr": 0.03410613555508327 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "fd6fcb680afd951f" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "4f8fb4bb608ae2ab" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/aime24/results_2025-05-13T14-39-59.230333.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/aime24/results_2025-05-13T14-39-59.230333.json deleted file mode 100644 index 586fd29480e030fbc327f379dc85ab0a49dd580a..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/aime24/results_2025-05-13T14-39-59.230333.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3799751.29225486, - "end_time": 3804778.284422315, - "total_evaluation_time_secondes": "5026.992167455144", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.4, - "math_pass@1:4_samples_stderr": 0.07050719373096687, - "math_pass@1:8_samples": 0.39166666666666666, - "math_pass@1:8_samples_stderr": 0.06800030989336643, - "math_pass@1:16_samples": 0.38958333333333334, - "math_pass@1:16_samples_stderr": 0.06711973382383435, - "math_pass@1:32_samples": 0.403125, - "math_pass@1:32_samples_stderr": 0.06704638850234139, - "math_pass@1:64_samples": 0.4041666666666667, - "math_pass@1:64_samples_stderr": 0.06478723143336784 - }, - "all": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.4, - "math_pass@1:4_samples_stderr": 0.07050719373096687, - "math_pass@1:8_samples": 0.39166666666666666, - "math_pass@1:8_samples_stderr": 0.06800030989336643, - "math_pass@1:16_samples": 0.38958333333333334, - "math_pass@1:16_samples_stderr": 0.06711973382383435, - "math_pass@1:32_samples": 0.403125, - "math_pass@1:32_samples_stderr": 0.06704638850234139, - "math_pass@1:64_samples": 0.4041666666666667, - "math_pass@1:64_samples_stderr": 0.06478723143336784 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "2c40520fc01c01e1" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "672c3fa6fb618daf" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/gpqa/results_2025-05-13T13-57-02.293635.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/gpqa/results_2025-05-13T13-57-02.293635.json deleted file mode 100644 index 9270de3af693a3d329d20652615b5bb50b4b0ab7..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/gpqa/results_2025-05-13T13-57-02.293635.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3799751.29224775, - "end_time": 3802201.300240822, - "total_evaluation_time_secondes": "2450.0079930722713", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.48484848484848486, - "gpqa_pass@1:1_samples_stderr": 0.03560716516531061, - "gpqa_pass@1:4_samples": 0.4911616161616162, - "gpqa_pass@1:4_samples_stderr": 0.026874685131644954, - "gpqa_pass@1:8_samples": 0.47664141414141414, - "gpqa_pass@1:8_samples_stderr": 0.025380426813414573 - }, - "all": { - "gpqa_pass@1:1_samples": 0.48484848484848486, - "gpqa_pass@1:1_samples_stderr": 0.03560716516531061, - "gpqa_pass@1:4_samples": 0.4911616161616162, - "gpqa_pass@1:4_samples_stderr": 0.026874685131644954, - "gpqa_pass@1:8_samples": 0.47664141414141414, - "gpqa_pass@1:8_samples_stderr": 0.025380426813414573 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "ffd6f39ab2d65bf6" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "ab2ac6951ad6978c" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/lcb_v4/results_2025-05-13T15-00-48.328798.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/lcb_v4/results_2025-05-13T15-00-48.328798.json deleted file mode 100644 index dc1e6d6973443de4f05210379377d7907bcb6ea5..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.4-0.6/lcb_v4/results_2025-05-13T15-00-48.328798.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3799751.29225413, - "end_time": 3806027.297762342, - "total_evaluation_time_secondes": "6276.005508211907", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.28155940594059403, - "codegen_pass@1:16_stderr": 0.03287982264226147 - }, - "all": { - "codegen_pass@1:16": 0.28155940594059403, - "codegen_pass@1:16_stderr": 0.03287982264226147 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "8feeeec45beeebab" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "d135a42acf59f7e7" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/aime24/results_2025-05-12T10-46-04.540285.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/aime24/results_2025-05-12T10-46-04.540285.json deleted file mode 100644 index b43a1caa50320f76561353f9e1dfeb4f1e5c2bf6..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/aime24/results_2025-05-12T10-46-04.540285.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4463838.714018859, - "end_time": 4468756.585755507, - "total_evaluation_time_secondes": "4917.8717366484925", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.4166666666666667, - "math_pass@1:4_samples_stderr": 0.07890534607899988, - "math_pass@1:8_samples": 0.42083333333333334, - "math_pass@1:8_samples_stderr": 0.06872278283965262, - "math_pass@1:16_samples": 0.425, - "math_pass@1:16_samples_stderr": 0.06907901938751179, - "math_pass@1:32_samples": 0.42083333333333334, - "math_pass@1:32_samples_stderr": 0.06661613566285078, - "math_pass@1:64_samples": 0.4234375, - "math_pass@1:64_samples_stderr": 0.06635197875279168 - }, - "all": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.4166666666666667, - "math_pass@1:4_samples_stderr": 0.07890534607899988, - "math_pass@1:8_samples": 0.42083333333333334, - "math_pass@1:8_samples_stderr": 0.06872278283965262, - "math_pass@1:16_samples": 0.425, - "math_pass@1:16_samples_stderr": 0.06907901938751179, - "math_pass@1:32_samples": 0.42083333333333334, - "math_pass@1:32_samples_stderr": 0.06661613566285078, - "math_pass@1:64_samples": 0.4234375, - "math_pass@1:64_samples_stderr": 0.06635197875279168 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "5c085603b5afe559" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "938d735a1cc90b1d" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/aime24/results_2025-05-13T14-38-23.245357.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/aime24/results_2025-05-13T14-38-23.245357.json deleted file mode 100644 index c6ef0246db39ff32db2624d17c83981c59ce4b6a..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/aime24/results_2025-05-13T14-38-23.245357.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 3799803.384129785, - "end_time": 3804682.29999853, - "total_evaluation_time_secondes": "4878.915868745185", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.4166666666666667, - "math_pass@1:4_samples_stderr": 0.07890534607899988, - "math_pass@1:8_samples": 0.42083333333333334, - "math_pass@1:8_samples_stderr": 0.06872278283965262, - "math_pass@1:16_samples": 0.425, - "math_pass@1:16_samples_stderr": 0.06907901938751179, - "math_pass@1:32_samples": 0.42083333333333334, - "math_pass@1:32_samples_stderr": 0.06661613566285078, - "math_pass@1:64_samples": 0.4234375, - "math_pass@1:64_samples_stderr": 0.06635197875279168 - }, - "all": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.4166666666666667, - "math_pass@1:4_samples_stderr": 0.07890534607899988, - "math_pass@1:8_samples": 0.42083333333333334, - "math_pass@1:8_samples_stderr": 0.06872278283965262, - "math_pass@1:16_samples": 0.425, - "math_pass@1:16_samples_stderr": 0.06907901938751179, - "math_pass@1:32_samples": 0.42083333333333334, - "math_pass@1:32_samples_stderr": 0.06661613566285078, - "math_pass@1:64_samples": 0.4234375, - "math_pass@1:64_samples_stderr": 0.06635197875279168 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "5c085603b5afe559" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "938d735a1cc90b1d" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/gpqa/results_2025-05-12T10-02-22.317488.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/gpqa/results_2025-05-12T10-02-22.317488.json deleted file mode 100644 index 2c2202fd3b1157d99783400664c4077e66d9a6a4..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/gpqa/results_2025-05-12T10-02-22.317488.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 488350.903281109, - "end_time": 490640.929643653, - "total_evaluation_time_secondes": "2290.026362543984", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.47474747474747475, - "gpqa_pass@1:1_samples_stderr": 0.03557806245087314, - "gpqa_pass@1:4_samples": 0.4861111111111111, - "gpqa_pass@1:4_samples_stderr": 0.02791683136210848, - "gpqa_pass@1:8_samples": 0.4930555555555556, - "gpqa_pass@1:8_samples_stderr": 0.026236447677077328 - }, - "all": { - "gpqa_pass@1:1_samples": 0.47474747474747475, - "gpqa_pass@1:1_samples_stderr": 0.03557806245087314, - "gpqa_pass@1:4_samples": 0.4861111111111111, - "gpqa_pass@1:4_samples_stderr": 0.02791683136210848, - "gpqa_pass@1:8_samples": 0.4930555555555556, - "gpqa_pass@1:8_samples_stderr": 0.026236447677077328 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "b7b92997c8ffd571" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "b51c44e1ced24613" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/gpqa/results_2025-05-13T13-55-35.083431.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/gpqa/results_2025-05-13T13-55-35.083431.json deleted file mode 100644 index a62c57de56fd33600122e84f743346745a9fcf91..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/gpqa/results_2025-05-13T13-55-35.083431.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4567329.908377136, - "end_time": 4569629.97469897, - "total_evaluation_time_secondes": "2300.0663218339905", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.494949494949495, - "gpqa_pass@1:1_samples_stderr": 0.035621707606254015, - "gpqa_pass@1:4_samples": 0.48737373737373735, - "gpqa_pass@1:4_samples_stderr": 0.027544314973823158, - "gpqa_pass@1:8_samples": 0.4842171717171717, - "gpqa_pass@1:8_samples_stderr": 0.025986807193674563 - }, - "all": { - "gpqa_pass@1:1_samples": 0.494949494949495, - "gpqa_pass@1:1_samples_stderr": 0.035621707606254015, - "gpqa_pass@1:4_samples": 0.48737373737373735, - "gpqa_pass@1:4_samples_stderr": 0.027544314973823158, - "gpqa_pass@1:8_samples": 0.4842171717171717, - "gpqa_pass@1:8_samples_stderr": 0.025986807193674563 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "8fb4097147d61693" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "56b1870998d1a907" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/lcb_v4/results_2025-05-12T10-53-26.461126.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/lcb_v4/results_2025-05-12T10-53-26.461126.json deleted file mode 100644 index 77964b185dd2406876013cc41bcd943a9386c971..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/lcb_v4/results_2025-05-12T10-53-26.461126.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 488350.903285729, - "end_time": 493705.110562375, - "total_evaluation_time_secondes": "5354.20727664599", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.21349009900990099, - "codegen_pass@1:16_stderr": 0.029265811483572105 - }, - "all": { - "codegen_pass@1:16": 0.21349009900990099, - "codegen_pass@1:16_stderr": 0.029265811483572105 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "f13f5961bd062204" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "6258c81f7d37dc2a" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/lcb_v4/results_2025-05-13T14-43-51.711312.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/lcb_v4/results_2025-05-13T14-43-51.711312.json deleted file mode 100644 index 27b72e5100043a8d4c5f7b9b28b13ab6587a9189..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.5-0.5/lcb_v4/results_2025-05-13T14-43-51.711312.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4567329.908364696, - "end_time": 4572526.621674105, - "total_evaluation_time_secondes": "5196.7133094081655", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.21349009900990099, - "codegen_pass@1:16_stderr": 0.029265811483572105 - }, - "all": { - "codegen_pass@1:16": 0.21349009900990099, - "codegen_pass@1:16_stderr": 0.029265811483572105 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "f13f5961bd062204" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "6258c81f7d37dc2a" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/aime24/results_2025-05-13T14-40-09.968229.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/aime24/results_2025-05-13T14-40-09.968229.json deleted file mode 100644 index 1f16be17c5ef1f4dfabafb5f5559108d3c3a0712..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/aime24/results_2025-05-13T14-40-09.968229.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4567329.908369516, - "end_time": 4572304.903238916, - "total_evaluation_time_secondes": "4974.994869399816", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.475, - "math_pass@1:4_samples_stderr": 0.07422976135799149, - "math_pass@1:8_samples": 0.4583333333333333, - "math_pass@1:8_samples_stderr": 0.0724998348525005, - "math_pass@1:16_samples": 0.46041666666666664, - "math_pass@1:16_samples_stderr": 0.06816847403668663, - "math_pass@1:32_samples": 0.46145833333333336, - "math_pass@1:32_samples_stderr": 0.06592081204179029, - "math_pass@1:64_samples": 0.46718750000000003, - "math_pass@1:64_samples_stderr": 0.06570598280495424 - }, - "all": { - "math_pass@1:1_samples": 0.43333333333333335, - "math_pass@1:1_samples_stderr": 0.0920186554465537, - "math_pass@1:4_samples": 0.475, - "math_pass@1:4_samples_stderr": 0.07422976135799149, - "math_pass@1:8_samples": 0.4583333333333333, - "math_pass@1:8_samples_stderr": 0.0724998348525005, - "math_pass@1:16_samples": 0.46041666666666664, - "math_pass@1:16_samples_stderr": 0.06816847403668663, - "math_pass@1:32_samples": 0.46145833333333336, - "math_pass@1:32_samples_stderr": 0.06592081204179029, - "math_pass@1:64_samples": 0.46718750000000003, - "math_pass@1:64_samples_stderr": 0.06570598280495424 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "ac3119e2148fc694" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "fd245f19cb5c567a" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/gpqa/results_2025-05-13T13-55-40.393198.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/gpqa/results_2025-05-13T13-55-40.393198.json deleted file mode 100644 index 6c7acd9372381e2cb34e3c535a38bf1d27f565f1..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/gpqa/results_2025-05-13T13-55-40.393198.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4567329.908372706, - "end_time": 4569635.280799379, - "total_evaluation_time_secondes": "2305.3724266728386", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.48484848484848486, - "gpqa_pass@1:1_samples_stderr": 0.0356071651653106, - "gpqa_pass@1:4_samples": 0.4734848484848485, - "gpqa_pass@1:4_samples_stderr": 0.026695826741729375, - "gpqa_pass@1:8_samples": 0.4772727272727273, - "gpqa_pass@1:8_samples_stderr": 0.024784476971320545 - }, - "all": { - "gpqa_pass@1:1_samples": 0.48484848484848486, - "gpqa_pass@1:1_samples_stderr": 0.0356071651653106, - "gpqa_pass@1:4_samples": 0.4734848484848485, - "gpqa_pass@1:4_samples_stderr": 0.026695826741729375, - "gpqa_pass@1:8_samples": 0.4772727272727273, - "gpqa_pass@1:8_samples_stderr": 0.024784476971320545 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "31a7b3a06fd101be" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "d2544b3f33c9e84e" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/lcb_v4/results_2025-05-13T14-29-16.634138.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/lcb_v4/results_2025-05-13T14-29-16.634138.json deleted file mode 100644 index a986b399e91648b176e01c0778cd71d6f08e225f..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.6-0.4/lcb_v4/results_2025-05-13T14-29-16.634138.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 120785.490184567, - "end_time": 125107.235486563, - "total_evaluation_time_secondes": "4321.7453019960085", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.18502475247524752, - "codegen_pass@1:16_stderr": 0.02730580612852504 - }, - "all": { - "codegen_pass@1:16": 0.18502475247524752, - "codegen_pass@1:16_stderr": 0.02730580612852504 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "633f2326486641c3" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "335e23470cd180ec" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/aime24/results_2025-05-13T14-42-41.920156.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/aime24/results_2025-05-13T14-42-41.920156.json deleted file mode 100644 index 6ba7b0a4a63fdd4b173dd94473be5c082d0af9d8..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/aime24/results_2025-05-13T14-42-41.920156.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 120785.490179897, - "end_time": 125912.547007965, - "total_evaluation_time_secondes": "5127.056828068002", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.5333333333333333, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.49166666666666664, - "math_pass@1:4_samples_stderr": 0.07338621395376267, - "math_pass@1:8_samples": 0.4666666666666667, - "math_pass@1:8_samples_stderr": 0.06936010203848658, - "math_pass@1:16_samples": 0.475, - "math_pass@1:16_samples_stderr": 0.0666442131728276, - "math_pass@1:32_samples": 0.48020833333333335, - "math_pass@1:32_samples_stderr": 0.06613840860221049, - "math_pass@1:64_samples": 0.48541666666666666, - "math_pass@1:64_samples_stderr": 0.06600674636076301 - }, - "all": { - "math_pass@1:1_samples": 0.5333333333333333, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.49166666666666664, - "math_pass@1:4_samples_stderr": 0.07338621395376267, - "math_pass@1:8_samples": 0.4666666666666667, - "math_pass@1:8_samples_stderr": 0.06936010203848658, - "math_pass@1:16_samples": 0.475, - "math_pass@1:16_samples_stderr": 0.0666442131728276, - "math_pass@1:32_samples": 0.48020833333333335, - "math_pass@1:32_samples_stderr": 0.06613840860221049, - "math_pass@1:64_samples": 0.48541666666666666, - "math_pass@1:64_samples_stderr": 0.06600674636076301 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "5b65bb01be38812f" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "d97a7d52daf08627" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/gpqa/results_2025-05-13T13-53-10.911681.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/gpqa/results_2025-05-13T13-53-10.911681.json deleted file mode 100644 index cd90084cdeaf72f6912d8c27887048e9cc92428c..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/gpqa/results_2025-05-13T13-53-10.911681.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 120785.490182207, - "end_time": 122941.498510333, - "total_evaluation_time_secondes": "2156.0083281260013", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.46464646464646464, - "gpqa_pass@1:1_samples_stderr": 0.035534363688280626, - "gpqa_pass@1:4_samples": 0.4583333333333333, - "gpqa_pass@1:4_samples_stderr": 0.02701584096842624, - "gpqa_pass@1:8_samples": 0.4671717171717172, - "gpqa_pass@1:8_samples_stderr": 0.025255889895316533 - }, - "all": { - "gpqa_pass@1:1_samples": 0.46464646464646464, - "gpqa_pass@1:1_samples_stderr": 0.035534363688280626, - "gpqa_pass@1:4_samples": 0.4583333333333333, - "gpqa_pass@1:4_samples_stderr": 0.02701584096842624, - "gpqa_pass@1:8_samples": 0.4671717171717172, - "gpqa_pass@1:8_samples_stderr": 0.025255889895316533 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "7df0324fb9eaf9b8" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "f099fa3265bcfeb6" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/lcb_v4/results_2025-05-13T14-30-16.731482.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/lcb_v4/results_2025-05-13T14-30-16.731482.json deleted file mode 100644 index fc99e232b10c929c8682d00b7db24ad9259eac0c..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.7-0.3/lcb_v4/results_2025-05-13T14-30-16.731482.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 120785.490174427, - "end_time": 125167.352275467, - "total_evaluation_time_secondes": "4381.862101039995", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.17264851485148514, - "codegen_pass@1:16_stderr": 0.029501170932084736 - }, - "all": { - "codegen_pass@1:16": 0.17264851485148514, - "codegen_pass@1:16_stderr": 0.029501170932084736 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "072100132891f2f3" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "d61e4631446db0c9" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/aime24/results_2025-05-13T14-39-56.092523.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/aime24/results_2025-05-13T14-39-56.092523.json deleted file mode 100644 index a110e68b835b54ff37d694ae5e602f378254c4b6..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/aime24/results_2025-05-13T14-39-56.092523.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 588549.414868867, - "end_time": 593512.831112477, - "total_evaluation_time_secondes": "4963.416243609972", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.45, - "math_pass@1:4_samples_stderr": 0.07131764786188426, - "math_pass@1:8_samples": 0.4375, - "math_pass@1:8_samples_stderr": 0.06853086811837036, - "math_pass@1:16_samples": 0.4708333333333333, - "math_pass@1:16_samples_stderr": 0.06678778509239465, - "math_pass@1:32_samples": 0.484375, - "math_pass@1:32_samples_stderr": 0.06670061321427952, - "math_pass@1:64_samples": 0.4885416666666667, - "math_pass@1:64_samples_stderr": 0.06598123300028236 - }, - "all": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.45, - "math_pass@1:4_samples_stderr": 0.07131764786188426, - "math_pass@1:8_samples": 0.4375, - "math_pass@1:8_samples_stderr": 0.06853086811837036, - "math_pass@1:16_samples": 0.4708333333333333, - "math_pass@1:16_samples_stderr": 0.06678778509239465, - "math_pass@1:32_samples": 0.484375, - "math_pass@1:32_samples_stderr": 0.06670061321427952, - "math_pass@1:64_samples": 0.4885416666666667, - "math_pass@1:64_samples_stderr": 0.06598123300028236 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "4e06b0fbc1428b90" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "28dcec6effdf0497" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/gpqa/results_2025-05-13T13-58-12.428307.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/gpqa/results_2025-05-13T13-58-12.428307.json deleted file mode 100644 index 9e2a4f8e84629d3219590328d27993036747482d..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/gpqa/results_2025-05-13T13-58-12.428307.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 588549.414879617, - "end_time": 591009.09549895, - "total_evaluation_time_secondes": "2459.6806193329394", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.035212249088415824, - "gpqa_pass@1:4_samples": 0.4684343434343434, - "gpqa_pass@1:4_samples_stderr": 0.02593689497228966, - "gpqa_pass@1:8_samples": 0.4621212121212121, - "gpqa_pass@1:8_samples_stderr": 0.024868087139102255 - }, - "all": { - "gpqa_pass@1:1_samples": 0.42424242424242425, - "gpqa_pass@1:1_samples_stderr": 0.035212249088415824, - "gpqa_pass@1:4_samples": 0.4684343434343434, - "gpqa_pass@1:4_samples_stderr": 0.02593689497228966, - "gpqa_pass@1:8_samples": 0.4621212121212121, - "gpqa_pass@1:8_samples_stderr": 0.024868087139102255 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "6006e95295dd82be" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "b5a3da19fe9c8b50" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/lcb_v4/results_2025-05-13T14-16-08.367299.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/lcb_v4/results_2025-05-13T14-16-08.367299.json deleted file mode 100644 index 14fc49c1cc9c845ad0a259c76c508caaa91151af..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.8-0.2/lcb_v4/results_2025-05-13T14-16-08.367299.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 588549.414876887, - "end_time": 592085.098788629, - "total_evaluation_time_secondes": "3535.6839117420604", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.14603960396039603, - "codegen_pass@1:16_stderr": 0.025494448709637404 - }, - "all": { - "codegen_pass@1:16": 0.14603960396039603, - "codegen_pass@1:16_stderr": 0.025494448709637404 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "4c17d281cd76cd06" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "a6093eee379697d3" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/aime24/results_2025-05-13T14-41-47.702610.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/aime24/results_2025-05-13T14-41-47.702610.json deleted file mode 100644 index 1d4c976cf0fd00c6ac54581a0030b203c62deef1..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/aime24/results_2025-05-13T14-41-47.702610.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 588602.398473164, - "end_time": 593624.440975387, - "total_evaluation_time_secondes": "5022.042502223048", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.49166666666666664, - "math_pass@1:4_samples_stderr": 0.07531860806139815, - "math_pass@1:8_samples": 0.5125, - "math_pass@1:8_samples_stderr": 0.07074876628630695, - "math_pass@1:16_samples": 0.51875, - "math_pass@1:16_samples_stderr": 0.06715094401130332, - "math_pass@1:32_samples": 0.503125, - "math_pass@1:32_samples_stderr": 0.06514431233498211, - "math_pass@1:64_samples": 0.5036458333333333, - "math_pass@1:64_samples_stderr": 0.06574156371605695 - }, - "all": { - "math_pass@1:1_samples": 0.4666666666666667, - "math_pass@1:1_samples_stderr": 0.09264111117062017, - "math_pass@1:4_samples": 0.49166666666666664, - "math_pass@1:4_samples_stderr": 0.07531860806139815, - "math_pass@1:8_samples": 0.5125, - "math_pass@1:8_samples_stderr": 0.07074876628630695, - "math_pass@1:16_samples": 0.51875, - "math_pass@1:16_samples_stderr": 0.06715094401130332, - "math_pass@1:32_samples": 0.503125, - "math_pass@1:32_samples_stderr": 0.06514431233498211, - "math_pass@1:64_samples": 0.5036458333333333, - "math_pass@1:64_samples_stderr": 0.06574156371605695 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "a3c54f30a37d3fe6" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "68816fc3389112d1" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/gpqa/results_2025-05-13T13-57-20.752396.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/gpqa/results_2025-05-13T13-57-20.752396.json deleted file mode 100644 index a6f7efe295a285f0087d99ee81dec030cee07956..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/gpqa/results_2025-05-13T13-57-20.752396.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 588856.198575486, - "end_time": 591139.358761718, - "total_evaluation_time_secondes": "2283.1601862320676", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.5, - "gpqa_pass@1:1_samples_stderr": 0.035623524993954825, - "gpqa_pass@1:4_samples": 0.48484848484848486, - "gpqa_pass@1:4_samples_stderr": 0.02579532508916213, - "gpqa_pass@1:8_samples": 0.4715909090909091, - "gpqa_pass@1:8_samples_stderr": 0.024289141865092557 - }, - "all": { - "gpqa_pass@1:1_samples": 0.5, - "gpqa_pass@1:1_samples_stderr": 0.035623524993954825, - "gpqa_pass@1:4_samples": 0.48484848484848486, - "gpqa_pass@1:4_samples_stderr": 0.02579532508916213, - "gpqa_pass@1:8_samples": 0.4715909090909091, - "gpqa_pass@1:8_samples_stderr": 0.024289141865092557 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "dca447268752d598" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "7e7b3263ead6229f" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/lcb_v4/results_2025-05-13T13-58-49.253957.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/lcb_v4/results_2025-05-13T13-58-49.253957.json deleted file mode 100644 index 3a8c6095b99c7ddf2e18898e337d61f82ac16955..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v00.02-step-000003660_v01.02-step-000002600_weights-0.9-0.1/lcb_v4/results_2025-05-13T13-58-49.253957.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 588856.198585426, - "end_time": 591227.936264503, - "total_evaluation_time_secondes": "2371.7376790770795", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.10396039603960396, - "codegen_pass@1:16_stderr": 0.02145823330086779 - }, - "all": { - "codegen_pass@1:16": 0.10396039603960396, - "codegen_pass@1:16_stderr": 0.02145823330086779 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "ac2f343445a797fa" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "06587829e450fd2e" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/aime24/results_2025-05-14T08-58-38.097482.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/aime24/results_2025-05-14T08-58-38.097482.json deleted file mode 100644 index 4556569981023a23d30e449acf3f3190fa44f6e2..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/aime24/results_2025-05-14T08-58-38.097482.json +++ /dev/null @@ -1,174 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4603073.045028018, - "end_time": 4607398.072975843, - "total_evaluation_time_secondes": "4325.02794782538", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|aime24|0": { - "math_pass@1:1_samples": 0.26666666666666666, - "math_pass@1:1_samples_stderr": 0.0821175682735253, - "math_pass@1:4_samples": 0.30833333333333335, - "math_pass@1:4_samples_stderr": 0.07259885584313157, - "math_pass@1:8_samples": 0.30416666666666664, - "math_pass@1:8_samples_stderr": 0.06851339466879558, - "math_pass@1:16_samples": 0.30625, - "math_pass@1:16_samples_stderr": 0.06281172263200556, - "math_pass@1:32_samples": 0.3187499999999999, - "math_pass@1:32_samples_stderr": 0.06293669278608922, - "math_pass@1:64_samples": 0.3223958333333334, - "math_pass@1:64_samples_stderr": 0.06264567852195435 - }, - "all": { - "math_pass@1:1_samples": 0.26666666666666666, - "math_pass@1:1_samples_stderr": 0.0821175682735253, - "math_pass@1:4_samples": 0.30833333333333335, - "math_pass@1:4_samples_stderr": 0.07259885584313157, - "math_pass@1:8_samples": 0.30416666666666664, - "math_pass@1:8_samples_stderr": 0.06851339466879558, - "math_pass@1:16_samples": 0.30625, - "math_pass@1:16_samples_stderr": 0.06281172263200556, - "math_pass@1:32_samples": 0.3187499999999999, - "math_pass@1:32_samples_stderr": 0.06293669278608922, - "math_pass@1:64_samples": 0.3223958333333334, - "math_pass@1:64_samples_stderr": 0.06264567852195435 - } - }, - "versions": { - "lighteval|aime24|0": 2 - }, - "config_tasks": { - "lighteval|aime24": { - "name": "aime24", - "prompt_function": "aime_prompt_fn", - "hf_repo": "HuggingFaceH4/aime_2024", - "hf_subset": "default", - "metric": [ - { - "metric_name": "math_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:16_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:32_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "math_pass@1:64_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": false, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 30, - "effective_num_docs": 30, - "must_remove_duplicate_docs": false, - "version": 2 - } - }, - "summary_tasks": { - "lighteval|aime24|0": { - "hashes": { - "hash_examples": "ddec8fc79d0a014b", - "hash_full_prompts": "d1829811f23cf34b", - "hash_input_tokens": "7211f832bf7f8d79", - "hash_cont_tokens": "4934dfa735f6b3d0" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c903e836a519cf98", - "hash_full_prompts": "09fe8694776a7143", - "hash_input_tokens": "b52bc353fe82900e", - "hash_cont_tokens": "44342b40dc5e2187" - }, - "truncated": 0, - "non_truncated": 30, - "padded": 0, - "non_padded": 30, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/gpqa/results_2025-05-14T08-24-23.198465.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/gpqa/results_2025-05-14T08-24-23.198465.json deleted file mode 100644 index 7bc5eae0b371e817e81c90feb63bc4b4aa5f00ba..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/gpqa/results_2025-05-14T08-24-23.198465.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 4603073.045036878, - "end_time": 4605343.111123389, - "total_evaluation_time_secondes": "2270.0660865101963", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "lighteval|gpqa:diamond|0": { - "gpqa_pass@1:1_samples": 0.46464646464646464, - "gpqa_pass@1:1_samples_stderr": 0.035534363688280626, - "gpqa_pass@1:4_samples": 0.4936868686868687, - "gpqa_pass@1:4_samples_stderr": 0.02740957201231417, - "gpqa_pass@1:8_samples": 0.5031565656565656, - "gpqa_pass@1:8_samples_stderr": 0.026179013958548682 - }, - "all": { - "gpqa_pass@1:1_samples": 0.46464646464646464, - "gpqa_pass@1:1_samples_stderr": 0.035534363688280626, - "gpqa_pass@1:4_samples": 0.4936868686868687, - "gpqa_pass@1:4_samples_stderr": 0.02740957201231417, - "gpqa_pass@1:8_samples": 0.5031565656565656, - "gpqa_pass@1:8_samples_stderr": 0.026179013958548682 - } - }, - "versions": { - "lighteval|gpqa:diamond|0": 1 - }, - "config_tasks": { - "lighteval|gpqa:diamond": { - "name": "gpqa:diamond", - "prompt_function": "gpqa_instruct", - "hf_repo": "Idavidrein/gpqa", - "hf_subset": "gpqa_diamond", - "metric": [ - { - "metric_name": "gpqa_pass@1:1_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:4_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - }, - { - "metric_name": "gpqa_pass@1:8_samples", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "compute", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "train" - ], - "trust_dataset": true, - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "lighteval" - ], - "original_num_docs": 198, - "effective_num_docs": 198, - "must_remove_duplicate_docs": false, - "version": 1 - } - }, - "summary_tasks": { - "lighteval|gpqa:diamond|0": { - "hashes": { - "hash_examples": "50ecb6f5d091bd95", - "hash_full_prompts": "390ed6852fd81cbb", - "hash_input_tokens": "fa33490cb78df6ed", - "hash_cont_tokens": "56c550539537c32c" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "a9318dbdd867770b", - "hash_full_prompts": "d039833a1bb27d62", - "hash_input_tokens": "c464503d38a8a664", - "hash_cont_tokens": "dd5e19b46c7bd0c6" - }, - "truncated": 0, - "non_truncated": 198, - "padded": 0, - "non_padded": 198, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/lcb_v4/results_2025-05-14T09-29-02.154876.json b/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/lcb_v4/results_2025-05-14T09-29-02.154876.json deleted file mode 100644 index 9d974a8e300eba0cdd16f28e047979238a2bcdc7..0000000000000000000000000000000000000000 --- a/eval_results/open-r1/R1-Distill-Qwen-Math-7B-Merges/v01.02-step-000002600_v02.02-step-000004041_weights-0.5-0.5/lcb_v4/results_2025-05-14T09-29-02.154876.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "max_samples": null, - "job_id": 0, - "start_time": 1526482.383376942, - "end_time": 1532634.61994024, - "total_evaluation_time_secondes": "6152.23656329792", - "model_name": "open-r1/R1-Distill-Qwen-Math-7B-Merges", - "model_sha": "", - "model_dtype": null, - "model_size": null, - "generation_parameters": { - "early_stopping": null, - "repetition_penalty": null, - "frequency_penalty": null, - "length_penalty": null, - "presence_penalty": null, - "max_new_tokens": 32768, - "min_new_tokens": null, - "seed": null, - "stop_tokens": null, - "temperature": 0.6, - "top_k": null, - "min_p": null, - "top_p": 0.95, - "truncate_prompt": null, - "response_format": null - } - }, - "results": { - "extended|lcb:codegeneration_v4|0": { - "codegen_pass@1:16": 0.24814356435643564, - "codegen_pass@1:16_stderr": 0.034147315781491154 - }, - "all": { - "codegen_pass@1:16": 0.24814356435643564, - "codegen_pass@1:16_stderr": 0.034147315781491154 - } - }, - "versions": { - "extended|lcb:codegeneration_v4|0": 0 - }, - "config_tasks": { - "extended|lcb:codegeneration_v4": { - "name": "lcb:codegeneration_v4", - "prompt_function": "lcb_codegeneration_prompt_fn", - "hf_repo": "livecodebench/code_generation_lite", - "hf_subset": "v4", - "metric": [ - { - "metric_name": "codegen_pass@1:16", - "higher_is_better": true, - "category": "5", - "use_case": "6", - "sample_level_fn": "codegen_metric", - "corpus_level_fn": "mean" - } - ], - "hf_revision": null, - "hf_filter": null, - "hf_avail_splits": [ - "test" - ], - "trust_dataset": true, - "evaluation_splits": [ - "test" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 32768, - "generation_grammar": null, - "stop_sequence": [], - "num_samples": null, - "suite": [ - "extended" - ], - "original_num_docs": 101, - "effective_num_docs": 101, - "must_remove_duplicate_docs": false, - "version": 0 - } - }, - "summary_tasks": { - "extended|lcb:codegeneration_v4|0": { - "hashes": { - "hash_examples": "e7a7c74abad0ab2c", - "hash_full_prompts": "94b75660a771bba7", - "hash_input_tokens": "e3b779035e44a0d6", - "hash_cont_tokens": "ada1511f4b916d96" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "1e6aa5adc63aa1b2", - "hash_full_prompts": "3f758d56940d66a4", - "hash_input_tokens": "dcc90dd35f661f20", - "hash_cont_tokens": "bf6a547b720cb88e" - }, - "truncated": 0, - "non_truncated": 101, - "padded": 0, - "non_padded": 101, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file