Commit · 2e2392c
Parent(s): bb9ae17

update leaderboard columns and enhance evaluation summary reporting

- src/config.py +3 -1
- src/eval.py +6 -2
- src/hf_utils.py +12 -11
- src/ui.py +10 -10
src/config.py
CHANGED
@@ -6,7 +6,9 @@ DS_SUBMISSIONS_PATH = "submissions"
 DS_RESULTS_PATH = "results"
 
 # leaderboard
-LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
+# LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
+LDB_COLS = ["Name", "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]
+
 
 # modelling frameworks
 CPMPY_FRAMEWORK = "CPMpy"
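For orientation, here is a minimal sketch of how the four new columns map to the `summary.txt` lines parsed after this commit (illustrative only; `SUMMARY_LINE_FOR_COLUMN` is a hypothetical name, not repo code — see the src/hf_utils.py diff below):

```python
# Sketch: which summary.txt line fills each new leaderboard column.
LDB_COLS = ["Name", "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]

# Hypothetical helper dict, not part of the repo:
SUMMARY_LINE_FOR_COLUMN = {
    LDB_COLS[1]: "Submission coverage perc",      # written by src/eval.py
    LDB_COLS[2]: "Final Solution Accuracy perc",
    LDB_COLS[3]: "Error perc",
}
# LDB_COLS[0] ("Name") is taken from the result directory name.
```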
src/eval.py
CHANGED
@@ -326,6 +326,10 @@ def main_eval(
 
         succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT, modelling_language=metadata["modelling_framework"])
 
+        if succ_exec:
+            models_ran_successfully += 1
+            summary_f.write(" - SUCCESS: Model executed successfully.\n")
+
         if timeout_occurred:
             summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
             continue
@@ -340,8 +344,6 @@ def main_eval(
         if generated_solution is None:
             summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
             continue
-
-        models_ran_successfully += 1
         summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
 
         summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
@@ -398,7 +400,9 @@ def main_eval(
     summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
     summary_f.write("=" * 30 + "\n")
     summary_f.write("Final Evaluation Summary:\n")
+    summary_f.write(f" Submission coverage perc: {float(total_submitted_models) / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
+    summary_f.write(f" Error perc: {(total_submitted_models - models_ran_successfully) / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
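The three new summary lines are all normalized by the number of ground-truth models, so coverage decomposes into executions plus errors. A small worked example with hypothetical counts:

```python
# Hypothetical counts, for illustration only.
num_ground_truth = 10          # len(ground_truth_models)
total_submitted_models = 8     # models found in the submission
models_ran_successfully = 6    # submitted models that executed without error

coverage = float(total_submitted_models) / num_ground_truth * 100                    # 80.00
execution = models_ran_successfully / num_ground_truth * 100                         # 60.00
error = (total_submitted_models - models_ran_successfully) / num_ground_truth * 100  # 20.00

# coverage == execution + error: every submitted model either runs
# successfully or is counted as a runtime error.
```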
src/hf_utils.py
CHANGED
@@ -51,10 +51,9 @@ def load_leaderboard_data():
 
         processed_result_dirs.add(dir_name)
         entry = {LDB_COLS[0]: dir_name,
-                 LDB_COLS[1]: 'In Progress...',
-                 LDB_COLS[2]: 'In Progress...',
-                 LDB_COLS[3]: 'In Progress...',
-                 LDB_COLS[4]: 'In Progress...'}
+                 LDB_COLS[1]: '*Calculating...*',
+                 LDB_COLS[2]: '*Calculating...*',
+                 LDB_COLS[3]: '*Calculating...*'}
 
         # check if summary file exists, otherwise skip
         if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
@@ -72,14 +71,16 @@ def load_leaderboard_data():
         if Path(local_summary_path).exists():
             with open(local_summary_path, "r", encoding="utf-8") as f:
                 for line in f:
-                    if 'Execution perc' in line:
-                        entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
-                    if 'Consistency perc' in line:
-                        entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
-                    if 'Final Solution Accuracy' in line:
+                    if 'Error perc' in line:
                         entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
-                    if '
-
+                    # if 'Execution perc' in line:
+                    #     entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
+                    # if 'Consistency perc' in line:
+                    #     entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
+                    if 'Final Solution Accuracy' in line:
+                        entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
+                    if 'Submission coverage perc' in line:
+                        entry[LDB_COLS[1]] = int(line.split(":")[1].strip())
             os.remove(local_summary_path)
 
         leaderboard_entries.append(entry)
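One caveat worth noting: `src/eval.py` writes the coverage line as `Submission coverage perc: 80.00%` (formatted with `:.2f}%`), so `int(line.split(":")[1].strip())` would raise `ValueError` on the decimal point and the trailing `%`. A more defensive parser might look like this (a sketch, not the committed code):

```python
def parse_summary_percentage(line: str) -> float:
    """Extract the numeric value from a summary line such as
    'Submission coverage perc: 80.00%'."""
    # Split only on the first ':' so colons in the value cannot confuse us,
    # then strip whitespace and a trailing '%' before converting.
    value = line.split(":", 1)[1].strip().rstrip("%")
    return float(value)
```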
src/ui.py
CHANGED
@@ -104,7 +104,7 @@ def create_ui():
         " This will evaluate your submission locally and print the results to the console.\n"
         "4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
         "\n\n"
-        "### If you have any questions or issues, please feel free to reach out to us
+        "### If you have any questions or issues, please feel free to reach out to us.\n"
         "---\n"
     )
 
@@ -167,15 +167,15 @@ def create_ui():
     gr.Markdown(
         "### If you found our work useful, please consider citing our paper and dataset as follows:\n"
         "```bibtex\n"
-        "@dataset{michailidis_2025_15592407
-        "author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias}
-        "title = {CP-Bench}
-        "month = jun
-        "year = 2025
-        "publisher = {Zenodo}
-        "version = {1.0.0}
-        "doi = {10.5281/zenodo.15592407}
-        "url = {https://doi.org/10.5281/zenodo.15592407}
+        "@dataset{michailidis_2025_15592407,\n"
+        "author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},\n"
+        "title = {CP-Bench},\n"
+        "month = jun,\n"
+        "year = 2025,\n"
+        "publisher = {Zenodo},\n"
+        "version = {1.0.0},\n"
+        "doi = {10.5281/zenodo.15592407},\n"
+        "url = {https://doi.org/10.5281/zenodo.15592407},\n"
         "}"
     )
 
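With the `,\n` separators added, the citation block now renders as a well-formed BibTeX entry:

```bibtex
@dataset{michailidis_2025_15592407,
author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},
title = {CP-Bench},
month = jun,
year = 2025,
publisher = {Zenodo},
version = {1.0.0},
doi = {10.5281/zenodo.15592407},
url = {https://doi.org/10.5281/zenodo.15592407},
}
```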