Commit · 2e2392c
Parent(s): bb9ae17

update leaderboard columns and enhance evaluation summary reporting

- src/config.py +3 -1
- src/eval.py +6 -2
- src/hf_utils.py +12 -11
- src/ui.py +10 -10
src/config.py
CHANGED
@@ -6,7 +6,9 @@ DS_SUBMISSIONS_PATH = "submissions"
 DS_RESULTS_PATH = "results"
 
 # leaderboard
-LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
+# LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
+LDB_COLS = ["Name", "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]
+
 
 # modelling frameworks
 CPMPY_FRAMEWORK = "CPMpy"
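For orientation, here is a minimal sketch of how the four new columns map to the `summary.txt` lines parsed after this commit (illustrative only; `SUMMARY_LINE_FOR_COLUMN` is a hypothetical name, not repo code — see the src/hf_utils.py diff below):

```python
# Sketch: which summary.txt line fills each new leaderboard column.
LDB_COLS = ["Name", "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]

# Hypothetical helper dict, not part of the repo:
SUMMARY_LINE_FOR_COLUMN = {
    LDB_COLS[1]: "Submission coverage perc",      # written by src/eval.py
    LDB_COLS[2]: "Final Solution Accuracy perc",
    LDB_COLS[3]: "Error perc",
}
# LDB_COLS[0] ("Name") is taken from the result directory name.
```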
src/eval.py
CHANGED
@@ -326,6 +326,10 @@ def main_eval(
 
         succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT, modelling_language=metadata["modelling_framework"])
 
+        if succ_exec:
+            models_ran_successfully += 1
+            summary_f.write(" - SUCCESS: Model executed successfully.\n")
+
         if timeout_occurred:
             summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
             continue
@@ -340,8 +344,6 @@ def main_eval(
         if generated_solution is None:
             summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
             continue
-
-        models_ran_successfully += 1
         summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
 
         summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
@@ -398,7 +400,9 @@ def main_eval(
     summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
     summary_f.write("=" * 30 + "\n")
     summary_f.write("Final Evaluation Summary:\n")
+    summary_f.write(f" Submission coverage perc: {float(total_submitted_models) / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
+    summary_f.write(f" Error perc: {(total_submitted_models - models_ran_successfully) / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
     summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
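The three new summary lines are all normalized by the number of ground-truth models, so coverage decomposes into executions plus errors. A small worked example with hypothetical counts:

```python
# Hypothetical counts, for illustration only.
num_ground_truth = 10          # len(ground_truth_models)
total_submitted_models = 8     # models found in the submission
models_ran_successfully = 6    # submitted models that executed without error

coverage = float(total_submitted_models) / num_ground_truth * 100                    # 80.00
execution = models_ran_successfully / num_ground_truth * 100                         # 60.00
error = (total_submitted_models - models_ran_successfully) / num_ground_truth * 100  # 20.00

# coverage == execution + error: every submitted model either runs
# successfully or is counted as a runtime error.
```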
src/hf_utils.py
CHANGED
@@ -51,10 +51,9 @@ def load_leaderboard_data():
 
         processed_result_dirs.add(dir_name)
         entry = {LDB_COLS[0]: dir_name,
-                 LDB_COLS[1]: 'In Progress...',
-                 LDB_COLS[2]: 'In Progress...',
-                 LDB_COLS[3]: 'In Progress...',
-                 LDB_COLS[4]: 'In Progress...'}
+                 LDB_COLS[1]: '*Calculating...*',
+                 LDB_COLS[2]: '*Calculating...*',
+                 LDB_COLS[3]: '*Calculating...*'}
 
         # check if summary file exists, otherwise skip
         if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
@@ -72,14 +71,16 @@ def load_leaderboard_data():
         if Path(local_summary_path).exists():
             with open(local_summary_path, "r", encoding="utf-8") as f:
                 for line in f:
-                    if 'Execution perc' in line:
-                        entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
-                    if 'Consistency perc' in line:
-                        entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
-                    if 'Final Solution Accuracy' in line:
+                    if 'Error perc' in line:
                         entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
-                    if '
-
+                    # if 'Execution perc' in line:
+                    #     entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
+                    # if 'Consistency perc' in line:
+                    #     entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
+                    if 'Final Solution Accuracy' in line:
+                        entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
+                    if 'Submission coverage perc' in line:
+                        entry[LDB_COLS[1]] = int(line.split(":")[1].strip())
             os.remove(local_summary_path)
 
         leaderboard_entries.append(entry)
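One caveat worth noting: `src/eval.py` writes the coverage line as `Submission coverage perc: 80.00%` (formatted with `:.2f}%`), so `int(line.split(":")[1].strip())` would raise `ValueError` on the decimal point and the trailing `%`. A more defensive parser might look like this (a sketch, not the committed code):

```python
def parse_summary_percentage(line: str) -> float:
    """Extract the numeric value from a summary line such as
    'Submission coverage perc: 80.00%'."""
    # Split only on the first ':' so colons in the value cannot confuse us,
    # then strip whitespace and a trailing '%' before converting.
    value = line.split(":", 1)[1].strip().rstrip("%")
    return float(value)
```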
src/ui.py
CHANGED
@@ -104,7 +104,7 @@ def create_ui():
         " This will evaluate your submission locally and print the results to the console.\n"
         "4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
         "\n\n"
-        "### If you have any questions or issues, please feel free to reach out to us
+        "### If you have any questions or issues, please feel free to reach out to us.\n"
         "---\n"
     )
 
@@ -167,15 +167,15 @@ def create_ui():
     gr.Markdown(
         "### If you found our work useful, please consider citing our paper and dataset as follows:\n"
         "```bibtex\n"
-        "@dataset{michailidis_2025_15592407
-        "author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias}
-        "title = {CP-Bench}
-        "month = jun
-        "year = 2025
-        "publisher = {Zenodo}
-        "version = {1.0.0}
-        "doi = {10.5281/zenodo.15592407}
-        "url = {https://doi.org/10.5281/zenodo.15592407}
+        "@dataset{michailidis_2025_15592407,\n"
+        "author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},\n"
+        "title = {CP-Bench},\n"
+        "month = jun,\n"
+        "year = 2025,\n"
+        "publisher = {Zenodo},\n"
+        "version = {1.0.0},\n"
+        "doi = {10.5281/zenodo.15592407},\n"
+        "url = {https://doi.org/10.5281/zenodo.15592407},\n"
         "}"
     )
 
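With the `,\n` separators added, the citation block now renders as a well-formed BibTeX entry:

```bibtex
@dataset{michailidis_2025_15592407,
author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},
title = {CP-Bench},
month = jun,
year = 2025,
publisher = {Zenodo},
version = {1.0.0},
doi = {10.5281/zenodo.15592407},
url = {https://doi.org/10.5281/zenodo.15592407},
}
```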