kostis-init committed on
Commit
2e2392c
·
1 Parent(s): bb9ae17

update leaderboard columns and enhance evaluation summary reporting

Browse files
Files changed (4) hide show
  1. src/config.py +3 -1
  2. src/eval.py +6 -2
  3. src/hf_utils.py +12 -11
  4. src/ui.py +10 -10
src/config.py CHANGED
@@ -6,7 +6,9 @@ DS_SUBMISSIONS_PATH = "submissions"
6
  DS_RESULTS_PATH = "results"
7
 
8
  # leaderboard
9
- LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
 
 
10
 
11
  # modelling frameworks
12
  CPMPY_FRAMEWORK = "CPMpy"
 
6
  DS_RESULTS_PATH = "results"
7
 
8
  # leaderboard
9
+ # LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
10
+ LDB_COLS = ["Name", "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]
11
+
12
 
13
  # modelling frameworks
14
  CPMPY_FRAMEWORK = "CPMpy"
src/eval.py CHANGED
@@ -326,6 +326,10 @@ def main_eval(
326
 
327
  succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT, modelling_language=metadata["modelling_framework"])
328
 
 
 
 
 
329
  if timeout_occurred:
330
  summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
331
  continue
@@ -340,8 +344,6 @@ def main_eval(
340
  if generated_solution is None:
341
  summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
342
  continue
343
-
344
- models_ran_successfully += 1
345
  summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
346
 
347
  summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
@@ -398,7 +400,9 @@ def main_eval(
398
  summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
399
  summary_f.write("=" * 30 + "\n")
400
  summary_f.write("Final Evaluation Summary:\n")
 
401
  summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
 
402
  summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
403
  summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
404
  summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
 
326
 
327
  succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT, modelling_language=metadata["modelling_framework"])
328
 
329
+ if succ_exec:
330
+ models_ran_successfully += 1
331
+ summary_f.write(" - SUCCESS: Model executed successfully.\n")
332
+
333
  if timeout_occurred:
334
  summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
335
  continue
 
344
  if generated_solution is None:
345
  summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
346
  continue
 
 
347
  summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
348
 
349
  summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
 
400
  summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
401
  summary_f.write("=" * 30 + "\n")
402
  summary_f.write("Final Evaluation Summary:\n")
403
+ summary_f.write(f" Submission coverage perc: {float(total_submitted_models) / len(ground_truth_models) * 100:.2f}%\n")
404
  summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
405
+ summary_f.write(f" Error perc: {(total_submitted_models - models_ran_successfully) / len(ground_truth_models) * 100:.2f}%\n")
406
  summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
407
  summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
408
  summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
src/hf_utils.py CHANGED
@@ -51,10 +51,9 @@ def load_leaderboard_data():
51
 
52
  processed_result_dirs.add(dir_name)
53
  entry = {LDB_COLS[0]: dir_name,
54
- LDB_COLS[1]: 'In Progress...',
55
- LDB_COLS[2]: 'In Progress...',
56
- LDB_COLS[3]: 'In Progress...',
57
- LDB_COLS[4]: 'In Progress...'}
58
 
59
  # check if summary file exists, otherwise skip
60
  if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
@@ -72,14 +71,16 @@ def load_leaderboard_data():
72
  if Path(local_summary_path).exists():
73
  with open(local_summary_path, "r", encoding="utf-8") as f:
74
  for line in f:
75
- if 'Execution perc' in line:
76
- entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
77
- if 'Consistency perc' in line:
78
- entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
79
- if 'Final Solution Accuracy' in line:
80
  entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
81
- if 'Total Submitted Models Parsed' in line:
82
- entry[LDB_COLS[4]] = int(line.split(":")[1].strip())
 
 
 
 
 
 
83
  os.remove(local_summary_path)
84
 
85
  leaderboard_entries.append(entry)
 
51
 
52
  processed_result_dirs.add(dir_name)
53
  entry = {LDB_COLS[0]: dir_name,
54
+ LDB_COLS[1]: '*Calculating...*',
55
+ LDB_COLS[2]: '*Calculating...*',
56
+ LDB_COLS[3]: '*Calculating...*'}
 
57
 
58
  # check if summary file exists, otherwise skip
59
  if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
 
71
  if Path(local_summary_path).exists():
72
  with open(local_summary_path, "r", encoding="utf-8") as f:
73
  for line in f:
74
+ if 'Error perc' in line:
 
 
 
 
75
  entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
76
+ # if 'Execution perc' in line:
77
+ # entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
78
+ # if 'Consistency perc' in line:
79
+ # entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
80
+ if 'Final Solution Accuracy' in line:
81
+ entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
82
+ if 'Submission coverage perc' in line:
83
+ entry[LDB_COLS[1]] = int(line.split(":")[1].strip())
84
  os.remove(local_summary_path)
85
 
86
  leaderboard_entries.append(entry)
src/ui.py CHANGED
@@ -104,7 +104,7 @@ def create_ui():
104
  " This will evaluate your submission locally and print the results to the console.\n"
105
  "4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
106
  "\n\n"
107
- "### If you have any questions or issues, please feel free to reach out to us."
108
  "---\n"
109
  )
110
 
@@ -167,15 +167,15 @@ def create_ui():
167
  gr.Markdown(
168
  "### If you found our work useful, please consider citing our paper and dataset as follows:\n"
169
  "```bibtex\n"
170
- "@dataset{michailidis_2025_15592407,"
171
- "author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},"
172
- "title = {CP-Bench},"
173
- "month = jun,"
174
- "year = 2025,"
175
- "publisher = {Zenodo},"
176
- "version = {1.0.0},"
177
- "doi = {10.5281/zenodo.15592407},"
178
- "url = {https://doi.org/10.5281/zenodo.15592407},"
179
  "}"
180
  )
181
 
 
104
  " This will evaluate your submission locally and print the results to the console.\n"
105
  "4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
106
  "\n\n"
107
+ "### If you have any questions or issues, please feel free to reach out to us.\n"
108
  "---\n"
109
  )
110
 
 
167
  gr.Markdown(
168
  "### If you found our work useful, please consider citing our paper and dataset as follows:\n"
169
  "```bibtex\n"
170
+ "@dataset{michailidis_2025_15592407,\n"
171
+ "author = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},\n"
172
+ "title = {CP-Bench},\n"
173
+ "month = jun,\n"
174
+ "year = 2025,\n"
175
+ "publisher = {Zenodo},\n"
176
+ "version = {1.0.0},\n"
177
+ "doi = {10.5281/zenodo.15592407},\n"
178
+ "url = {https://doi.org/10.5281/zenodo.15592407},\n"
179
  "}"
180
  )
181