Commit: bf018b4
1 Parent(s): 133e122

clean up model counting

src/user_eval.py CHANGED (+10 -16)
@@ -245,7 +245,7 @@ def evaluate_submission(submitted_models, summary_file_path, modelling_framw, to
         return 1
 
     # Statistics
-    total_submitted_models =
+    total_submitted_models = len(submitted_models)
     total_submitted_models_that_also_exist_in_gt = 0
     models_ran_successfully = 0
     consistency_checks_passed = 0
@@ -260,15 +260,17 @@ def evaluate_submission(submitted_models, summary_file_path, modelling_framw, to
         curr_model = submitted_model[GT_MODEL_CODE_COLUMN]
         problem_name = submitted_model[GT_PROBLEM_NAME_COLUMN]
 
-        total_submitted_models += 1
-        if problem_name in ground_truth_models:
-            total_submitted_models_that_also_exist_in_gt += 1
-
         print(f"\n Processing model: {problem_name}", flush=True)
         summary_f.write(f"\n--- Model: {problem_name} ---\n")
 
-
+        if problem_name in ground_truth_models:
+            total_submitted_models_that_also_exist_in_gt += 1
+            summary_f.write(f" 0. Found ground-truth model for '{problem_name}' in dataset.\n")
+        else:
+            summary_f.write(f" - SKIPPED: Ground-truth model for '{problem_name}' not found in dataset.\n")
+            continue
 
+        summary_f.write(" 1. Running submitted model...\n")
         succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT,
                                                         modelling_language=modelling_framw)
 
@@ -292,16 +294,8 @@ def evaluate_submission(submitted_models, summary_file_path, modelling_framw, to
             continue
         summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
 
-        summary_f.write(
-
-            summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
-            continue
-        ground_truth_script_content = ground_truth_models[problem_name]
-        summary_f.write(" - SUCCESS: Found ground-truth model.\n")
-
-        summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
-        modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
-
+        summary_f.write(" 2. Performing self-consistency check on ground-truth model...\n")
+        modified_gt_script = get_modified_script(ground_truth_models[problem_name], generated_solution)
         try:
             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
                                              dir=top_lvl_temp_dir) as tmp_file: