Commit: bf018b4
1 Parent(s): 133e122

clean up model counting

src/user_eval.py CHANGED (+10 -16)
@@ -245,7 +245,7 @@ def evaluate_submission(submitted_models, summary_file_path, modelling_framw, to
         return 1
 
     # Statistics
-    total_submitted_models =
+    total_submitted_models = len(submitted_models)
     total_submitted_models_that_also_exist_in_gt = 0
     models_ran_successfully = 0
     consistency_checks_passed = 0
@@ -260,15 +260,17 @@ def evaluate_submission(submitted_models, summary_file_path, modelling_framw, to
         curr_model = submitted_model[GT_MODEL_CODE_COLUMN]
         problem_name = submitted_model[GT_PROBLEM_NAME_COLUMN]
 
-        total_submitted_models += 1
-        if problem_name in ground_truth_models:
-            total_submitted_models_that_also_exist_in_gt += 1
-
         print(f"\n Processing model: {problem_name}", flush=True)
         summary_f.write(f"\n--- Model: {problem_name} ---\n")
 
-
+        if problem_name in ground_truth_models:
+            total_submitted_models_that_also_exist_in_gt += 1
+            summary_f.write(f" 0. Found ground-truth model for '{problem_name}' in dataset.\n")
+        else:
+            summary_f.write(f" - SKIPPED: Ground-truth model for '{problem_name}' not found in dataset.\n")
+            continue
 
+        summary_f.write(" 1. Running submitted model...\n")
         succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT,
                                                         modelling_language=modelling_framw)
 
@@ -292,16 +294,8 @@ def evaluate_submission(submitted_models, summary_file_path, modelling_framw, to
             continue
         summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
 
-        summary_f.write(
-
-            summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
-            continue
-        ground_truth_script_content = ground_truth_models[problem_name]
-        summary_f.write(" - SUCCESS: Found ground-truth model.\n")
-
-        summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
-        modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
-
+        summary_f.write(" 2. Performing self-consistency check on ground-truth model...\n")
+        modified_gt_script = get_modified_script(ground_truth_models[problem_name], generated_solution)
         try:
             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
                                              dir=top_lvl_temp_dir) as tmp_file: