# eval.py
import sys
import os
import time
import json
import subprocess
import tempfile
from pathlib import Path
from datasets import load_dataset # Hugging Face datasets library
# --- Configuration ---
DATASET_NAME = "kostis-init/CP-Bench"
# Column names in the Hugging Face dataset for problem identifier and model script
PROBLEM_NAME_COLUMN = "id"
MODEL_CODE_COLUMN = "model"
# Timeout for running individual model scripts (both generated and modified ground-truth)
SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
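# Each dataset row is expected to expose at least the two columns configured above:
# the problem identifier (matched against submitted file names minus the .py extension)
# and the ground-truth model script. Illustrative row shape (values made up):
#   {"id": "some_problem", "model": "<Python script that builds `model` and prints a JSON solution>"}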
def extract_json_from_string(text_output: str):
"""
Attempts to find and parse the first valid JSON object or array from a string.
Handles cases where JSON is preceded or followed by non-JSON text.
"""
idx = 0
while idx < len(text_output):
# Find the next potential start of a JSON structure
start_brace = text_output.find('{', idx)
start_bracket = text_output.find('[', idx)
if start_brace == -1 and start_bracket == -1:
# No more '{' or '[' found in the rest of the string
return None
# Determine the actual starting character for this attempt
if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
json_start_index = start_brace
else:
json_start_index = start_bracket
potential_json_segment = text_output[json_start_index:]
try:
            # raw_decode parses the first valid JSON value at the start of the segment
            # and also returns the index where parsing stopped (not needed here).
            decoder = json.JSONDecoder()
            json_obj, _ = decoder.raw_decode(potential_json_segment)
            return json_obj
        except json.JSONDecodeError:
            # This segment (starting at json_start_index) was not valid JSON.
            # Advance the search index past the character that started this attempt.
            idx = json_start_index + 1
return None # No valid JSON found in the entire string
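# Illustrative behaviour of extract_json_from_string (inputs and values made up):
#   extract_json_from_string('Solver log...\n{"x": 3, "y": [1, 2]}\nDone.') -> {'x': 3, 'y': [1, 2]}
#   extract_json_from_string('no json here') -> None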
def run_instance(instance_path_str: str,
                 timeout: int = SCRIPT_EXECUTION_TIMEOUT):
"""Run the instance file and robustly capture the JSON output."""
command = [sys.executable, instance_path_str]
instance_name = Path(instance_path_str).name
try:
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
errors='replace')
# Check return code first
if result.returncode != 0:
# Log stderr for debugging if the script itself failed
error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
print(f" ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
return None
# Attempt to extract JSON from stdout
stdout_text = result.stdout
if not stdout_text or not stdout_text.strip():
print(f" ERROR: No stdout from {instance_name}.", flush=True)
return None
solution = extract_json_from_string(stdout_text)
if solution is None:
# Be more verbose if JSON extraction fails
abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300] # Show newlines as \n for brevity
print(
f" ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
flush=True)
return None
return solution
except subprocess.TimeoutExpired:
print(f" ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
return None
except Exception as e:
print(f" ERROR: Unexpected error running {instance_name}: {e}", flush=True)
return None
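# Illustrative use of run_instance (paths and values made up): running a submitted model
# that prints '{"x": 2, "y": 5}' on stdout returns {'x': 2, 'y': 5}; a non-zero exit code,
# a timeout, empty stdout, or un-parseable stdout all return None.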
def add_constraints_as_string(solution):
"""Generate constraints as a string to be added to the original script."""
constraints = ""
if solution: # Ensure solution is not None
for key, value in solution.items():
            # Quote string values (no escaping is applied; solution values are typically numeric/boolean)
if isinstance(value, str):
constraints += f"\nmodel += ({key} == \"{value}\")"
else:
constraints += f"\nmodel += ({key} == {value})"
return constraints
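# Illustrative output of add_constraints_as_string (solution values made up):
#   add_constraints_as_string({"x": 3, "name": "a"})
#   -> '\nmodel += (x == 3)\nmodel += (name == "a")'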
def get_modified_script(script_content, solution):
"""Add constraints to the script content and self-consistency checks."""
constraints_str = add_constraints_as_string(solution)
modified_script = f"{script_content}\n{constraints_str}"
modified_script += """
# --- Self-consistency check appended by eval.py ---
# Optional debug aid: the commented-out print below reports which modified script is running.
import os
# print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}") # Optional debug
# Keep old objective
old_objective_value = None
objective_defined = False
if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
try:
        # This block assumes 'model' is a CPMpy Model (or a similar object).
        # Detecting whether an objective is set, and reading its value, differs between
        # modeling libraries and may fail if the model has not been solved yet, so we
        # simply attempt to read it and fall back silently on any error. This may need
        # adjustment for other modeling libraries used in CP-Bench (inspecting
        # model.objective_ directly could be more robust).
if hasattr(model, '_objective_value'): # cpmpy specific check if objective was set
if model._objective_value is not None: # cpmpy does not have objective_is_min
objective_defined = True
old_objective_value = model.objective_value()
except Exception as e_obj_check:
# print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
pass # Objective might not be set or model not solved yet.
# Check self-consistency
solved_ok = False
try:
if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
solved_ok = model.solve()
else:
print('ERROR: Model object not found or does not have a solve() method.')
except Exception as e_solve:
print(f'ERROR: Exception during model.solve(): {e_solve}')
solved_ok = False # Ensure it's false on exception
if not solved_ok:
print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
else:
print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')
# Check if the objective value is the same
if not objective_defined:
print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
else:
try:
current_objective_value = model.objective_value()
# Handle potential floating point inaccuracies if objectives can be floats
if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
if abs(current_objective_value - old_objective_value) < 1e-6: # Tolerance for float comparison
print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
else:
print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
elif current_objective_value != old_objective_value: # Integer comparison
print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
else:
print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
except Exception as e_obj_final:
print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')
"""
return modified_script
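# The modified script is the ground-truth model, followed by one equality constraint per
# solution variable, followed by the consistency check above; when executed it emits
# 'EVAL_OUTPUT: ...' marker lines that main() looks for in its stdout. Sketch of the
# appended tail for the made-up solution {"x": 3}:
#   model += (x == 3)
#   ...self-consistency check code...
#   (stdout) EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS
#   (stdout) EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED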
# --- Main Evaluation Logic ---
def main(submission_path_str: str, results_base_dir_str: str):
start_time = time.time()
print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)
submission_path = Path(submission_path_str)
submission_name = submission_path.name
result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
os.makedirs(result_dir_for_submission, exist_ok=True)
summary_file_path = result_dir_for_submission / "summary.txt"
# Load ground-truth dataset
try:
# Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
gt_dataset = load_dataset(DATASET_NAME, split="train")
ground_truth_models = {
item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
for item in gt_dataset
if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
}
if not ground_truth_models:
raise ValueError(
f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
except Exception as e:
print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
with open(summary_file_path, "w") as f:
f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
return 1 # Indicate failure
# Statistics
total_submitted_models = 0
models_ran_successfully = 0
gt_models_found = 0
consistency_checks_passed = 0
objective_checks_passed = 0 # Includes "NO_OBJECTIVE_DEFINED" as a pass
with open(summary_file_path, "w") as summary_f:
summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
summary_f.write(
f"Ground-Truth Dataset: {DATASET_NAME}\n")
summary_f.write("-" * 30 + "\n")
submitted_model_files = list(submission_path.glob('*.py')) # Assuming Python models
if not submitted_model_files:
summary_f.write("No .py model files found in submission.\n")
print("eval.py: No .py model files found in submission.", flush=True)
return 0 # No models to evaluate, but script ran.
for model_file_path in submitted_model_files:
total_submitted_models += 1
problem_name = model_file_path.stem # Filename without .py extension
print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
# 1. Run the submitted model to get its solution
summary_f.write(" 1. Running submitted model...\n")
generated_solution = run_instance(str(model_file_path))
            if generated_solution is None:
                summary_f.write(" - FAILED to run or get valid JSON solution from submitted model.\n")
                continue # Move to the next model
            if not isinstance(generated_solution, dict): # downstream steps expect a JSON object of assignments
                summary_f.write(" - FAILED: Extracted JSON is not an object of variable assignments.\n")
                continue
models_ran_successfully += 1
summary_f.write(f" - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")
# 2. Find corresponding ground-truth model
summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
if problem_name not in ground_truth_models:
summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
print(f" WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
continue
gt_models_found += 1
ground_truth_script_content = ground_truth_models[problem_name]
summary_f.write(" - SUCCESS: Found ground-truth model.\n")
# 3. Modify ground-truth script with solution and run self-consistency check
summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
consistency_passed_this_model = False
objective_passed_this_model = False
try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
tmp_file.write(modified_gt_script)
tmp_file_path_str = tmp_file.name
# Run the modified ground-truth script
                gt_check_result = subprocess.run(
                    [sys.executable, tmp_file_path_str],
                    capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT,
                    encoding='utf-8', errors='replace'  # match run_instance to avoid decode errors
                )
os.unlink(tmp_file_path_str) # Clean up temp file
# 4. Parse output of modified ground-truth
gt_stdout = gt_check_result.stdout
gt_stderr = gt_check_result.stderr
# summary_f.write(f" Modified GT STDOUT: {gt_stdout[:500]}...\n") # For debugging
if gt_stderr:
summary_f.write(f" Modified GT STDERR: {gt_stderr[:500]}...\n")
if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
summary_f.write(" - CONSISTENCY: PASSED\n")
consistency_checks_passed += 1
consistency_passed_this_model = True
elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
summary_f.write(" - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
else:
summary_f.write(" - CONSISTENCY: FAILED (Could not determine consistency from output)\n")
if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
"EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
summary_f.write(" - OBJECTIVE: PASSED (Consistent or no objective)\n")
objective_checks_passed += 1
objective_passed_this_model = True
elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
summary_f.write(f" - OBJECTIVE: FAILED (Value changed)\n")
elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
summary_f.write(f" - OBJECTIVE: FAILED (Error accessing final objective)\n")
else:
summary_f.write(" - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")
except subprocess.TimeoutExpired:
summary_f.write(
f" - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
print(f" ERROR: Timeout running modified GT for {problem_name}", flush=True)
except Exception as e_gt_run:
summary_f.write(
f" - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
print(f" ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
# Final statistics
summary_f.write("\n" + "=" * 30 + "\n")
summary_f.write("Overall Evaluation Statistics:\n")
summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
summary_f.write(
f" Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
summary_f.write(
f" Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
        # Overall score: the raw sum of passed consistency and objective checks.
        # A stricter metric could count only models that pass both checks.
        overall_score = consistency_checks_passed + objective_checks_passed
        summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n") # For Gradio app to parse
elapsed_time = time.time() - start_time
print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
print(f"eval.py: Summary written to {summary_file_path}", flush=True)
return 0 # Success
if __name__ == "__main__":
if len(sys.argv) < 3:
print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
print("Example: python eval.py ./submissions/my_run ./results")
sys.exit(1)
submission_dir = sys.argv[1]
results_base_dir = sys.argv[2]
# Simple check if submission_dir exists
if not Path(submission_dir).is_dir():
print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
sys.exit(1)
exit_code = main(submission_dir, results_base_dir)
sys.exit(exit_code)