Spaces:

kostis-init
/

CP-Bench-Leaderboard

Running

File size: 17,528 Bytes

b5712a3

# eval.py
import sys
import os
import time
import json
import subprocess
import tempfile
from pathlib import Path
from datasets import load_dataset  # Hugging Face datasets library

# --- Configuration ---

# Hugging Face dataset holding the ground-truth models. main() loads it with
# load_dataset(DATASET_NAME, split="train").
DATASET_NAME = "kostis-init/CP-Bench"

# Column names in the Hugging Face dataset for problem identifier and model script.
# main() matches the identifier against each submitted file's stem (name without ".py").
PROBLEM_NAME_COLUMN = "id"
MODEL_CODE_COLUMN = "model"

# Timeout for running individual model scripts (both generated and modified ground-truth).
# Used as the default for run_instance() and for the ground-truth check in main().
SCRIPT_EXECUTION_TIMEOUT = 60  # seconds


def extract_json_from_string(text_output: str):
    """
    Find and parse the first valid JSON object or array embedded in a string.

    The text is scanned left to right for '{' or '['. At each candidate
    position a JSON document is decoded in place; if decoding fails, the scan
    resumes one character later. Text before or after the JSON is ignored.

    Args:
        text_output: Arbitrary text (e.g. a script's stdout) that may contain JSON.

    Returns:
        The decoded value (a dict or a list) of the first candidate that parses,
        or None if no '{' / '[' starts a valid JSON document. Bare scalars
        (numbers, strings, true/false/null) are never returned because only
        '{' and '[' are considered as starting points.
    """
    # One decoder is enough; it holds no per-call state.
    decoder = json.JSONDecoder()
    search_from = 0

    while True:
        brace_pos = text_output.find('{', search_from)
        bracket_pos = text_output.find('[', search_from)
        candidates = [pos for pos in (brace_pos, bracket_pos) if pos != -1]
        if not candidates:
            # No more '{' or '[' in the rest of the string.
            return None

        start = min(candidates)
        try:
            # Decode at an offset instead of slicing, so a noisy input with many
            # false starts does not copy the tail of the string on every attempt.
            parsed, _ = decoder.raw_decode(text_output, start)
        except json.JSONDecodeError:
            # Not valid JSON at this position; retry just past it.
            search_from = start + 1
        else:
            return parsed


def run_instance(instance_path_str: str,
                 timeout: int = SCRIPT_EXECUTION_TIMEOUT):
    """
    Execute a model script with the current interpreter and return its JSON output.

    Args:
        instance_path_str: Path of the Python script to run.
        timeout: Maximum run time in seconds before the run is abandoned.

    Returns:
        The value parsed from the script's stdout by extract_json_from_string(),
        or None when the script exits non-zero, prints nothing, prints no
        parseable JSON, times out, or any other error occurs. Every failure is
        reported on stdout with an "ERROR:" line.
    """
    script_name = Path(instance_path_str).name
    try:
        completed = subprocess.run(
            [sys.executable, instance_path_str],
            capture_output=True,
            text=True,
            timeout=timeout,
            encoding='utf-8',
            errors='replace',
        )

        # A non-zero exit means the script itself failed; surface part of its stderr.
        if completed.returncode != 0:
            if completed.stderr:
                stderr_excerpt = completed.stderr[:500].strip()
            else:
                stderr_excerpt = "<No stderr>"
            print(f"  ERROR: Running {script_name} (Return Code: {completed.returncode}): {stderr_excerpt}", flush=True)
            return None

        raw_stdout = completed.stdout
        if not (raw_stdout and raw_stdout.strip()):
            print(f"  ERROR: No stdout from {script_name}.", flush=True)
            return None

        parsed = extract_json_from_string(raw_stdout)
        if parsed is not None:
            return parsed

        # Nothing parseable: show a short, single-line preview of what was printed.
        stdout_preview = raw_stdout.replace('\n', '\\n')[:300]
        print(
            f"  ERROR: Could not extract valid JSON from {script_name}. Raw stdout (abbreviated): '{stdout_preview}...'",
            flush=True)
        return None

    except subprocess.TimeoutExpired:
        print(f"  ERROR: Timeout running {script_name} (>{timeout}s)", flush=True)
        return None
    except Exception as e:
        print(f"  ERROR: Unexpected error running {script_name}: {e}", flush=True)
        return None


def add_constraints_as_string(solution):
    """
    Render a solution mapping as Python source lines that pin each variable.

    For every ``key: value`` pair one line of the form
    ``model += (<key> == <value>)`` is produced, each preceded by a newline so
    the result can be appended directly to an existing script.

    Args:
        solution: Mapping of expression text (e.g. variable names) to values,
            or None / an empty mapping.

    Returns:
        The concatenated constraint lines, or "" when ``solution`` is falsy.

    Notes:
        String values are emitted with ``repr()`` so that quotes, backslashes
        and newlines inside the value yield a valid Python literal instead of
        broken (or injected) source code.
    """
    if not solution:
        return ""

    # NOTE(security): keys are interpolated verbatim because they must be
    # evaluated as Python expressions in the target script. They originate from
    # the submitted script's output, so the generated code must only ever be
    # run with the same (sandboxed) trust level as the submission itself.
    lines = []
    for key, value in solution.items():
        if isinstance(value, str):
            lines.append(f"\nmodel += ({key} == {value!r})")
        else:
            lines.append(f"\nmodel += ({key} == {value})")
    return "".join(lines)


def get_modified_script(script_content, solution) -> str:
    """
    Build a runnable script that checks a solution against a model script.

    The result is ``script_content``, followed by the constraint lines produced
    by add_constraints_as_string(solution), followed by a fixed epilogue.

    The epilogue expects the script to define a variable named ``model`` and:
      * records ``model.objective_value()`` before re-solving, but only when
        ``model`` has a callable ``objective_value`` and a non-None
        ``_objective_value`` attribute; any error while doing so is ignored;
      * calls ``model.solve()`` and prints
        ``EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS`` or ``...=UNSATISFIABLE``;
      * when the solve succeeded, prints ``EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=``
        followed by ``NO_OBJECTIVE_DEFINED``, ``CONSISTENT``, ``CHANGED (...)``
        or ``ERROR_ACCESSING_FINAL_OBJECTIVE (...)``; float objectives are
        compared with an absolute tolerance of 1e-6.

    Args:
        script_content: Source text of the model script to extend.
        solution: Mapping passed to add_constraints_as_string().

    Returns:
        The combined script source as a string.
    """
    constraints_str = add_constraints_as_string(solution)
    modified_script = f"{script_content}\n{constraints_str}"
    # Everything inside the literal below is emitted verbatim and executed by the
    # generated script; the caller recognises its results by the "EVAL_OUTPUT:" lines.
    # NOTE(review): the template treats `_objective_value` as a cpmpy-specific
    # marker for "an objective was set" — confirm against the cpmpy version in use.
    modified_script += """

# --- Self-consistency check appended by eval.py ---
# Print the absolute path of the current directory along with the script name
import os
# print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}") # Optional debug

# Keep old objective
old_objective_value = None
objective_defined = False
if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
    try:
        # This block assumes 'model' is the CPMpy model object or similar
        # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
        # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
        # This part might need adjustment based on the specific modeling library used in CP-Bench.
        # For now, we'll try to get it and catch errors.
        # A more robust way might be to inspect model.objective_
        if hasattr(model, '_objective_value'): # cpmpy specific check if objective was set
             if model._objective_value is not None: # cpmpy does not have objective_is_min
                objective_defined = True
                old_objective_value = model.objective_value()

    except Exception as e_obj_check:
        # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
        pass # Objective might not be set or model not solved yet.

# Check self-consistency
solved_ok = False
try:
    if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
        solved_ok = model.solve()
    else:
        print('ERROR: Model object not found or does not have a solve() method.')
except Exception as e_solve:
    print(f'ERROR: Exception during model.solve(): {e_solve}')
    solved_ok = False # Ensure it's false on exception

if not solved_ok:
    print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
else:
    print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')

    # Check if the objective value is the same
    if not objective_defined:
        print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
    else:
        try:
            current_objective_value = model.objective_value()
            # Handle potential floating point inaccuracies if objectives can be floats
            if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
                if abs(current_objective_value - old_objective_value) < 1e-6: # Tolerance for float comparison
                    print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
                else:
                    print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
            elif current_objective_value != old_objective_value: # Integer comparison
                print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
            else:
                print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
        except Exception as e_obj_final:
            print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')

"""
    return modified_script


# --- Main Evaluation Logic ---
def _run_modified_ground_truth(script_text: str, timeout: int):
    """Write ``script_text`` to a temporary .py file, run it, and always delete the file."""
    tmp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8')
    try:
        with tmp_file:
            tmp_file.write(script_text)
        # Decode output the same way run_instance() does so that non-UTF-8 bytes
        # cannot raise a decoding error on hosts with a different locale.
        return subprocess.run(
            [sys.executable, tmp_file.name],
            capture_output=True, text=True, timeout=timeout,
            encoding='utf-8', errors='replace',
        )
    finally:
        # Runs on success, timeout and any other error, so no temp file is leaked.
        os.unlink(tmp_file.name)


def main(submission_path_str: str, results_base_dir_str: str):
    """
    Evaluate every ``*.py`` model in a submission directory against the ground truth.

    For each submitted script the function:
      1. runs it and extracts a JSON solution from its stdout;
      2. looks up the ground-truth model whose dataset id equals the file stem;
      3. appends the solution as constraints to the ground-truth script, runs the
         result, and reads the ``EVAL_OUTPUT:`` lines to decide whether the
         consistency and objective checks passed.

    A human-readable report, ending with a ``Score:`` line, is written to
    ``<results_base_dir>/<submission_name>_result/summary.txt``.

    Args:
        submission_path_str: Directory containing the submitted ``*.py`` models.
        results_base_dir_str: Directory under which the result folder is created.

    Returns:
        1 if the ground-truth dataset could not be loaded, otherwise 0 (including
        the case where the submission contains no ``*.py`` files).
    """
    start_time = time.time()
    print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
    print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
    print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)

    submission_path = Path(submission_path_str)
    submission_name = submission_path.name
    result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
    os.makedirs(result_dir_for_submission, exist_ok=True)
    summary_file_path = result_dir_for_submission / "summary.txt"

    # Load ground-truth dataset
    try:
        # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
        gt_dataset = load_dataset(DATASET_NAME, split="train")
        ground_truth_models = {
            item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
            for item in gt_dataset
            if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
        }
        if not ground_truth_models:
            raise ValueError(
                f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
        print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
    except Exception as e:
        print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
        with open(summary_file_path, "w", encoding="utf-8") as f:
            f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
        return 1  # Indicate failure

    # Statistics
    total_submitted_models = 0
    models_ran_successfully = 0
    gt_models_found = 0
    consistency_checks_passed = 0
    objective_checks_passed = 0  # Includes "NO_OBJECTIVE_DEFINED" as a pass

    # Explicit UTF-8: the report may contain replacement characters or other
    # non-ASCII text copied from script output.
    with open(summary_file_path, "w", encoding="utf-8") as summary_f:
        summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
        summary_f.write(
            f"Ground-Truth Dataset: {DATASET_NAME}\n")
        summary_f.write("-" * 30 + "\n")

        # Sorted so the report order is reproducible across runs and file systems.
        submitted_model_files = sorted(submission_path.glob('*.py'))  # Assuming Python models
        if not submitted_model_files:
            summary_f.write("No .py model files found in submission.\n")
            print("eval.py: No .py model files found in submission.", flush=True)
            return 0  # No models to evaluate, but script ran.

        for model_file_path in submitted_model_files:
            total_submitted_models += 1
            problem_name = model_file_path.stem  # Filename without .py extension
            print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
            summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")

            # 1. Run the submitted model to get its solution
            summary_f.write("  1. Running submitted model...\n")
            generated_solution = run_instance(str(model_file_path))
            # The solution must be a JSON object (variable -> value). A JSON array
            # is also "valid JSON" but cannot be turned into constraints, and
            # previously crashed the whole evaluation on `.items()`.
            if not isinstance(generated_solution, dict):
                if generated_solution is not None:
                    print(f"  ERROR: Solution from {model_file_path.name} is not a JSON object.", flush=True)
                summary_f.write("    - FAILED to run or get valid JSON solution from submitted model.\n")
                continue  # Move to the next model
            # NOTE(review): an empty object `{}` is accepted here and adds no
            # constraints, so the checks below pass trivially — confirm whether
            # that should count as a valid solution.
            models_ran_successfully += 1
            summary_f.write(f"    - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")

            # 2. Find corresponding ground-truth model
            summary_f.write(f"  2. Checking against ground-truth for '{problem_name}'...\n")
            if problem_name not in ground_truth_models:
                summary_f.write(f"    - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
                print(f"  WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
                continue
            gt_models_found += 1
            ground_truth_script_content = ground_truth_models[problem_name]
            summary_f.write("    - SUCCESS: Found ground-truth model.\n")

            # 3. Modify ground-truth script with solution and run self-consistency check
            summary_f.write("  3. Performing self-consistency check on ground-truth model...\n")
            modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)

            try:
                gt_check_result = _run_modified_ground_truth(modified_gt_script, SCRIPT_EXECUTION_TIMEOUT)
            except subprocess.TimeoutExpired:
                summary_f.write(
                    f"    - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
                print(f"  ERROR: Timeout running modified GT for {problem_name}", flush=True)
                continue
            except Exception as e_gt_run:
                summary_f.write(
                    f"    - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
                print(f"  ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
                continue

            # 4. Parse output of modified ground-truth
            gt_stdout = gt_check_result.stdout
            gt_stderr = gt_check_result.stderr
            if gt_stderr:
                summary_f.write(f"    Modified GT STDERR: {gt_stderr[:500]}...\n")

            if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
                summary_f.write("    - CONSISTENCY: PASSED\n")
                consistency_checks_passed += 1
            elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
                summary_f.write("    - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
            else:
                summary_f.write("    - CONSISTENCY: FAILED (Could not determine consistency from output)\n")

            if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
                    "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
                summary_f.write("    - OBJECTIVE: PASSED (Consistent or no objective)\n")
                objective_checks_passed += 1
            elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
                summary_f.write("    - OBJECTIVE: FAILED (Value changed)\n")
            elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
                summary_f.write("    - OBJECTIVE: FAILED (Error accessing final objective)\n")
            else:
                summary_f.write("    - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")

        # Final statistics
        summary_f.write("\n" + "=" * 30 + "\n")
        summary_f.write("Overall Evaluation Statistics:\n")
        summary_f.write(f"  Total Submitted Models Parsed: {total_submitted_models}\n")
        summary_f.write(
            f"  Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
        summary_f.write(
            f"  Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
        summary_f.write(f"  Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
        summary_f.write(f"  Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")

        # The score is the raw sum of passed consistency and objective checks.
        overall_score = consistency_checks_passed + objective_checks_passed
        summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n")  # For Gradio app to parse

    elapsed_time = time.time() - start_time
    print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
    print(f"eval.py: Summary written to {summary_file_path}", flush=True)
    return 0  # Success


if __name__ == "__main__":
    # Command-line entry point: eval.py <submission_dir> <results_base_dir>
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
        print("Example: python eval.py ./submissions/my_run ./results")
        sys.exit(1)

    submission_dir, results_base_dir = cli_args[0], cli_args[1]

    # The submission must be an existing directory; anything else is a usage error.
    if not Path(submission_dir).is_dir():
        print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
        sys.exit(1)

    # Propagate main()'s return value as the process exit status.
    sys.exit(main(submission_dir, results_base_dir))