import datetime
import time
import json
import tempfile
import minizinc
from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download
import os
import sys
import subprocess
import threading
from pathlib import Path
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, CPMPY_FRAMEWORK, ORTOOLS_FRAMEWORK, \
MINIZINC_FRAMEWORK
# --- Configuration ---
GT_DATASET_NAME = "kostis-init/CP-Bench"
# Column names in the Hugging Face dataset for problem identifier and model script
GT_PROBLEM_NAME_COLUMN = "id"
GT_MODEL_CODE_COLUMN = "model"
# Timeout for running individual model scripts (both generated and modified ground-truth)
SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
def run_evaluation(submission_path):
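    """Run the full evaluation pipeline for a submission (blocking call)."""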
print(f"Starting evaluation for: {submission_path}")
main_eval(DATASET_REPO_ID, submission_path, DS_RESULTS_PATH)
print(f"Evaluation process complete for: {submission_path}", flush=True)
def start_background_evaluation(submission_path):
"""Start evaluation in a background thread."""
thread = threading.Thread(
target=lambda: run_evaluation(submission_path),
daemon=True
)
thread.start()
return True
def extract_json_from_code_output(output: str):
    """Extract the first '{' .. last '}' span from script output and parse it as JSON.

    Returns the parsed object, or None if no parsable JSON object is found.
    """
    start_index = output.find('{')
    end_index = output.rfind('}') + 1
    if start_index == -1 or end_index == 0:
        return None  # no JSON object delimiters in the output
    try:
        return json.loads(output[start_index:end_index])
    except json.JSONDecodeError:
        return None
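# Illustrative examples: submitted scripts are expected to print their solution
# as a JSON dict, possibly surrounded by other log output:
#   extract_json_from_code_output('solving...\n{"x": 3}')  -> {"x": 3}
#   extract_json_from_code_output('no solution printed')   -> None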
def exec_code_minizinc(code: str, timeout_sec):
"""
Executes a MiniZinc model string using the minizinc-python library.
:param code: The MiniZinc model code as a string.
:param timeout_sec: The maximum time to wait for the solver in seconds.
    :return: A tuple of (success, output, timeout_occurred)
"""
successfully_executed = False
output = ""
timeout_occurred = False
timeout_duration = datetime.timedelta(seconds=timeout_sec)
try:
# 1. Create a MiniZinc model instance
model = minizinc.Model()
model.add_string(code)
        # 2. Look up the Gecode solver configured with MiniZinc.
        # Solver.lookup raises an error if no solver with this tag is installed,
        # which is caught by the exception handlers below.
        gecode = minizinc.Solver.lookup("gecode")
# 3. Create an Instance to solve
instance = minizinc.Instance(gecode, model)
# 4. Solve the instance with the specified timeout
# The solve() method handles the timeout internally.
result = instance.solve(timeout=timeout_duration)
# 5. Process the result
if result.status in {minizinc.Status.SATISFIED, minizinc.Status.OPTIMAL_SOLUTION}:
successfully_executed = True
output = str(result.solution) if result.solution is not None else ""
timeout_occurred = False
elif result.status == minizinc.Status.UNKNOWN:
successfully_executed = False
output = f"Timeout Error: Solver stopped after {timeout_sec} seconds (Status: UNKNOWN)."
timeout_occurred = True
else:
# Handle other non-success statuses (UNSAT, ERROR, etc.)
successfully_executed = False
output = f"Solving failed. Status: {result.status}"
timeout_occurred = False
except minizinc.MiniZincError as e:
# Catch MiniZinc specific errors (e.g., syntax errors, solver not found)
successfully_executed = False
output = f"MiniZinc Error: {e}"
timeout_occurred = False
except Exception as e:
# Catch other unexpected errors
successfully_executed = False
output = f"Unexpected Error during MiniZinc execution: {e}"
timeout_occurred = False
return successfully_executed, output, timeout_occurred
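# Illustrative usage (assumes the Gecode solver is installed with MiniZinc):
#   ok, out, timed_out = exec_code_minizinc("var 1..3: x; solve satisfy;", timeout_sec=10)
# On success, `out` holds the string form of the first solution found.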
def exec_code(code: str, timeout=10, modelling_language='cpmpy'):
"""
Execute the given code and return the output
:param code: The code to execute as a string
:param timeout: The maximum time to wait for the code to execute in seconds
:param modelling_language: The language to use for execution (cpmpy, minizinc, or-tools)
    :return: A tuple of (success, output, timeout_occurred)
"""
# create a temp directory to store the temporary file
temp_dir_name = "temp_dir_for_exec_code"
temp_dir = os.path.join(os.getcwd(), temp_dir_name)
os.makedirs(temp_dir, exist_ok=True)
# write the code to a temporary file
    suffix = '.__hidden_py__' if modelling_language in (CPMPY_FRAMEWORK, ORTOOLS_FRAMEWORK) else '.mzn'
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=suffix, dir=temp_dir, encoding='utf-8') as temp_file:
temp_instance_path = temp_file.name
temp_file.write(code)
try:
# execute the code
        if modelling_language in (CPMPY_FRAMEWORK, ORTOOLS_FRAMEWORK):
command = [sys.executable, temp_instance_path]
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8')
successfully_executed = (result.returncode == 0)
output = result.stdout if successfully_executed else result.stderr
timeout_occurred = False
elif modelling_language == MINIZINC_FRAMEWORK:
successfully_executed, output, timeout_occurred = exec_code_minizinc(code, timeout)
else:
raise ValueError(f"MODELLING_LANGUAGE not supported: {modelling_language}")
    except subprocess.TimeoutExpired:
        successfully_executed = False
        output = f"Timeout Error: Execution time exceeded {timeout} seconds"
        timeout_occurred = True
    except Exception as e:
        successfully_executed = False
        output = f"Error: {e}"
        timeout_occurred = False
    finally:
        os.remove(temp_instance_path)  # always clean up the temporary script
    return successfully_executed, output, timeout_occurred
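# Illustrative usage for the Python-based frameworks (a minimal sketch):
#   ok, out, timed_out = exec_code('import json; print(json.dumps({"x": 1}))',
#                                  timeout=5, modelling_language=CPMPY_FRAMEWORK)
# Here ok is True and out holds the script's stdout ('{"x": 1}\n').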
def add_constraints_as_string(solution):
"""Generate constraints as a string to be added to the original script."""
constraints = ""
if solution: # Ensure solution is not None
for key, value in solution.items():
# Basic escaping for string values if they occur, though typically solutions are numeric/boolean
if isinstance(value, str):
constraints += f"\nmodel += ({key} == \"{value}\")"
else:
constraints += f"\nmodel += ({key} == {value})"
return constraints
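# Example: add_constraints_as_string({"x": 3, "name": "a"}) returns
#   '\nmodel += (x == 3)\nmodel += (name == "a")'
# which pins each decision variable to the submitted solution's value.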
def get_modified_script(script_content, solution):
"""Add constraints to the script content and self-consistency checks."""
constraints_str = add_constraints_as_string(solution)
modified_script = f"{script_content}\n{constraints_str}"
modified_script += """
# Print the absolute path of this script (helps locate the temp file in logs)
import os
print(os.path.abspath(__file__))
# Keep old objective
old_objective = None
if hasattr(model, 'objective_is_min') and model.objective_is_min is not None:
old_objective = model.objective_value()
# Check self-consistency
if not model.solve():
print('ERROR: The model is unsatisfiable with the self-consistency constraints')
else:
print('SUCCESS: Model is consistent')
# Check if the objective value is the same
if old_objective is None:
print('SUCCESS: No objective defined')
elif model.objective_value() != old_objective:
print('ERROR: The objective value has changed')
else:
print('SUCCESS: Objective value is consistent')
"""
return modified_script
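# Note: the appended self-check assumes a CPMpy-style `model` object exposing
# solve(), objective_value() and objective_is_min; ground-truth scripts written
# against other frameworks would need a different consistency harness.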
# --- Main Evaluation Logic ---
def main_eval(
user_dataset_repo_id: str,
submission_path_in_dataset: str, # e.g., "submissions/uploaded_dir_name"
results_base_path_in_dataset: str # e.g., "results"
):
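    """Download a submission from the user's HF dataset repo, run each submitted
    model, check its solution against the CP-Bench ground-truth models, and
    upload a summary.txt with per-model results and overall statistics.
    """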
start_time = time.time()
# Infer submission name for logging and result path generation
submission_name_for_files = Path(submission_path_in_dataset).name
print(f"eval.py: Starting evaluation for submission: '{submission_name_for_files}'", flush=True)
print(f" User Data Repo: {user_dataset_repo_id}", flush=True)
print(f" Submission to download from: {submission_path_in_dataset}", flush=True)
print(f" Results to upload to: {results_base_path_in_dataset}/{submission_name_for_files}", flush=True)
hf_api = HfApi() # Will use HF_TOKEN from environment
# Create a top-level temporary directory for all operations for this eval run
with tempfile.TemporaryDirectory(prefix="eval_run_") as top_level_temp_dir_str:
top_level_temp_dir = Path(top_level_temp_dir_str)
local_submission_dir = top_level_temp_dir / "submissions"
local_result_dir_for_upload = top_level_temp_dir / "results"
os.makedirs(local_submission_dir, exist_ok=True)
os.makedirs(local_result_dir_for_upload, exist_ok=True)
# Path for the summary file within the local temporary result directory
summary_file_path = local_result_dir_for_upload / "summary.txt"
# 1. Download submitted files from HF Dataset
print(f" Downloading submission files from '{submission_path_in_dataset}' to '{local_submission_dir}'...",
flush=True)
try:
# Download the relevant submission file
hf_hub_download(
repo_id=user_dataset_repo_id,
repo_type="dataset",
local_dir=local_submission_dir,
filename=f"{submission_path_in_dataset}/submission.jsonl",
)
print(f" Downloaded submission file successfully.", flush=True)
# Download the metadata file
hf_hub_download(
repo_id=user_dataset_repo_id,
repo_type="dataset",
local_dir=local_submission_dir,
filename=f"{submission_path_in_dataset}/metadata.json",
)
print(f" Downloaded metadata file successfully.", flush=True)
except Exception as e_download:
print(f" CRITICAL ERROR - Failed to download submission files: {e_download}", flush=True)
return 1
# 2. Load ground-truth dataset
print(f" Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
try:
gt_dataset = load_dataset(GT_DATASET_NAME, split="train", trust_remote_code=True)
ground_truth_models = {
item[GT_PROBLEM_NAME_COLUMN]: item[GT_MODEL_CODE_COLUMN]
for item in gt_dataset if
GT_PROBLEM_NAME_COLUMN in item and GT_MODEL_CODE_COLUMN in item and item[GT_MODEL_CODE_COLUMN]
}
            if not ground_truth_models:
                raise ValueError("No models in GT dataset.")
print(f" Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
except Exception as e_gt:
print(f" CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
with open(summary_file_path, "w") as f:
f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{GT_DATASET_NAME}'.\nError: {e_gt}\n")
# (Attempt to upload error summary)
return 1
# load generated models from jsonl to memory
print(f" Loading generated models from '{local_submission_dir}'...", flush=True)
submitted_models = []
with open(os.path.join(local_submission_dir, submission_path_in_dataset, "submission.jsonl"), "r", encoding="utf-8") as f:
for line in f:
try:
json_obj = json.loads(line)
submitted_models.append(json_obj)
except json.JSONDecodeError as e:
print(f" ERROR: Failed to parse JSON object from line: {line}. Error: {e}", flush=True)
# load metadata file
with open(os.path.join(local_submission_dir, submission_path_in_dataset, "metadata.json"), "r", encoding="utf-8") as f:
metadata = json.load(f)
print(f" Loaded {len(submitted_models)} generated models.", flush=True)
# Statistics
total_submitted_models = 0
models_ran_successfully = 0
consistency_checks_passed = 0
objective_checks_passed = 0
all_checks_passed = 0
gt_models_found = 0
with open(summary_file_path, "w", encoding="utf-8") as summary_f:
summary_f.write(f"Evaluation Summary for Submission: {submission_name_for_files}\n")
summary_f.write(f"User Data Repo: {user_dataset_repo_id}\n")
summary_f.write(f"Submission Path in Dataset: {submission_path_in_dataset}\n")
summary_f.write(f"Ground-Truth Dataset: {GT_DATASET_NAME}\n")
summary_f.write("-" * 30 + "\n")
            # 3. Evaluate each downloaded submitted model against its ground-truth counterpart
for submitted_model in submitted_models:
curr_model = submitted_model[GT_MODEL_CODE_COLUMN]
total_submitted_models += 1
problem_name = submitted_model[GT_PROBLEM_NAME_COLUMN]
print(f"\n Processing downloaded model: {problem_name}", flush=True)
summary_f.write(f"\n--- Model: {problem_name} ---\n")
summary_f.write(" 1. Running submitted model...\n")
succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT, modelling_language=metadata["modelling_framework"])
if timeout_occurred:
summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
continue
if not succ_exec:
summary_f.write(f" - FAILED: Execution failed with error: {output}\n")
continue
if output is None or not output.strip():
summary_f.write(f" - FAILED: No output from execution.\n")
continue
# Attempt to extract JSON from stdout
generated_solution = extract_json_from_code_output(output)
if generated_solution is None:
summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
continue
models_ran_successfully += 1
summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
if problem_name not in ground_truth_models:
summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
continue
gt_models_found += 1
ground_truth_script_content = ground_truth_models[problem_name]
summary_f.write(" - SUCCESS: Found ground-truth model.\n")
summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
dir=top_level_temp_dir) as tmp_file:
tmp_file.write(modified_gt_script)
tmp_file_path_str = tmp_file.name
gt_check_result = subprocess.run(
[sys.executable, tmp_file_path_str],
capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT, encoding='utf-8',
)
os.unlink(tmp_file_path_str)
gt_stdout = gt_check_result.stdout
if "SUCCESS: Model is consistent" in gt_stdout:
summary_f.write(" - CONSISTENCY: PASSED\n")
consistency_checks_passed += 1
else:
summary_f.write(
" - CONSISTENCY: FAILED (Details in logs or stdout)\n")
if "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout:
summary_f.write(" - OBJECTIVE: PASSED\n")
objective_checks_passed += 1
else:
summary_f.write(" - OBJECTIVE: FAILED (Details in logs or stdout)\n")
if "SUCCESS: Model is consistent" in gt_stdout and ("SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
summary_f.write(" - SELF-CONSISTENCY CHECK: PASSED fully\n")
all_checks_passed += 1
except Exception as e_gt_run:
summary_f.write(f" - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")
# Final statistics (write to summary_f)
summary_f.write("\n" + "=" * 30 + "\n")
summary_f.write("Overall Evaluation Statistics:\n")
summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
summary_f.write(f" Models That Ran Successfully: {models_ran_successfully}/{total_submitted_models}\n")
summary_f.write(f" Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully}\n")
summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
summary_f.write("=" * 30 + "\n")
summary_f.write("Final Evaluation Summary:\n")
summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
summary_f.write("-" * 30 + "\n")
# 4. Upload the entire local_result_dir_for_upload to HF Dataset
# This directory contains summary.txt and could contain other result files.
result_path_on_hub = f"{results_base_path_in_dataset}/{submission_name_for_files}"
print(f" Uploading results from '{local_result_dir_for_upload}' to '{result_path_on_hub}' on dataset...",
flush=True)
try:
hf_api.upload_folder(
folder_path=str(local_result_dir_for_upload),
path_in_repo=result_path_on_hub,
repo_id=user_dataset_repo_id,
repo_type="dataset",
commit_message=f"Evaluation results for {submission_name_for_files}"
)
print(" Results uploaded successfully.", flush=True)
except Exception as e_upload:
print(f" CRITICAL ERROR: Failed to upload results: {e_upload}", flush=True)
# The summary.txt was written locally, but upload failed.
elapsed_time = time.time() - start_time
print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
# return 0
# if __name__ == "__main__":
# if len(sys.argv) < 4:
# print(
# "Usage: python eval.py <user_dataset_repo_id> <submission_path_in_dataset> <results_base_path_in_dataset>")
# print("Example: python eval.py your-username/my-storage submissions/run123 results")
# sys.exit(1)
#
# arg_user_dataset_repo_id = sys.argv[1]
# arg_submission_path_in_dataset = sys.argv[2]
# arg_results_base_path_in_dataset = sys.argv[3]
#
#     exit_code = main_eval(arg_user_dataset_repo_id, arg_submission_path_in_dataset, arg_results_base_path_in_dataset)
#     sys.exit(exit_code)