Commit
Β·
70cc330
1
Parent(s):
f434b15
Refactor evaluation logic: streamline user_eval.py, update evaluation script references, and clean up eval.py
Browse files- src/eval.py +10 -332
- src/hf_utils.py +0 -4
- src/ui.py +3 -3
- user_eval.py β src/user_eval.py +211 -113
- template_submission.jsonl +18 -18
src/eval.py
CHANGED
@@ -1,30 +1,14 @@
|
|
1 |
-
import datetime
|
2 |
import time
|
3 |
import json
|
4 |
import tempfile
|
5 |
|
6 |
-
import minizinc
|
7 |
-
from datasets import load_dataset
|
8 |
from huggingface_hub import HfApi, hf_hub_download
|
9 |
import os
|
10 |
-
import sys
|
11 |
-
import subprocess
|
12 |
import threading
|
13 |
from pathlib import Path
|
14 |
|
15 |
-
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH
|
16 |
-
|
17 |
-
|
18 |
-
# --- Configuration ---
|
19 |
-
|
20 |
-
GT_DATASET_NAME = "kostis-init/CP-Bench"
|
21 |
-
|
22 |
-
# Column names in the Hugging Face dataset for problem identifier and model script
|
23 |
-
GT_PROBLEM_NAME_COLUMN = "id"
|
24 |
-
GT_MODEL_CODE_COLUMN = "model"
|
25 |
-
|
26 |
-
# Timeout for running individual model scripts (both generated and modified ground-truth)
|
27 |
-
SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
|
28 |
|
29 |
|
30 |
def run_evaluation(submission_path):
|
@@ -43,171 +27,6 @@ def start_background_evaluation(submission_path):
|
|
43 |
return True
|
44 |
|
45 |
|
46 |
-
def extract_json_from_code_output(output: str):
|
47 |
-
try:
|
48 |
-
start_index = output.find('{')
|
49 |
-
end_index = output.rfind('}') + 1
|
50 |
-
# Extract the JSON part
|
51 |
-
json_part = output[start_index:end_index]
|
52 |
-
return json.loads(json_part)
|
53 |
-
except json.JSONDecodeError:
|
54 |
-
return None
|
55 |
-
|
56 |
-
|
57 |
-
def exec_code_minizinc(code: str, timeout_sec):
|
58 |
-
"""
|
59 |
-
Executes a MiniZinc model string using the minizinc-python library.
|
60 |
-
|
61 |
-
:param code: The MiniZinc model code as a string.
|
62 |
-
:param timeout_sec: The maximum time to wait for the solver in seconds.
|
63 |
-
:return: A tuple of (success, output, timeout_occured)
|
64 |
-
"""
|
65 |
-
successfully_executed = False
|
66 |
-
output = ""
|
67 |
-
timeout_occurred = False
|
68 |
-
timeout_duration = datetime.timedelta(seconds=timeout_sec)
|
69 |
-
|
70 |
-
try:
|
71 |
-
# 1. Create a MiniZinc model instance
|
72 |
-
model = minizinc.Model()
|
73 |
-
model.add_string(code)
|
74 |
-
|
75 |
-
# 2. Find a default solver configured with MiniZinc
|
76 |
-
# You can be more specific, e.g., solver = minizinc.Solver.lookup("gecode")
|
77 |
-
# If the default solver isn't found or suitable, this will raise an error.
|
78 |
-
gecode = minizinc.Solver.lookup("gecode")
|
79 |
-
if gecode is None:
|
80 |
-
raise RuntimeError("No suitable solver found. Please install a MiniZinc solver.")
|
81 |
-
|
82 |
-
# 3. Create an Instance to solve
|
83 |
-
instance = minizinc.Instance(gecode, model)
|
84 |
-
|
85 |
-
# 4. Solve the instance with the specified timeout
|
86 |
-
# The solve() method handles the timeout internally.
|
87 |
-
result = instance.solve(timeout=timeout_duration)
|
88 |
-
|
89 |
-
# 5. Process the result
|
90 |
-
if result.status in {minizinc.Status.SATISFIED, minizinc.Status.OPTIMAL_SOLUTION}:
|
91 |
-
successfully_executed = True
|
92 |
-
output = str(result.solution) if result.solution is not None else ""
|
93 |
-
timeout_occurred = False
|
94 |
-
elif result.status == minizinc.Status.UNKNOWN:
|
95 |
-
successfully_executed = False
|
96 |
-
output = f"Timeout Error: Solver stopped after {timeout_sec} seconds (Status: UNKNOWN)."
|
97 |
-
timeout_occurred = True
|
98 |
-
else:
|
99 |
-
# Handle other non-success statuses (UNSAT, ERROR, etc.)
|
100 |
-
successfully_executed = False
|
101 |
-
output = f"Solving failed. Status: {result.status}"
|
102 |
-
timeout_occurred = False
|
103 |
-
|
104 |
-
except minizinc.MiniZincError as e:
|
105 |
-
# Catch MiniZinc specific errors (e.g., syntax errors, solver not found)
|
106 |
-
successfully_executed = False
|
107 |
-
output = f"MiniZinc Error: {e}"
|
108 |
-
timeout_occurred = False
|
109 |
-
except Exception as e:
|
110 |
-
# Catch other unexpected errors
|
111 |
-
successfully_executed = False
|
112 |
-
output = f"Unexpected Error during MiniZinc execution: {e}"
|
113 |
-
timeout_occurred = False
|
114 |
-
|
115 |
-
return successfully_executed, output, timeout_occurred
|
116 |
-
|
117 |
-
|
118 |
-
def exec_code(code: str, timeout=10, modelling_language='cpmpy'):
|
119 |
-
"""
|
120 |
-
Execute the given code and return the output
|
121 |
-
|
122 |
-
:param code: The code to execute as a string
|
123 |
-
:param timeout: The maximum time to wait for the code to execute in seconds
|
124 |
-
:param modelling_language: The language to use for execution (cpmpy, minizinc, or-tools)
|
125 |
-
:return: A tuple of (success, output, timeout_occured)
|
126 |
-
"""
|
127 |
-
|
128 |
-
# create a temp directory to store the temporary file
|
129 |
-
temp_dir_name = "temp_dir_for_exec_code"
|
130 |
-
temp_dir = os.path.join(os.getcwd(), temp_dir_name)
|
131 |
-
os.makedirs(temp_dir, exist_ok=True)
|
132 |
-
|
133 |
-
# write the code to a temporary file
|
134 |
-
suffix = '.__hidden_py__' if modelling_language == CPMPY_FRAMEWORK or modelling_language == ORTOOLS_FRAMEWORK else '.mzn'
|
135 |
-
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=suffix, dir=temp_dir, encoding='utf-8') as temp_file:
|
136 |
-
temp_instance_path = temp_file.name
|
137 |
-
temp_file.write(code)
|
138 |
-
|
139 |
-
try:
|
140 |
-
# execute the code
|
141 |
-
if modelling_language == CPMPY_FRAMEWORK or modelling_language == ORTOOLS_FRAMEWORK:
|
142 |
-
command = [sys.executable, temp_instance_path]
|
143 |
-
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8')
|
144 |
-
|
145 |
-
successfully_executed = (result.returncode == 0)
|
146 |
-
output = result.stdout if successfully_executed else result.stderr
|
147 |
-
timeout_occurred = False
|
148 |
-
elif modelling_language == MINIZINC_FRAMEWORK:
|
149 |
-
successfully_executed, output, timeout_occurred = exec_code_minizinc(code, timeout)
|
150 |
-
else:
|
151 |
-
raise ValueError(f"MODELLING_LANGUAGE not supported: {modelling_language}")
|
152 |
-
|
153 |
-
except subprocess.TimeoutExpired as e:
|
154 |
-
successfully_executed = False
|
155 |
-
output = f"Timeout Error: Execution time exceeded {timeout} seconds"
|
156 |
-
timeout_occurred = True
|
157 |
-
except Exception as e:
|
158 |
-
successfully_executed = False
|
159 |
-
output = f"Error: {e}"
|
160 |
-
timeout_occurred = False
|
161 |
-
|
162 |
-
os.remove(temp_instance_path)
|
163 |
-
|
164 |
-
return successfully_executed, output, timeout_occurred
|
165 |
-
|
166 |
-
|
167 |
-
def add_constraints_as_string(solution):
|
168 |
-
"""Generate constraints as a string to be added to the original script."""
|
169 |
-
constraints = ""
|
170 |
-
if solution: # Ensure solution is not None
|
171 |
-
for key, value in solution.items():
|
172 |
-
# Basic escaping for string values if they occur, though typically solutions are numeric/boolean
|
173 |
-
if isinstance(value, str):
|
174 |
-
constraints += f"\nmodel += ({key} == \"{value}\")"
|
175 |
-
else:
|
176 |
-
constraints += f"\nmodel += ({key} == {value})"
|
177 |
-
return constraints
|
178 |
-
|
179 |
-
|
180 |
-
def get_modified_script(script_content, solution):
|
181 |
-
"""Add constraints to the script content and self-consistency checks."""
|
182 |
-
constraints_str = add_constraints_as_string(solution)
|
183 |
-
modified_script = f"{script_content}\n{constraints_str}"
|
184 |
-
modified_script += """
|
185 |
-
# Print the absolute path of the current directory along with the script name
|
186 |
-
import os
|
187 |
-
print(os.path.abspath(__file__))
|
188 |
-
|
189 |
-
# Keep old objective
|
190 |
-
old_objective = None
|
191 |
-
if hasattr(model, 'objective_is_min') and model.objective_is_min is not None:
|
192 |
-
old_objective = model.objective_value()
|
193 |
-
|
194 |
-
# Check self-consistency
|
195 |
-
if not model.solve():
|
196 |
-
print('ERROR: The model is unsatisfiable with the self-consistency constraints')
|
197 |
-
else:
|
198 |
-
print('SUCCESS: Model is consistent')
|
199 |
-
|
200 |
-
# Check if the objective value is the same
|
201 |
-
if old_objective is None:
|
202 |
-
print('SUCCESS: No objective defined')
|
203 |
-
elif model.objective_value() != old_objective:
|
204 |
-
print('ERROR: The objective value has changed')
|
205 |
-
else:
|
206 |
-
print('SUCCESS: Objective value is consistent')
|
207 |
-
"""
|
208 |
-
return modified_script
|
209 |
-
|
210 |
-
|
211 |
# --- Main Evaluation Logic ---
|
212 |
def main_eval(
|
213 |
user_dataset_repo_id: str,
|
@@ -237,7 +56,7 @@ def main_eval(
|
|
237 |
# Path for the summary file within the local temporary result directory
|
238 |
summary_file_path = local_result_dir_for_upload / "summary.txt"
|
239 |
|
240 |
-
#
|
241 |
print(f" Downloading submission files from '{submission_path_in_dataset}' to '{local_submission_dir}'...",
|
242 |
flush=True)
|
243 |
try:
|
@@ -262,28 +81,11 @@ def main_eval(
|
|
262 |
print(f" CRITICAL ERROR - Failed to download submission files: {e_download}", flush=True)
|
263 |
return 1
|
264 |
|
265 |
-
# 2. Load ground-truth dataset
|
266 |
-
print(f" Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
|
267 |
-
try:
|
268 |
-
gt_dataset = load_dataset(GT_DATASET_NAME, split="train", trust_remote_code=True)
|
269 |
-
ground_truth_models = {
|
270 |
-
item[GT_PROBLEM_NAME_COLUMN]: item[GT_MODEL_CODE_COLUMN]
|
271 |
-
for item in gt_dataset if
|
272 |
-
GT_PROBLEM_NAME_COLUMN in item and GT_MODEL_CODE_COLUMN in item and item[GT_MODEL_CODE_COLUMN]
|
273 |
-
}
|
274 |
-
if not ground_truth_models: raise ValueError("No models in GT dataset.")
|
275 |
-
print(f" Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
|
276 |
-
except Exception as e_gt:
|
277 |
-
print(f" CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
|
278 |
-
with open(summary_file_path, "w") as f:
|
279 |
-
f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{GT_DATASET_NAME}'.\nError: {e_gt}\n")
|
280 |
-
# (Attempt to upload error summary)
|
281 |
-
return 1
|
282 |
-
|
283 |
# load generated models from jsonl to memory
|
284 |
print(f" Loading generated models from '{local_submission_dir}'...", flush=True)
|
285 |
submitted_models = []
|
286 |
-
with open(os.path.join(local_submission_dir, submission_path_in_dataset, "submission.jsonl"), "r",
|
|
|
287 |
for line in f:
|
288 |
try:
|
289 |
json_obj = json.loads(line)
|
@@ -292,124 +94,16 @@ def main_eval(
|
|
292 |
print(f" ERROR: Failed to parse JSON object from line: {line}. Error: {e}", flush=True)
|
293 |
|
294 |
# load metadata file
|
295 |
-
with open(os.path.join(local_submission_dir, submission_path_in_dataset, "metadata.json"), "r",
|
|
|
296 |
metadata = json.load(f)
|
297 |
|
298 |
print(f" Loaded {len(submitted_models)} generated models.", flush=True)
|
299 |
|
|
|
|
|
300 |
|
301 |
-
#
|
302 |
-
total_submitted_models = 0
|
303 |
-
models_ran_successfully = 0
|
304 |
-
consistency_checks_passed = 0
|
305 |
-
objective_checks_passed = 0
|
306 |
-
all_checks_passed = 0
|
307 |
-
gt_models_found = 0
|
308 |
-
|
309 |
-
with open(summary_file_path, "w", encoding="utf-8") as summary_f:
|
310 |
-
summary_f.write(f"Evaluation Summary for Submission: {submission_name_for_files}\n")
|
311 |
-
summary_f.write(f"User Data Repo: {user_dataset_repo_id}\n")
|
312 |
-
summary_f.write(f"Submission Path in Dataset: {submission_path_in_dataset}\n")
|
313 |
-
summary_f.write(f"Ground-Truth Dataset: {GT_DATASET_NAME}\n")
|
314 |
-
summary_f.write("-" * 30 + "\n")
|
315 |
-
|
316 |
-
# Iterate through downloaded submitted models
|
317 |
-
for submitted_model in submitted_models:
|
318 |
-
curr_model = submitted_model[GT_MODEL_CODE_COLUMN]
|
319 |
-
|
320 |
-
total_submitted_models += 1
|
321 |
-
problem_name = submitted_model[GT_PROBLEM_NAME_COLUMN]
|
322 |
-
print(f"\n Processing downloaded model: {problem_name}", flush=True)
|
323 |
-
summary_f.write(f"\n--- Model: {problem_name} ---\n")
|
324 |
-
|
325 |
-
summary_f.write(" 1. Running submitted model...\n")
|
326 |
-
|
327 |
-
succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT, modelling_language=metadata["modelling_framework"])
|
328 |
-
|
329 |
-
if succ_exec:
|
330 |
-
models_ran_successfully += 1
|
331 |
-
summary_f.write(" - SUCCESS: Model executed successfully.\n")
|
332 |
-
|
333 |
-
if timeout_occurred:
|
334 |
-
summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
|
335 |
-
continue
|
336 |
-
if not succ_exec:
|
337 |
-
summary_f.write(f" - FAILED: Execution failed with error: {output}\n")
|
338 |
-
continue
|
339 |
-
if output is None or not output.strip():
|
340 |
-
summary_f.write(f" - FAILED: No output from execution.\n")
|
341 |
-
continue
|
342 |
-
# Attempt to extract JSON from stdout
|
343 |
-
generated_solution = extract_json_from_code_output(output)
|
344 |
-
if generated_solution is None:
|
345 |
-
summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
|
346 |
-
continue
|
347 |
-
summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")
|
348 |
-
|
349 |
-
summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
|
350 |
-
if problem_name not in ground_truth_models:
|
351 |
-
summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
|
352 |
-
continue
|
353 |
-
gt_models_found += 1
|
354 |
-
ground_truth_script_content = ground_truth_models[problem_name]
|
355 |
-
summary_f.write(" - SUCCESS: Found ground-truth model.\n")
|
356 |
-
|
357 |
-
summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
|
358 |
-
modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
|
359 |
-
|
360 |
-
try:
|
361 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
|
362 |
-
dir=top_level_temp_dir) as tmp_file:
|
363 |
-
tmp_file.write(modified_gt_script)
|
364 |
-
tmp_file_path_str = tmp_file.name
|
365 |
-
|
366 |
-
gt_check_result = subprocess.run(
|
367 |
-
[sys.executable, tmp_file_path_str],
|
368 |
-
capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT, encoding='utf-8',
|
369 |
-
)
|
370 |
-
os.unlink(tmp_file_path_str)
|
371 |
-
|
372 |
-
gt_stdout = gt_check_result.stdout
|
373 |
-
if "SUCCESS: Model is consistent" in gt_stdout:
|
374 |
-
summary_f.write(" - CONSISTENCY: PASSED\n")
|
375 |
-
consistency_checks_passed += 1
|
376 |
-
else:
|
377 |
-
summary_f.write(
|
378 |
-
" - CONSISTENCY: FAILED (Details in logs or stdout)\n")
|
379 |
-
|
380 |
-
if "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout:
|
381 |
-
summary_f.write(" - OBJECTIVE: PASSED\n")
|
382 |
-
objective_checks_passed += 1
|
383 |
-
else:
|
384 |
-
summary_f.write(" - OBJECTIVE: FAILED (Details in logs or stdout)\n")
|
385 |
-
|
386 |
-
if "SUCCESS: Model is consistent" in gt_stdout and ("SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
|
387 |
-
summary_f.write(" - SELF-CONSISTENCY CHECK: PASSED fully\n")
|
388 |
-
all_checks_passed += 1
|
389 |
-
|
390 |
-
except Exception as e_gt_run:
|
391 |
-
summary_f.write(f" - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")
|
392 |
-
|
393 |
-
# Final statistics (write to summary_f)
|
394 |
-
summary_f.write("\n" + "=" * 30 + "\n")
|
395 |
-
summary_f.write("Overall Evaluation Statistics:\n")
|
396 |
-
summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
|
397 |
-
summary_f.write(f" Models That Ran Successfully: {models_ran_successfully}/{total_submitted_models}\n")
|
398 |
-
summary_f.write(f" Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully}\n")
|
399 |
-
summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
|
400 |
-
summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
|
401 |
-
summary_f.write("=" * 30 + "\n")
|
402 |
-
summary_f.write("Final Evaluation Summary:\n")
|
403 |
-
summary_f.write(f" Submission coverage perc: {float(total_submitted_models) / len(ground_truth_models) * 100:.2f}%\n")
|
404 |
-
summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
|
405 |
-
summary_f.write(f" Error perc: {(total_submitted_models - models_ran_successfully) / len(ground_truth_models) * 100:.2f}%\n")
|
406 |
-
summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
|
407 |
-
summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
|
408 |
-
summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
|
409 |
-
summary_f.write("-" * 30 + "\n")
|
410 |
-
|
411 |
-
# 4. Upload the entire local_result_dir_for_upload to HF Dataset
|
412 |
-
# This directory contains summary.txt and could contain other result files.
|
413 |
result_path_on_hub = f"{results_base_path_in_dataset}/{submission_name_for_files}"
|
414 |
print(f" Uploading results from '{local_result_dir_for_upload}' to '{result_path_on_hub}' on dataset...",
|
415 |
flush=True)
|
@@ -428,19 +122,3 @@ def main_eval(
|
|
428 |
|
429 |
elapsed_time = time.time() - start_time
|
430 |
print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
|
431 |
-
# return 0
|
432 |
-
|
433 |
-
|
434 |
-
# if __name__ == "__main__":
|
435 |
-
# if len(sys.argv) < 4:
|
436 |
-
# print(
|
437 |
-
# "Usage: python eval.py <user_dataset_repo_id> <submission_path_in_dataset> <results_base_path_in_dataset>")
|
438 |
-
# print("Example: python eval.py your-username/my-storage submissions/run123 results")
|
439 |
-
# sys.exit(1)
|
440 |
-
#
|
441 |
-
# arg_user_dataset_repo_id = sys.argv[1]
|
442 |
-
# arg_submission_path_in_dataset = sys.argv[2]
|
443 |
-
# arg_results_base_path_in_dataset = sys.argv[3]
|
444 |
-
#
|
445 |
-
# exit_code = main(arg_user_dataset_repo_id, arg_submission_path_in_dataset, arg_results_base_path_in_dataset)
|
446 |
-
# sys.exit(exit_code)
|
|
|
|
|
1 |
import time
|
2 |
import json
|
3 |
import tempfile
|
4 |
|
|
|
|
|
5 |
from huggingface_hub import HfApi, hf_hub_download
|
6 |
import os
|
|
|
|
|
7 |
import threading
|
8 |
from pathlib import Path
|
9 |
|
10 |
+
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH
|
11 |
+
from src.user_eval import evaluate_submission
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
def run_evaluation(submission_path):
|
|
|
27 |
return True
|
28 |
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
# --- Main Evaluation Logic ---
|
31 |
def main_eval(
|
32 |
user_dataset_repo_id: str,
|
|
|
56 |
# Path for the summary file within the local temporary result directory
|
57 |
summary_file_path = local_result_dir_for_upload / "summary.txt"
|
58 |
|
59 |
+
# Download submitted files from HF Dataset
|
60 |
print(f" Downloading submission files from '{submission_path_in_dataset}' to '{local_submission_dir}'...",
|
61 |
flush=True)
|
62 |
try:
|
|
|
81 |
print(f" CRITICAL ERROR - Failed to download submission files: {e_download}", flush=True)
|
82 |
return 1
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
# load generated models from jsonl to memory
|
85 |
print(f" Loading generated models from '{local_submission_dir}'...", flush=True)
|
86 |
submitted_models = []
|
87 |
+
with open(os.path.join(local_submission_dir, submission_path_in_dataset, "submission.jsonl"), "r",
|
88 |
+
encoding="utf-8") as f:
|
89 |
for line in f:
|
90 |
try:
|
91 |
json_obj = json.loads(line)
|
|
|
94 |
print(f" ERROR: Failed to parse JSON object from line: {line}. Error: {e}", flush=True)
|
95 |
|
96 |
# load metadata file
|
97 |
+
with open(os.path.join(local_submission_dir, submission_path_in_dataset, "metadata.json"), "r",
|
98 |
+
encoding="utf-8") as f:
|
99 |
metadata = json.load(f)
|
100 |
|
101 |
print(f" Loaded {len(submitted_models)} generated models.", flush=True)
|
102 |
|
103 |
+
# Writes stats to the summary file
|
104 |
+
evaluate_submission(submitted_models, summary_file_path, metadata["modelling_framework"], top_level_temp_dir)
|
105 |
|
106 |
+
# Upload the entire local_result_dir_for_upload to HF Dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
result_path_on_hub = f"{results_base_path_in_dataset}/{submission_name_for_files}"
|
108 |
print(f" Uploading results from '{local_result_dir_for_upload}' to '{result_path_on_hub}' on dataset...",
|
109 |
flush=True)
|
|
|
122 |
|
123 |
elapsed_time = time.time() - start_time
|
124 |
print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/hf_utils.py
CHANGED
@@ -73,10 +73,6 @@ def load_leaderboard_data():
|
|
73 |
for line in f:
|
74 |
if 'Error perc' in line:
|
75 |
entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
|
76 |
-
# if 'Execution perc' in line:
|
77 |
-
# entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
|
78 |
-
# if 'Consistency perc' in line:
|
79 |
-
# entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
|
80 |
if 'Final Solution Accuracy' in line:
|
81 |
entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
|
82 |
if 'Submission coverage perc' in line:
|
|
|
73 |
for line in f:
|
74 |
if 'Error perc' in line:
|
75 |
entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
|
|
|
|
|
|
|
|
|
76 |
if 'Final Solution Accuracy' in line:
|
77 |
entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
|
78 |
if 'Submission coverage perc' in line:
|
src/ui.py
CHANGED
@@ -97,14 +97,14 @@ def create_ui():
|
|
97 |
"## Important Notes\n"
|
98 |
"1. **Submission Name**: The submission name must be different from any existing submission names.\n"
|
99 |
"2. **File Format**: Ensure that the uploaded files are in the correct format. The submission file must be a `.jsonl` file, and the report must be a `pdf` file.\n"
|
100 |
-
"3. **Evaluation Script**: It is highly recommended to use the evaluation script provided [here](https://huggingface.co/spaces/kostis-init/CP-Bench-competition/blob/main/user_eval.py) to check your results before submission. You can run the script as follows:\n"
|
101 |
" ```bash\n"
|
102 |
-
" python user_eval.py --submission_file path/to/my/submission.jsonl\n"
|
103 |
" ```\n"
|
104 |
" This will evaluate your submission locally and print the results to the console.\n"
|
105 |
"4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
|
106 |
"\n\n"
|
107 |
-
"### If you have any questions or issues,
|
108 |
"---\n"
|
109 |
)
|
110 |
|
|
|
97 |
"## Important Notes\n"
|
98 |
"1. **Submission Name**: The submission name must be different from any existing submission names.\n"
|
99 |
"2. **File Format**: Ensure that the uploaded files are in the correct format. The submission file must be a `.jsonl` file, and the report must be a `pdf` file.\n"
|
100 |
+
"3. **Evaluation Script**: It is highly recommended to use the evaluation script provided [here](https://huggingface.co/spaces/kostis-init/CP-Bench-competition/blob/main/src/user_eval.py) to check your results before submission. You can run the script as follows:\n"
|
101 |
" ```bash\n"
|
102 |
+
" python user_eval.py --submission_file path/to/my/submission.jsonl --modelling_framework CPMpy\n"
|
103 |
" ```\n"
|
104 |
" This will evaluate your submission locally and print the results to the console.\n"
|
105 |
"4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
|
106 |
"\n\n"
|
107 |
+
"### If you have any questions or issues, feel free to reach out to us.\n"
|
108 |
"---\n"
|
109 |
)
|
110 |
|
user_eval.py β src/user_eval.py
RENAMED
@@ -1,17 +1,85 @@
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import subprocess
|
4 |
import sys
|
5 |
import tempfile
|
6 |
-
|
7 |
import click
|
8 |
from pathlib import Path
|
9 |
-
|
|
|
10 |
from datasets import load_dataset
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
GT_DATASET_NAME = "kostis-init/CP-Bench"
|
13 |
GT_PROBLEM_NAME_COLUMN = "id"
|
14 |
GT_MODEL_CODE_COLUMN = "model"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
def exec_code(code: str, timeout=10, modelling_language='cpmpy'):
|
@@ -25,27 +93,28 @@ def exec_code(code: str, timeout=10, modelling_language='cpmpy'):
|
|
25 |
"""
|
26 |
|
27 |
# create a temp directory to store the temporary file
|
28 |
-
temp_dir_name = "
|
29 |
temp_dir = os.path.join(os.getcwd(), temp_dir_name)
|
30 |
os.makedirs(temp_dir, exist_ok=True)
|
31 |
|
32 |
# write the code to a temporary file
|
33 |
-
suffix = '.__hidden_py__' if modelling_language ==
|
34 |
-
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=suffix, dir=temp_dir,
|
|
|
35 |
temp_instance_path = temp_file.name
|
36 |
temp_file.write(code)
|
37 |
|
38 |
try:
|
39 |
# execute the code
|
40 |
-
if modelling_language ==
|
41 |
command = [sys.executable, temp_instance_path]
|
42 |
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8')
|
43 |
|
44 |
successfully_executed = (result.returncode == 0)
|
45 |
output = result.stdout if successfully_executed else result.stderr
|
46 |
timeout_occurred = False
|
47 |
-
|
48 |
-
|
49 |
else:
|
50 |
raise ValueError(f"MODELLING_LANGUAGE not supported: {modelling_language}")
|
51 |
|
@@ -153,10 +222,125 @@ else:
|
|
153 |
"""
|
154 |
return modified_script
|
155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
@click.command()
|
157 |
@click.option('--submission_file', required=True, type=click.Path(exists=True, path_type=Path),
|
158 |
help='Path to the submission JSONL file')
|
159 |
-
|
|
|
|
|
|
|
160 |
"""Evaluate a submission file for the CP-Bench competition."""
|
161 |
is_valid, message = validate_submission_file(submission_file)
|
162 |
if not is_valid:
|
@@ -177,116 +361,30 @@ def main(submission_file: Path):
|
|
177 |
print(f" ERROR: Failed to parse JSON object from line: {line}. Error: {e}", flush=True)
|
178 |
print(f" Loaded {len(submitted_models)} generated models.", flush=True)
|
179 |
|
|
|
|
|
180 |
|
181 |
-
# eval
|
182 |
-
total_submitted_models = 0
|
183 |
-
models_ran_successfully = 0
|
184 |
-
consistency_checks_passed = 0
|
185 |
-
objective_checks_passed = 0
|
186 |
-
all_checks_passed = 0
|
187 |
-
gt_models_found = 0
|
188 |
-
|
189 |
-
# Load ground-truth models
|
190 |
-
print(f" Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
|
191 |
try:
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
}
|
198 |
-
if not ground_truth_models: raise ValueError("No models in GT dataset.")
|
199 |
-
print(f" Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
|
200 |
-
except Exception as e_gt:
|
201 |
-
print(f" CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
|
202 |
return
|
203 |
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
total_submitted_models += 1
|
209 |
-
problem_name = submitted_model[GT_PROBLEM_NAME_COLUMN]
|
210 |
-
print(f"\n Processing model: {problem_name}", flush=True)
|
211 |
-
print(f"\n--- Model: {problem_name} ---\n")
|
212 |
-
|
213 |
-
print(" 1. Running submitted model...\n")
|
214 |
-
|
215 |
-
succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=60)
|
216 |
-
|
217 |
-
if timeout_occurred:
|
218 |
-
print(f" - TIMEOUT: Execution time exceeded 60 seconds.\n")
|
219 |
-
continue
|
220 |
-
if not succ_exec:
|
221 |
-
print(f" - FAILED: Execution failed with error: {output}\n")
|
222 |
-
continue
|
223 |
-
if output is None or not output.strip():
|
224 |
-
print(f" - FAILED: No output from execution.\n")
|
225 |
-
continue
|
226 |
-
# Attempt to extract JSON from stdout
|
227 |
-
generated_solution = extract_json_from_code_output(output)
|
228 |
-
if generated_solution is None:
|
229 |
-
print(f" - FAILED: Could not extract JSON solution from output: {output}\n")
|
230 |
-
continue
|
231 |
-
|
232 |
-
models_ran_successfully += 1
|
233 |
-
print(f" - SUCCESS: Got solution: {generated_solution}\n")
|
234 |
-
|
235 |
-
print(f" 2. Checking against ground-truth for '{problem_name}'...\n")
|
236 |
-
if problem_name not in ground_truth_models:
|
237 |
-
print(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
|
238 |
-
continue
|
239 |
-
gt_models_found += 1
|
240 |
-
ground_truth_script_content = ground_truth_models[problem_name]
|
241 |
-
print(" - SUCCESS: Found ground-truth model.\n")
|
242 |
-
|
243 |
-
print(" 3. Performing self-consistency check on ground-truth model...\n")
|
244 |
-
modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
|
245 |
|
|
|
|
|
246 |
try:
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
capture_output=True, text=True, timeout=60, encoding='utf-8',
|
254 |
-
)
|
255 |
-
os.unlink(tmp_file_path_str)
|
256 |
-
|
257 |
-
gt_stdout = gt_check_result.stdout
|
258 |
-
if "SUCCESS: Model is consistent" in gt_stdout:
|
259 |
-
print(" - CONSISTENCY: PASSED\n")
|
260 |
-
consistency_checks_passed += 1
|
261 |
-
else:
|
262 |
-
print(
|
263 |
-
" - CONSISTENCY: FAILED (Details in logs or stdout)\n")
|
264 |
-
|
265 |
-
if "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout:
|
266 |
-
print(" - OBJECTIVE: PASSED\n")
|
267 |
-
objective_checks_passed += 1
|
268 |
-
else:
|
269 |
-
print(" - OBJECTIVE: FAILED (Details in logs or stdout)\n")
|
270 |
-
|
271 |
-
if "SUCCESS: Model is consistent" in gt_stdout and (
|
272 |
-
"SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
|
273 |
-
print(" - SELF-CONSISTENCY CHECK: PASSED fully\n")
|
274 |
-
all_checks_passed += 1
|
275 |
-
|
276 |
-
except Exception as e_gt_run:
|
277 |
-
print(f" - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")
|
278 |
-
|
279 |
-
# Final statistics (write to summary_f)
|
280 |
-
print("\n" + "=" * 30 + "\n")
|
281 |
-
print("Overall Evaluation:\n")
|
282 |
-
print(f" Total Submitted Models Parsed: {total_submitted_models}\n")
|
283 |
-
print(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
|
284 |
-
print(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
|
285 |
-
print(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
|
286 |
-
print(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
|
287 |
-
print("-" * 30 + "\n")
|
288 |
-
|
289 |
-
click.echo("Evaluation complete!")
|
290 |
|
291 |
|
292 |
if __name__ == "__main__":
|
|
|
1 |
+
import time
|
2 |
import json
|
3 |
import os
|
4 |
import subprocess
|
5 |
import sys
|
6 |
import tempfile
|
|
|
7 |
import click
|
8 |
from pathlib import Path
|
9 |
+
import minizinc
|
10 |
+
import datetime
|
11 |
from datasets import load_dataset
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
CPMPY_FRAMEWORK = "CPMpy"
|
15 |
+
MINIZINC_FRAMEWORK = "MiniZinc"
|
16 |
+
ORTOOLS_FRAMEWORK = "OR-Tools"
|
17 |
|
18 |
GT_DATASET_NAME = "kostis-init/CP-Bench"
|
19 |
GT_PROBLEM_NAME_COLUMN = "id"
|
20 |
GT_MODEL_CODE_COLUMN = "model"
|
21 |
+
SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
|
22 |
+
|
23 |
+
|
24 |
+
def exec_code_minizinc(code: str, timeout_sec):
    """
    Execute a MiniZinc model string using the minizinc-python library.

    :param code: The MiniZinc model code as a string.
    :param timeout_sec: The maximum time to wait for the solver in seconds.
    :return: A tuple of (successfully_executed, output, timeout_occurred)
    """
    successfully_executed = False
    output = ""
    timeout_occurred = False
    timeout_duration = datetime.timedelta(seconds=timeout_sec)

    try:
        # 1. Create a MiniZinc model instance from the source string.
        model = minizinc.Model()
        model.add_string(code)

        # 2. Find the Gecode solver.
        # NOTE: Solver.lookup() raises LookupError when the solver is not
        # installed -- it never returns None, so the original `is None`
        # check was dead code.  Convert the LookupError into the intended
        # RuntimeError so callers get a clear, uniform message.
        try:
            gecode = minizinc.Solver.lookup("gecode")
        except LookupError as lookup_err:
            raise RuntimeError(
                "No suitable solver found. Please install a MiniZinc solver."
            ) from lookup_err

        # 3. Create an Instance to solve.
        instance = minizinc.Instance(gecode, model)

        # 4. Solve the instance; solve() enforces the timeout internally.
        result = instance.solve(timeout=timeout_duration)

        # 5. Map the solver status onto the (success, output, timeout) triple.
        if result.status in {minizinc.Status.SATISFIED, minizinc.Status.OPTIMAL_SOLUTION}:
            successfully_executed = True
            output = str(result.solution) if result.solution is not None else ""
            timeout_occurred = False
        elif result.status == minizinc.Status.UNKNOWN:
            # UNKNOWN after a bounded solve is treated as a timeout here.
            successfully_executed = False
            output = f"Timeout Error: Solver stopped after {timeout_sec} seconds (Status: UNKNOWN)."
            timeout_occurred = True
        else:
            # Handle other non-success statuses (UNSAT, ERROR, etc.)
            successfully_executed = False
            output = f"Solving failed. Status: {result.status}"
            timeout_occurred = False

    except minizinc.MiniZincError as e:
        # Catch MiniZinc specific errors (e.g., syntax errors, solver not found)
        successfully_executed = False
        output = f"MiniZinc Error: {e}"
        timeout_occurred = False
    except Exception as e:
        # Catch other unexpected errors
        successfully_executed = False
        output = f"Unexpected Error during MiniZinc execution: {e}"
        timeout_occurred = False

    return successfully_executed, output, timeout_occurred
|
83 |
|
84 |
|
85 |
def exec_code(code: str, timeout=10, modelling_language='cpmpy'):
|
|
|
93 |
"""
|
94 |
|
95 |
# create a temp directory to store the temporary file
|
96 |
+
temp_dir_name = "temp_dir_for_exec_code"
|
97 |
temp_dir = os.path.join(os.getcwd(), temp_dir_name)
|
98 |
os.makedirs(temp_dir, exist_ok=True)
|
99 |
|
100 |
# write the code to a temporary file
|
101 |
+
suffix = '.__hidden_py__' if modelling_language == CPMPY_FRAMEWORK or modelling_language == ORTOOLS_FRAMEWORK else '.mzn'
|
102 |
+
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=suffix, dir=temp_dir,
|
103 |
+
encoding='utf-8') as temp_file:
|
104 |
temp_instance_path = temp_file.name
|
105 |
temp_file.write(code)
|
106 |
|
107 |
try:
|
108 |
# execute the code
|
109 |
+
if modelling_language == CPMPY_FRAMEWORK or modelling_language == ORTOOLS_FRAMEWORK:
|
110 |
command = [sys.executable, temp_instance_path]
|
111 |
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8')
|
112 |
|
113 |
successfully_executed = (result.returncode == 0)
|
114 |
output = result.stdout if successfully_executed else result.stderr
|
115 |
timeout_occurred = False
|
116 |
+
elif modelling_language == MINIZINC_FRAMEWORK:
|
117 |
+
successfully_executed, output, timeout_occurred = exec_code_minizinc(code, timeout)
|
118 |
else:
|
119 |
raise ValueError(f"MODELLING_LANGUAGE not supported: {modelling_language}")
|
120 |
|
|
|
222 |
"""
|
223 |
return modified_script
|
224 |
|
225 |
+
|
226 |
+
def evaluate_submission(submitted_models, summary_file_path, modelling_framw, top_lvl_temp_dir):
    """
    Evaluate a list of submitted models against the CP-Bench ground truth.

    For each submitted model this: (1) executes it and captures its JSON
    solution, (2) looks up the matching ground-truth model by problem id,
    and (3) injects the submitted solution into the ground-truth model and
    re-runs it as a self-consistency check.  A human-readable report is
    written to ``summary_file_path``.

    :param submitted_models: iterable of dicts, each with the problem id
        under ``GT_PROBLEM_NAME_COLUMN`` and model code under
        ``GT_MODEL_CODE_COLUMN`` (presumably parsed from the submission
        JSONL -- confirm against the caller).
    :param summary_file_path: path of the text report to write (overwritten).
    :param modelling_framw: modelling framework of the submission, passed
        through to ``exec_code`` as ``modelling_language``.
    :param top_lvl_temp_dir: existing directory used for the temporary
        ground-truth check scripts.
    :return: 1 if the ground-truth dataset could not be loaded, else None.
    """
    # Load ground-truth dataset
    print(f" Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
    try:
        gt_dataset = load_dataset(GT_DATASET_NAME, split="train", trust_remote_code=True)
        # Map problem id -> ground-truth model code, skipping rows with
        # missing/empty model code.
        ground_truth_models = {
            item[GT_PROBLEM_NAME_COLUMN]: item[GT_MODEL_CODE_COLUMN]
            for item in gt_dataset if
            GT_PROBLEM_NAME_COLUMN in item and GT_MODEL_CODE_COLUMN in item and item[GT_MODEL_CODE_COLUMN]
        }
        if not ground_truth_models: raise ValueError("No models in GT dataset.")
        print(f" Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
    except Exception as e_gt:
        # Without the ground truth no evaluation is possible: record the
        # failure in the summary file and bail out with an error code.
        print(f" CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
        with open(summary_file_path, "w") as f:
            f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{GT_DATASET_NAME}'.\nError: {e_gt}\n")
        return 1

    # Statistics
    total_submitted_models = 0        # submissions processed
    models_ran_successfully = 0       # submissions that executed without error
    consistency_checks_passed = 0     # GT model still satisfiable with the submitted solution
    all_checks_passed = 0             # consistency AND objective both pass

    with (open(summary_file_path, "w", encoding="utf-8") as summary_f):
        summary_f.write(f"Ground-Truth Dataset: {GT_DATASET_NAME}\n")
        summary_f.write("-" * 30 + "\n")

        # Iterate through downloaded submitted models
        for submitted_model in tqdm(submitted_models):
            curr_model = submitted_model[GT_MODEL_CODE_COLUMN]

            total_submitted_models += 1
            problem_name = submitted_model[GT_PROBLEM_NAME_COLUMN]
            print(f"\n Processing model: {problem_name}", flush=True)
            summary_f.write(f"\n--- Model: {problem_name} ---\n")

            # Step 1: run the submitted model and capture its output.
            summary_f.write(" 1. Running submitted model...\n")

            succ_exec, output, timeout_occurred = exec_code(curr_model, timeout=SCRIPT_EXECUTION_TIMEOUT,
                                                            modelling_language=modelling_framw)

            if succ_exec:
                models_ran_successfully += 1
                summary_f.write(" - SUCCESS: Model executed successfully.\n")

            # Any of the following failure modes skips the remaining checks
            # for this model.
            if timeout_occurred:
                summary_f.write(f" - TIMEOUT: Execution time exceeded {SCRIPT_EXECUTION_TIMEOUT} seconds.\n")
                continue
            if not succ_exec:
                summary_f.write(f" - FAILED: Execution failed with error: {output}\n")
                continue
            if output is None or not output.strip():
                summary_f.write(f" - FAILED: No output from execution.\n")
                continue
            # Attempt to extract JSON from stdout
            generated_solution = extract_json_from_code_output(output)
            if generated_solution is None:
                summary_f.write(f" - FAILED: Could not extract JSON solution from output: {output}\n")
                continue
            summary_f.write(f" - SUCCESS: Got solution: {generated_solution}\n")

            # Step 2: locate the ground-truth model for this problem id.
            summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
            if problem_name not in ground_truth_models:
                summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
                continue
            ground_truth_script_content = ground_truth_models[problem_name]
            summary_f.write(" - SUCCESS: Found ground-truth model.\n")

            # Step 3: inject the submitted solution into the ground-truth
            # model and re-run it; the modified script prints SUCCESS/FAILURE
            # markers that are parsed from its stdout below.
            summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
            modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)

            try:
                with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
                                                 dir=top_lvl_temp_dir) as tmp_file:
                    tmp_file.write(modified_gt_script)
                    tmp_file_path_str = tmp_file.name

                gt_check_result = subprocess.run(
                    [sys.executable, tmp_file_path_str],
                    capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT, encoding='utf-8',
                )
                os.unlink(tmp_file_path_str)

                gt_stdout = gt_check_result.stdout
                if "SUCCESS: Model is consistent" in gt_stdout:
                    summary_f.write(" - CONSISTENCY: PASSED\n")
                    consistency_checks_passed += 1
                else:
                    summary_f.write(" - CONSISTENCY: FAILED (Details in logs or stdout)\n")

                # Full pass additionally requires the objective check (or the
                # model having no objective at all).
                if "SUCCESS: Model is consistent" in gt_stdout and (
                        "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
                    summary_f.write(" - SELF-CONSISTENCY CHECK: PASSED fully\n")
                    all_checks_passed += 1

            except Exception as e_gt_run:
                summary_f.write(f" - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")

        # Final statistics (write to summary_f)
        # NOTE: percentages are relative to the size of the ground-truth
        # benchmark, not to the number of submitted models.
        summary_f.write("\n" + "=" * 30 + "\n")
        summary_f.write("Overall Evaluation Statistics:\n")
        summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
        summary_f.write(f" Models That Ran Successfully (out of the submitted models): {models_ran_successfully}/{total_submitted_models}\n")
        summary_f.write(f" Submission coverage perc: {float(total_submitted_models) / len(ground_truth_models) * 100:.2f}%\n")
        summary_f.write(f" Error perc: {(total_submitted_models - models_ran_successfully) / len(ground_truth_models) * 100:.2f}%\n")
        summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
        summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
        summary_f.write("-" * 30 + "\n")
|
335 |
+
|
336 |
+
|
337 |
@click.command()
|
338 |
@click.option('--submission_file', required=True, type=click.Path(exists=True, path_type=Path),
|
339 |
help='Path to the submission JSONL file')
|
340 |
+
@click.option('--modelling_framework', required=True,
|
341 |
+
type=click.Choice([CPMPY_FRAMEWORK, ORTOOLS_FRAMEWORK, MINIZINC_FRAMEWORK]),
|
342 |
+
help='Modelling framework used in the submission')
|
343 |
+
def main(submission_file: Path, modelling_framework: str):
|
344 |
"""Evaluate a submission file for the CP-Bench competition."""
|
345 |
is_valid, message = validate_submission_file(submission_file)
|
346 |
if not is_valid:
|
|
|
361 |
print(f" ERROR: Failed to parse JSON object from line: {line}. Error: {e}", flush=True)
|
362 |
print(f" Loaded {len(submitted_models)} generated models.", flush=True)
|
363 |
|
364 |
+
summary_file_path = Path("summary.txt")
|
365 |
+
top_level_temp_dir = tempfile.mkdtemp(prefix="cp_bench_eval_")
|
366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
try:
|
368 |
+
start_time = time.time()
|
369 |
+
evaluate_submission(submitted_models, summary_file_path, modelling_framework, top_level_temp_dir)
|
370 |
+
elapsed_time = time.time() - start_time
|
371 |
+
except Exception as e:
|
372 |
+
click.echo(f"Error during evaluation: {e}")
|
|
|
|
|
|
|
|
|
|
|
373 |
return
|
374 |
|
375 |
+
click.echo("Evaluation complete!")
|
376 |
+
click.echo(f"Results written to {summary_file_path}")
|
377 |
+
click.echo(f"Total evaluation time: {elapsed_time:.2f} seconds")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
|
379 |
+
# Clean up temporary directory
|
380 |
+
if os.path.exists(top_level_temp_dir):
|
381 |
try:
|
382 |
+
os.rmdir(top_level_temp_dir)
|
383 |
+
except OSError as e:
|
384 |
+
click.echo(f"Warning: Could not remove temporary directory {top_level_temp_dir}: {e}")
|
385 |
+
else:
|
386 |
+
click.echo(f"Temporary directory {top_level_temp_dir} does not exist, nothing to clean up.")
|
387 |
+
click.echo("Thank you for your submission!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
|
389 |
|
390 |
if __name__ == "__main__":
|
template_submission.jsonl
CHANGED
@@ -81,21 +81,21 @@
|
|
81 |
{"id": "cpmpy_examples__who_killed_agatha", "model": "from cpmpy import *\nimport json\n\n# Data\nnames = [\"Agatha herself\", \"the butler\", \"Charles\"]\n\n# Variables\nkiller = int_var(0, 2, name=\"killer\")\n\n# Constraints\n# Agatha hates everybody except the butler\nagatha_hates = [1, 2] # indices of people Agatha hates\n# The butler hates everyone not richer than Aunt Agatha\nbutler_hates = [0, 2] # indices of people the butler hates\n# The butler hates everyone whom Agatha hates\n# This is already included in butler_hates\n# Charles hates noone that Agatha hates\ncharles_hates = [] # indices of people Charles hates\n\n# Model\nmodel = Model(\n # A killer always hates, and is no richer than his victim\n [killer in agatha_hates, \n killer != 0], # Agatha cannot kill herself\n # Additional constraints based on the problem description\n [killer in butler_hates if killer == 1 else True], # butler hates everyone not richer than Agatha\n [killer not in agatha_hates if killer == 2 else True] # Charles hates noone that Agatha hates\n)\n\n# Solve\nif model.solve():\n # Print solution as JSON\n solution = {\"killer\": names[killer.value()]}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
82 |
{"id": "cpmpy_examples__wolf_goat_cabbage", "model": "from cpmpy import *\nimport json\n\n# Define the number of stages\nstage = 8\n\n# Define the decision variables\nwolf_pos = boolvar(shape=stage+1, name=\"wolf_pos\")\ngoat_pos = boolvar(shape=stage+1, name=\"goat_pos\")\ncabbage_pos = boolvar(shape=stage+1, name=\"cabbage_pos\")\nboat_pos = boolvar(shape=stage+1, name=\"boat_pos\")\n\n# Model\nmodel = Model(\n # Initial positions\n wolf_pos[0] == 0,\n goat_pos[0] == 0,\n cabbage_pos[0] == 0,\n boat_pos[0] == 0,\n\n # Final positions\n wolf_pos[stage] == 1,\n goat_pos[stage] == 1,\n cabbage_pos[stage] == 1,\n boat_pos[stage] == 1,\n\n # Boat movement\n [boat_pos[t] != boat_pos[t+1] for t in range(stage)],\n\n # Farmer is with the boat\n [wolf_pos[t] == boat_pos[t] or goat_pos[t] == boat_pos[t] or cabbage_pos[t] == boat_pos[t] for t in range(1, stage+1)],\n\n # Wolf and goat are not left together\n [(wolf_pos[t] != goat_pos[t]) or boat_pos[t] == wolf_pos[t] or boat_pos[t] == goat_pos[t] for t in range(1, stage+1)],\n\n # Goat and cabbage are not left together\n [(goat_pos[t] != cabbage_pos[t]) or boat_pos[t] == goat_pos[t] or boat_pos[t] == cabbage_pos[t] for t in range(1, stage+1)],\n\n # Wolf and cabbage are not left together (this constraint is not necessary, but it helps the solver)\n [(wolf_pos[t] != cabbage_pos[t]) or boat_pos[t] == wolf_pos[t] or boat_pos[t] == cabbage_pos[t] for t in range(1, stage+1)]\n)\n\n# Solve the model\nif model.solve():\n # Create the solution dictionary\n solution = {\n \"boat_pos\": [int(boat_pos[t].value()) for t in range(stage+1)],\n \"0\": 0,\n \"cabbage_pos\": [int(cabbage_pos[t].value()) for t in range(stage+1)],\n \"goat_pos\": [int(goat_pos[t].value()) for t in range(stage+1)],\n \"wolf_pos\": [int(wolf_pos[t].value()) for t in range(stage+1)],\n \"1\": 1\n }\n\n # Print the solution as a JSON object\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
83 |
{"id": "cpmpy_examples__zebra", "model": "from cpmpy import *\nimport json\n\n# Define the variables\ncolors = [\"yellow\", \"green\", \"red\", \"white\", \"blue\"]\nnations = [\"italy\", \"spain\", \"japan\", \"england\", \"norway\"]\njobs = [\"painter\", \"sculptor\", \"diplomat\", \"pianist\", \"doctor\"]\npets = [\"cat\", \"zebra\", \"bear\", \"snails\", \"horse\"]\ndrinks = [\"milk\", \"water\", \"tea\", \"coffee\", \"juice\"]\n\n# Define the model\nmodel = Model()\n\n# Define the decision variables\ncolor_vars = intvar(1, 5, shape=5, name=\"color\")\nnation_vars = intvar(1, 5, shape=5, name=\"nation\")\njob_vars = intvar(1, 5, shape=5, name=\"job\")\npet_vars = intvar(1, 5, shape=5, name=\"pet\")\ndrink_vars = intvar(1, 5, shape=5, name=\"drink\")\n\n# Constraints\nmodel += [AllDifferent(color_vars)]\nmodel += [AllDifferent(nation_vars)]\nmodel += [AllDifferent(job_vars)]\nmodel += [AllDifferent(pet_vars)]\nmodel += [AllDifferent(drink_vars)]\n\n# The painter owns the horse\nmodel += [job_vars[0] == 1, pet_vars[0] == 5]\n\n# The diplomat drinks coffee\nmodel += [job_vars[2] == 3, drink_vars[2] == 4]\n\n# The one who drinks milk lives in the white house\nmodel += [drink_vars[3] == 1, color_vars[3] == 4]\n\n# The Spaniard is a painter\nmodel += [nation_vars[1] == 2, job_vars[1] == 1]\n\n# The Englishman lives in the red house\nmodel += [nation_vars[3] == 4, color_vars[3] == 3]\n\n# The snails are owned by the sculptor\nmodel += [pet_vars[2] == 4, job_vars[2] == 2]\n\n# The green house is on the left of the red one\nmodel += [color_vars[1] == 2, color_vars[2] == 3]\n\n# The Norwegian lives on the right of the blue house\nmodel += [color_vars[4] == 5, nation_vars[4] == 5]\n\n# The doctor drinks milk\nmodel += [job_vars[4] == 5, drink_vars[4] == 1]\n\n# The diplomat is Japanese\nmodel += [nation_vars[2] == 3, job_vars[2] == 3]\n\n# The Norwegian owns the zebra\nmodel += [pet_vars[4] == 2, nation_vars[4] == 5]\n\n# The green house is next to the white one\nmodel += 
[color_vars[2] == 2, color_vars[3] == 4]\n\n# The horse is owned by the neighbor of the diplomat\nmodel += [pet_vars[1] == 5, job_vars[2] == 3]\n\n# The Italian either lives in the red, white or green house\nmodel += [Or([nation_vars[1] == 1, nation_vars[2] == 1, nation_vars[3] == 1])]\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = {\n \"colors\": [colors[i-1] for i in color_vars.value()],\n \"jobs\": [jobs[i-1] for i in job_vars.value()],\n \"nations\": [nations[i-1] for i in nation_vars.value()],\n \"pets\": [pets[i-1] for i in pet_vars.value()],\n \"drinks\": [drinks[i-1] for i in drink_vars.value()]\n }\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
84 |
-
{"id": "
|
85 |
-
{"id": "
|
86 |
-
{"id": "
|
87 |
-
{"id": "
|
88 |
-
{"id": "
|
89 |
-
{"id": "
|
90 |
-
{"id": "
|
91 |
-
{"id": "
|
92 |
-
{"id": "
|
93 |
-
{"id": "
|
94 |
-
{"id": "
|
95 |
-
{"id": "
|
96 |
-
{"id": "
|
97 |
-
{"id": "
|
98 |
-
{"id": "
|
99 |
-
{"id": "
|
100 |
-
{"id": "
|
101 |
-
{"id": "
|
|
|
81 |
{"id": "cpmpy_examples__who_killed_agatha", "model": "from cpmpy import *\nimport json\n\n# Data\nnames = [\"Agatha herself\", \"the butler\", \"Charles\"]\n\n# Variables\nkiller = int_var(0, 2, name=\"killer\")\n\n# Constraints\n# Agatha hates everybody except the butler\nagatha_hates = [1, 2] # indices of people Agatha hates\n# The butler hates everyone not richer than Aunt Agatha\nbutler_hates = [0, 2] # indices of people the butler hates\n# The butler hates everyone whom Agatha hates\n# This is already included in butler_hates\n# Charles hates noone that Agatha hates\ncharles_hates = [] # indices of people Charles hates\n\n# Model\nmodel = Model(\n # A killer always hates, and is no richer than his victim\n [killer in agatha_hates, \n killer != 0], # Agatha cannot kill herself\n # Additional constraints based on the problem description\n [killer in butler_hates if killer == 1 else True], # butler hates everyone not richer than Agatha\n [killer not in agatha_hates if killer == 2 else True] # Charles hates noone that Agatha hates\n)\n\n# Solve\nif model.solve():\n # Print solution as JSON\n solution = {\"killer\": names[killer.value()]}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
82 |
{"id": "cpmpy_examples__wolf_goat_cabbage", "model": "from cpmpy import *\nimport json\n\n# Define the number of stages\nstage = 8\n\n# Define the decision variables\nwolf_pos = boolvar(shape=stage+1, name=\"wolf_pos\")\ngoat_pos = boolvar(shape=stage+1, name=\"goat_pos\")\ncabbage_pos = boolvar(shape=stage+1, name=\"cabbage_pos\")\nboat_pos = boolvar(shape=stage+1, name=\"boat_pos\")\n\n# Model\nmodel = Model(\n # Initial positions\n wolf_pos[0] == 0,\n goat_pos[0] == 0,\n cabbage_pos[0] == 0,\n boat_pos[0] == 0,\n\n # Final positions\n wolf_pos[stage] == 1,\n goat_pos[stage] == 1,\n cabbage_pos[stage] == 1,\n boat_pos[stage] == 1,\n\n # Boat movement\n [boat_pos[t] != boat_pos[t+1] for t in range(stage)],\n\n # Farmer is with the boat\n [wolf_pos[t] == boat_pos[t] or goat_pos[t] == boat_pos[t] or cabbage_pos[t] == boat_pos[t] for t in range(1, stage+1)],\n\n # Wolf and goat are not left together\n [(wolf_pos[t] != goat_pos[t]) or boat_pos[t] == wolf_pos[t] or boat_pos[t] == goat_pos[t] for t in range(1, stage+1)],\n\n # Goat and cabbage are not left together\n [(goat_pos[t] != cabbage_pos[t]) or boat_pos[t] == goat_pos[t] or boat_pos[t] == cabbage_pos[t] for t in range(1, stage+1)],\n\n # Wolf and cabbage are not left together (this constraint is not necessary, but it helps the solver)\n [(wolf_pos[t] != cabbage_pos[t]) or boat_pos[t] == wolf_pos[t] or boat_pos[t] == cabbage_pos[t] for t in range(1, stage+1)]\n)\n\n# Solve the model\nif model.solve():\n # Create the solution dictionary\n solution = {\n \"boat_pos\": [int(boat_pos[t].value()) for t in range(stage+1)],\n \"0\": 0,\n \"cabbage_pos\": [int(cabbage_pos[t].value()) for t in range(stage+1)],\n \"goat_pos\": [int(goat_pos[t].value()) for t in range(stage+1)],\n \"wolf_pos\": [int(wolf_pos[t].value()) for t in range(stage+1)],\n \"1\": 1\n }\n\n # Print the solution as a JSON object\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
83 |
{"id": "cpmpy_examples__zebra", "model": "from cpmpy import *\nimport json\n\n# Define the variables\ncolors = [\"yellow\", \"green\", \"red\", \"white\", \"blue\"]\nnations = [\"italy\", \"spain\", \"japan\", \"england\", \"norway\"]\njobs = [\"painter\", \"sculptor\", \"diplomat\", \"pianist\", \"doctor\"]\npets = [\"cat\", \"zebra\", \"bear\", \"snails\", \"horse\"]\ndrinks = [\"milk\", \"water\", \"tea\", \"coffee\", \"juice\"]\n\n# Define the model\nmodel = Model()\n\n# Define the decision variables\ncolor_vars = intvar(1, 5, shape=5, name=\"color\")\nnation_vars = intvar(1, 5, shape=5, name=\"nation\")\njob_vars = intvar(1, 5, shape=5, name=\"job\")\npet_vars = intvar(1, 5, shape=5, name=\"pet\")\ndrink_vars = intvar(1, 5, shape=5, name=\"drink\")\n\n# Constraints\nmodel += [AllDifferent(color_vars)]\nmodel += [AllDifferent(nation_vars)]\nmodel += [AllDifferent(job_vars)]\nmodel += [AllDifferent(pet_vars)]\nmodel += [AllDifferent(drink_vars)]\n\n# The painter owns the horse\nmodel += [job_vars[0] == 1, pet_vars[0] == 5]\n\n# The diplomat drinks coffee\nmodel += [job_vars[2] == 3, drink_vars[2] == 4]\n\n# The one who drinks milk lives in the white house\nmodel += [drink_vars[3] == 1, color_vars[3] == 4]\n\n# The Spaniard is a painter\nmodel += [nation_vars[1] == 2, job_vars[1] == 1]\n\n# The Englishman lives in the red house\nmodel += [nation_vars[3] == 4, color_vars[3] == 3]\n\n# The snails are owned by the sculptor\nmodel += [pet_vars[2] == 4, job_vars[2] == 2]\n\n# The green house is on the left of the red one\nmodel += [color_vars[1] == 2, color_vars[2] == 3]\n\n# The Norwegian lives on the right of the blue house\nmodel += [color_vars[4] == 5, nation_vars[4] == 5]\n\n# The doctor drinks milk\nmodel += [job_vars[4] == 5, drink_vars[4] == 1]\n\n# The diplomat is Japanese\nmodel += [nation_vars[2] == 3, job_vars[2] == 3]\n\n# The Norwegian owns the zebra\nmodel += [pet_vars[4] == 2, nation_vars[4] == 5]\n\n# The green house is next to the white one\nmodel += 
[color_vars[2] == 2, color_vars[3] == 4]\n\n# The horse is owned by the neighbor of the diplomat\nmodel += [pet_vars[1] == 5, job_vars[2] == 3]\n\n# The Italian either lives in the red, white or green house\nmodel += [Or([nation_vars[1] == 1, nation_vars[2] == 1, nation_vars[3] == 1])]\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = {\n \"colors\": [colors[i-1] for i in color_vars.value()],\n \"jobs\": [jobs[i-1] for i in job_vars.value()],\n \"nations\": [nations[i-1] for i in nation_vars.value()],\n \"pets\": [pets[i-1] for i in pet_vars.value()],\n \"drinks\": [drinks[i-1] for i in drink_vars.value()]\n }\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
84 |
+
{"id": "aplai_course__1_bank_card", "model": "from cpmpy import *\nimport json\n\n# Define the model\na, b, c, d = int_var(0, 9), int_var(0, 9), int_var(0, 9), int_var(0, 9)\n\nmodel = Model(\n [a != b, a != c, a != d, b != c, b != d, c != d], # all digits are different\n [10*a + b <= 9*10 + 9], # ab is at most 99\n [10*c + d <= 9*10 + 9], # cd is at most 99\n [10*d + a <= 9*10 + 9], # da is at most 99\n [10*b + c <= 9*10 + 9], # bc is at most 99\n [3*(10*a + b) == (10*c + d)], # cd is 3 times ab\n [2*(10*b + c) == (10*d + a)], # da is 2 times bc\n)\n\n# Solve the model\nif model.solve():\n solution = {\"a\": a.value(), \"b\": b.value(), \"c\": c.value(), \"d\": d.value()}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
85 |
+
{"id": "aplai_course__5_climbing_stairs", "model": "from cpmpy import *\nimport json\n\n# Define the model\nn = 20 # Total steps\nm1 = 3 # Minimum steps at a time\nm2 = 5 # Maximum steps at a time\n\n# Define the variable\nsteps = intvar(0, m2, shape=n)\n\n# Define the model\nmodel = Model(\n # Objective: sum of steps equals n\n [sum(steps) == n],\n # Each step is between m1 and m2 (inclusive) or 0\n [steps[i] in [0, m1, m2] for i in range(n)],\n # Cumulative sum of steps does not exceed n at any point\n [cumsum(steps[:i+1]) <= n for i in range(n)],\n # If a step is not 0, it must be m1 or m2\n [steps[i] >= m1 for i in range(n)]\n)\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n print(json.dumps({\"steps\": [steps[i].value() for i in range(n)]}))\nelse:\n print(\"No solution found\")"}
|
86 |
+
{"id": "aplai_course__2_color_simple", "model": "from cpmpy import *\nimport json\n\n# Define the adjacency list\ngraph = [\n [3, 1],\n [3, 6],\n [3, 4],\n [6, 4],\n [6, 1],\n [1, 5],\n [1, 4],\n [4, 5],\n [4, 2]\n]\n\n# Define the number of countries\nnum_countries = len(graph)\n\n# Define the number of colors\nnum_colors = num_countries\n\n# Define the color variables\ncolors = intvar(1, num_colors, shape=num_countries, name=\"colors\")\n\n# Define the model\nmodel = Model()\n\n# Add constraints: two neighbouring countries cannot have the same color\nfor i in range(num_countries):\n for j in graph[i]:\n model += colors[i] != colors[j-1]\n\n# Solve the model\nif model.solve():\n # Print the colors assigned to each country as a JSON object\n output = {\"colors\": [int(c) for c in colors.value()]}\n print(json.dumps(output))\nelse:\n print(\"No solution found\")"}
|
87 |
+
{"id": "aplai_course__3_exodus", "model": "from cpmpy import *\nimport json\n\n# Define the variables\nages = [3, 5, 7, 8, 10]\nchildren = [\"Bernice\", \"Carl\", \"Debby\", \"Sammy\", \"Ted\"]\ncountries = [\"Ethiopia\", \"Kazakhstan\", \"Lithuania\", \"Morocco\", \"Yemen\"]\nstories = [\"burning bush\", \"captivity\", \"Moses\u2019s youth\", \"Passover\", \"Ten Commandments\"]\n\n# Define the model\nmodel = Model()\n\n# Define the decision variables\nchild = intvar(1, 5, shape=5, name=\"child\")\ncountry = intvar(1, 5, shape=5, name=\"country\")\nstory = intvar(1, 5, shape=5, name=\"story\")\nage = intvar(1, 5, shape=5, name=\"age\")\n\n# Constraints\nmodel += [AllDifferent(child)]\nmodel += [AllDifferent(country)]\nmodel += [AllDifferent(story)]\nmodel += [AllDifferent(age)]\n\n# Debby\u2019s family is from Lithuania\nmodel += [country[children.index(\"Debby\")] == countries.index(\"Lithuania\") + 1]\n\n# The child who told the story of the Passover is two years older than Bernice\nmodel += [age[stories.index(\"Passover\")] - age[children.index(\"Bernice\")] == 2]\n\n# The child whose family is from Yemen is younger than the child from the Ethiopian family\nmodel += [age[countries.index(\"Yemen\")] < age[countries.index(\"Ethiopia\")]]\n\n# The child from the Moroccan family is three years older than Ted\nmodel += [age[countries.index(\"Morocco\")] - age[children.index(\"Ted\")] == 3]\n\n# Sammy is three years older than the child who told the story of Moses\u2019s youth\nmodel += [age[children.index(\"Sammy\")] - age[stories.index(\"Moses\u2019s youth\")] == 3]\n\n# Solve the model\nif model.solve():\n # Get the solution\n child_sol = [child[i].value() for i in range(5)]\n country_sol = [country[i].value() for i in range(5)]\n story_sol = [story[i].value() for i in range(5)]\n age_sol = [age[i].value() for i in range(5)]\n\n # Map the solution to the original lists\n countries_sol = [countries[i-1] for i in country_sol]\n children_sol = [children[i-1] for i in 
child_sol]\n stories_sol = [stories[i-1] for i in story_sol]\n ages_sol = [ages[i-1] for i in age_sol]\n\n # Print the solution as a JSON object\n print(json.dumps({\n \"countries\": [countries.index(c) + 1 for c in countries_sol],\n \"children\": [children.index(c) + 1 for c in children_sol],\n \"stories\": [stories.index(s) + 1 for s in stories_sol],\n \"ages\": [ages.index(a) + 1 for a in ages_sol]\n }))\nelse:\n print(\"No solution found\")"}
|
88 |
+
{"id": "aplai_course__3_farmer_and_cows", "model": "from cpmpy import *\nimport json\n\n# Data\nnum_cows = 25\nnum_sons = 5\ncows_per_son = [7, 6, 5, 4, 3]\nmilk_per_cow = list(range(1, num_cows + 1))\n\n# Model\nmodel = Model()\n\n# Variables\ncow_assignments = intvar(0, num_sons - 1, shape=num_cows, name=\"cow_assignments\")\n\n# Constraints\nfor i, num in enumerate(cows_per_son):\n model += sum([cow_assignments[j] == i for j in range(num_cows)]) == num\n\nmilk_per_son = [[milk_per_cow[j] for j in range(num_cows) if cow_assignments[j] == i] for i in range(num_sons)]\nfor i in range(1, num_sons):\n model += sum(milk_per_cow[j] for j in range(num_cows) if cow_assignments[j] == 0) == sum(milk_per_cow[j] for j in range(num_cows) if cow_assignments[j] == i)\n\n# Solve\nif model.solve():\n solution = {\"cow_assignments\": [int(x.value()) for x in cow_assignments]}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
89 |
+
{"id": "aplai_course__1_five_floors", "model": "from cpmpy import *\nimport json\n\n# Define the variables\nB = int_var(1, 5, name=\"B\")\nC = int_var(1, 5, name=\"C\")\nF = int_var(1, 5, name=\"F\")\nM = int_var(1, 5, name=\"M\")\nS = int_var(1, 5, name=\"S\")\n\n# Define the model\nmodel = Model(\n [B != 5, # Baker does not live on the fifth floor\n C != 1, # Cooper does not live on the first floor\n F != 1, F != 5, # Fletcher does not live on either the fifth or the first floor\n M > C, # Miller lives on a higher floor than does Cooper\n abs(S - F) > 1, # Smith does not live on a floor adjacent to Fletcher's\n abs(F - C) > 1, # Fletcher does not live on a floor adjacent to Cooper's\n AllDifferent([B, C, F, M, S]) # They all live on different floors\n ]\n)\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = {\"B\": B.value(), \"C\": C.value(), \"F\": F.value(), \"M\": M.value(), \"S\": S.value()}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
90 |
+
{"id": "aplai_course__5_grocery", "model": "from cpmpy import *\nimport json\n\n# Define the model\nx = intvar(1, 100, shape=4, name=\"prices\")\n\n# Define the constraints\nmodel = Model(\n [x[0] * x[1] * x[2] * x[3] == 711],\n [sum(x) == 711]\n)\n\n# Solve the model\nif model.solve():\n # Get the solution\n solution = {f\"prices[{i}]\": x[i].value() for i in range(4)}\n solution[\"prices\"] = [x[i].value() for i in range(4)]\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
91 |
+
{"id": "aplai_course__1_guards_and_apples", "model": "from cpmpy import *\nimport json\n\n# Define the model\nx = intvar(1, 1000, shape=6, name=\"apples\")\n\n# Define the model constraints\nmodel = Model(\n [x[i] == (x[i-1] + 1) // 2 + (x[i-1] + 1) % 2 for i in range(1, 6)],\n [x[5] == 1]\n)\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n print(json.dumps({\"apples\": [int(val) for val in x.value]}))\nelse:\n print(\"No solution found\")"}
|
92 |
+
{"id": "aplai_course__5_hardy_1729_square", "model": "from cpmpy import *\nimport json\n\n# Model\na, b, c, d = intvar(1, 100, shape=4, name=\"vars\")\n\n# Constraints\nmodel = Model(\n AllDifferent([a, b, c, d]),\n a**2 + b**2 == c**2 + d**2\n)\n\n# Solve\nif model.solve():\n solution = {\n \"a\": a.value(),\n \"b\": b.value(),\n \"c\": c.value(),\n \"d\": d.value()\n }\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
93 |
+
{"id": "aplai_course__3_kidney_exchange", "model": "from cpmpy import *\nimport json\n\n# Data\nnum_people = 8\ncompatible = [\n [2, 3],\n [1, 6],\n [1, 4, 7],\n [2],\n [2],\n [5],\n [8],\n [3]\n]\n\n# Model\nmodel = Model()\n\n# Variable: transplants[i][j] is 1 if person i donates to person j, and 0 otherwise\ntransplants = [[model.new_int(var=0, name=f\"transplants_{i}_{j}\") for j in range(num_people)] for i in range(num_people)]\n\n# Constraints\nfor i in range(num_people):\n for j in range(num_people):\n model += [transplants[i][j] == 1].only_if(compatible[i].__contains__(j + 1))\n\nfor i in range(num_people):\n model += sum(transplants[i]) <= 1 # person i donates to at most one person\n model += sum([transplants[j][i] for j in range(num_people)]) <= 1 # person i receives at most one kidney\n\nfor i in range(num_people):\n model += sum(transplants[i]) == sum([transplants[j][i] for j in range(num_people)]) # person i donates if and only if they receive\n\n# Objective\nmodel.maximize(sum([sum(transplants[i]) for i in range(num_people)]))\n\n# Solve\nif model.solve():\n # Extract solution\n transplants_solution = [[transplants[i][j].value() for j in range(num_people)] for i in range(num_people)]\n\n # Print solution as JSON\n print(json.dumps({\"transplants\": transplants_solution}))\nelse:\n print(\"No solution found\")"}
|
94 |
+
{"id": "aplai_course__1_magic_square", "model": "from cpmpy import *\nimport json\n\n# Define the size of the magic square\nn = 4\n\n# Define the sum of integers in each row, column and diagonal\ntarget_sum = n * (n**2 + 1) // 2\n\n# Define the variables\nx = intvar(1, n**2, shape=(n, n), name=\"x\")\n\n# Define the model\nmodel = Model()\n\n# Each cell contains a different integer\nmodel += AllDifferent(x.flatten())\n\n# The sum of integers in each row is equal to the target sum\nfor i in range(n):\n model += sum(x[i, :]) == target_sum\n\n# The sum of integers in each column is equal to the target sum\nfor j in range(n):\n model += sum(x[:, j]) == target_sum\n\n# The sum of integers in the main diagonal is equal to the target sum\nmodel += sum([x[i, i] for i in range(n)]) == target_sum\n\n# The sum of integers in the anti-diagonal is equal to the target sum\nmodel += sum([x[i, n-i-1] for i in range(n)]) == target_sum\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = x.value()\n print(json.dumps({\"square\": [list(row) for row in solution]}))\nelse:\n print(\"No solution found\")"}
|
95 |
+
{"id": "aplai_course__2_maximal_independent_sets", "model": "from cpmpy import *\nimport json\n\n# Data\nn = 8\nadjacency_list = [\n [2, 3, 7],\n [1, 4, 8],\n [1, 4, 5],\n [2, 3, 6],\n [3, 6, 7],\n [4, 5, 8],\n [1, 5, 8],\n [2, 6, 7]\n]\n\n# Model\nmodel = Model()\n\n# Variables\nx = boolvar(shape=n, name=\"x\")\n\n# Constraints\nfor i in range(n):\n for j in adjacency_list[i]:\n model += ~x[i] | ~x[j-1] # subtract 1 because adjacency list is 1-indexed\n\n# Objective: maximize the number of nodes in the independent set\nmodel.maximize(sum(x))\n\n# Solve\nif model.solve():\n # Get the solution\n solution = {f\"node_{i+1}\": bool(x[i].value()) for i in range(n)}\n print(json.dumps({\"nodes\": solution}))\nelse:\n print(\"No solution found\")"}
|
96 |
+
{"id": "aplai_course__1_money_change", "model": "from cpmpy import *\nimport json\n\n# Define the available coins and their counts\ncoin_values = [1, 2, 5, 10, 25, 50]\ncoin_counts_available = [20, 10, 15, 8, 4, 2]\n\n# Define the target change\nchange = 199\n\n# Define the model\nmodel = Model()\n\n# Define the decision variables\ncoin_counts = intvar(0, max(coin_counts_available), shape=len(coin_values), name=\"coin_counts\")\n\n# Define the constraints\nmodel += (sum([coin_values[i] * coin_counts[i] for i in range(len(coin_values))]) == change)\nfor i in range(len(coin_values)):\n model += (coin_counts[i] <= coin_counts_available[i])\n\n# Define the objective\nmodel.minimize(sum(coin_counts))\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = {\"coin_counts\": [int(x) for x in coin_counts.value()]}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
97 |
+
{"id": "aplai_course__2_movie_scheduling", "model": "from cpmpy import *\nimport json\n\n# Input data\nmovies = [\n [\"Tarjan of the Jungle\", 4, 13],\n [\"The Four Volume Problem\", 17, 27],\n [\"The President's Algorist\", 1, 10],\n [\"Steiner's Tree\", 12, 18],\n [\"Process Terminated\", 23, 30],\n [\"Halting State\", 9, 16],\n [\"Programming Challenges\", 19, 25],\n [\"Discrete Mathematics\", 2, 7],\n [\"Calculated Bets\", 26, 31]\n]\n\n# Sort movies by end date\nmovies.sort(key=lambda x: x[2])\n\n# Model\nmodel = Model()\n\n# Decision variables\nselected_movies = [model.bool_var() for _ in range(len(movies))]\n\n# Constraints\nfor i in range(len(movies)):\n for j in range(i+1, len(movies)):\n model.add(selected_movies[i] + selected_movies[j] <= (movies[j][1] > movies[i][2]))\n\n# Objective\nmodel.maximize(sum(selected_movies))\n\n# Solve\nif model.solve():\n num_selected_movies = sum([int(var.value()) for var in selected_movies])\n solution = {\n \"selected_movies\": [int(var.value()) for var in selected_movies],\n \"num_selected_movies\": num_selected_movies\n }\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
98 |
+
{"id": "aplai_course__3_people_in_a_room", "model": "from cpmpy import *\nimport json\n\n# Model\nnum_people = 13\nnum_males = 4\nnum_females = num_people - num_males\n\n# Decision variable: sequence of people entering the room (0 = male, 1 = female)\nsequence = intvar(0, 1, shape=num_people, name=\"sequence\")\n\n# Constraints\nmodel = Model(\n # Exactly num_males males and num_females females\n sum(sequence) == num_females,\n # Ratio of females to males at any time is no greater than 7/3\n [sum(sequence[:i+1]) / (i + 1 - sum(sequence[:i+1])) <= 7/3 for i in range(num_people)]\n)\n\n# Solve\nif model.solve():\n # Print solution as JSON\n solution = {\"sequence\": [int(x) for x in sequence.value()]}\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
99 |
+
{"id": "aplai_course__2_subsets_100", "model": "from cpmpy import *\nimport json\n\n# Define the set of integers\nA = [81, 21, 79, 4, 29, 70, 28, 20, 14, 7]\n\n# Define the model\nmodel = Model()\n\n# Define the decision variables\nin_S = boolvar(shape=len(A), name=\"in_S\")\nin_T = boolvar(shape=len(A), name=\"in_T\")\n\n# Define the constraints\nmodel += [sum([A[i] * in_S[i] for i in range(len(A))]) == sum([A[i] * in_T[i] for i in range(len(A))])]\nmodel += [(in_S[i] + in_T[i]) <= 1 for i in range(len(A))]\nmodel += [sum(in_S) > 0]\nmodel += [sum(in_T) > 0]\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = {\n \"in_S\": [int(in_S[i].value()) for i in range(len(A))],\n \"in_T\": [int(in_T[i].value()) for i in range(len(A))]\n }\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|
100 |
+
{"id": "aplai_course__2_subset_sum", "model": "from cpmpy import *\nimport json\n\n# Define the possible number of coins in each bag\ncoin_values = [16, 17, 23, 24, 39, 40]\n\n# Define the number of coins lost\ntotal_coins_lost = 100\n\n# Define the decision variables\nbags = intvar(0, 100, shape=len(coin_values), name=\"bags\")\n\n# Define the model\nmodel = Model(\n sum([coin_values[i] * bags[i] for i in range(len(coin_values))]) == total_coins_lost\n)\n\n# Solve the model\nif model.solve():\n # Print the solution as a JSON object\n solution = {f\"bags_{i}\": bags[i].value() for i in range(len(coin_values))}\n print(json.dumps({\"bags\": solution}))\nelse:\n print(\"No solution found\")"}
|
101 |
+
{"id": "aplai_course__1_thick_as_thieves", "model": "from cpmpy import *\nimport json\n\n# Define the variables\nartie = boolvar(name=\"artie\")\nbill = boolvar(name=\"bill\")\ncrackitt = boolvar(name=\"crackitt\")\ndodgy = boolvar(name=\"dodgy\")\nedgy = boolvar(name=\"edgy\")\nfingers = boolvar(name=\"fingers\")\n\n# At least 4 of them are innocent\nmodel = Model(\n 4 <= sum([~artie, ~bill, ~crackitt, ~dodgy, ~edgy, ~fingers])\n)\n\n# ARTIE: \"It wasn't me.\"\nmodel += [artie == ~artie]\n\n# BILL: \"Crackitt was in it up to his neck.\"\nmodel += [bill == crackitt]\n\n# CRACKITT: \"No I wasn't.\"\nmodel += [crackitt == ~crackitt]\n\n# DODGY: \"If Crackitt did it, Bill did it with him.\"\nmodel += [crackitt <= bill]\n\n# EDGY: \"Nobody did it alone.\"\nmodel += [(~edgy) | (sum([artie, bill, crackitt, dodgy, fingers]) >= 2)]\n\n# FINGERS: \"That\u2019s right: it was Artie and Dodgy together.\"\nmodel += [fingers == (artie & dodgy)]\n\n# Solve the model\nif model.solve():\n solution = {\n \"artie\": artie.value(),\n \"bill\": bill.value(),\n \"crackitt\": crackitt.value(),\n \"dodgy\": dodgy.value(),\n \"edgy\": edgy.value(),\n \"fingers\": fingers.value()\n }\n print(json.dumps(solution))\nelse:\n print(\"No solution found\")"}
|