kostis-init committed on
Commit
180f9fe
·
1 Parent(s): 3267617

add extra hf dataset for persistent storage of submissions and results

Files changed (7)
  1. app.py +3 -241
  2. eval.py +0 -356
  3. src/config.py +11 -0
  4. src/eval.py +403 -0
  5. src/hf_utils.py +128 -0
  6. src/ui.py +83 -0
  7. src/utils.py +7 -0
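
The commit message describes storing submissions and evaluation results in an extra Hugging Face dataset repository so they persist outside the app's local filesystem. A minimal sketch of that round trip, assuming the kostis-init/my-storage dataset repo configured in src/config.py and a hypothetical local directory ./my_submission (sketch only, not the app's exact code):

# Sketch: upload a submission folder to the dataset repo, then read back the
# summary that src/eval.py later writes under results/<name>/summary.txt.
from huggingface_hub import HfApi, hf_hub_download

api = HfApi()  # authenticates via the HF_TOKEN environment variable

api.upload_folder(
    folder_path="./my_submission",               # hypothetical local directory
    path_in_repo="submissions/my_submission",    # DS_SUBMISSIONS_PATH/<name>
    repo_id="kostis-init/my-storage",            # DATASET_REPO_ID
    repo_type="dataset",
    commit_message="Upload submission: my_submission",
)

summary_path = hf_hub_download(
    repo_id="kostis-init/my-storage",
    filename="results/my_submission/summary.txt",  # DS_RESULTS_PATH/<name>/summary.txt
    repo_type="dataset",
)
print(open(summary_path, encoding="utf-8").read())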
app.py CHANGED
@@ -1,246 +1,8 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import os
4
- import shutil
5
- from pathlib import Path
6
- import subprocess # For running eval.py
7
- import time
8
- import threading # For background tasks
9
- import sys
10
 
11
- # --- Configuration ---
12
- SUBMISSIONS_DIR = "submissions"
13
- RESULTS_DIR = "results"
14
- EVAL_SCRIPT_PATH = "eval.py"
15
-
16
-
17
- # --- Helper Functions ---
18
-
19
- def setup_directories():
20
- """Creates the submissions and results directories if they don't exist."""
21
- os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
22
- os.makedirs(RESULTS_DIR, exist_ok=True)
23
- if not os.listdir(RESULTS_DIR): # Add a placeholder if results is empty
24
- initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
25
- if not initial_result_demo_path.exists():
26
- os.makedirs(initial_result_demo_path, exist_ok=True)
27
- with open(initial_result_demo_path / "summary.txt", "w") as f:
28
- f.write("This is a placeholder initial result.\nScore: 0\n")
29
- print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
30
-
31
-
32
- def load_leaderboard_data():
33
- """
34
- Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
35
- Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
36
- """
37
- if not os.path.exists(RESULTS_DIR):
38
- return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
39
-
40
- result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
41
-
42
- leaderboard_entries = []
43
- # Sort by modification time of the directory (newest first)
44
- # This requires getting mtime for each directory.
45
- sorted_result_dirs = sorted(
46
- result_dirs,
47
- key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
48
- reverse=True
49
- )
50
-
51
- for dir_name in sorted_result_dirs:
52
- entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
53
- result_dir_path = Path(RESULTS_DIR) / dir_name
54
-
55
- try:
56
- entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
57
- except Exception:
58
- pass # Directory might have been removed during scan
59
-
60
- summary_file = result_dir_path / "summary.txt"
61
- if summary_file.exists():
62
- try:
63
- with open(summary_file, "r") as f:
64
- for line in f:
65
- if line.lower().startswith("score:"):
66
- entry["Score"] = line.split(":", 1)[1].strip()
67
- break
68
- except Exception as e:
69
- print(f"Error parsing summary for {dir_name}: {e}")
70
-
71
- leaderboard_entries.append(entry)
72
-
73
- if not leaderboard_entries:
74
- return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
75
-
76
- return pd.DataFrame(leaderboard_entries)
77
-
78
-
79
- def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
80
- """
81
- This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
82
- Outputs from eval.py will go to the console where app.py is running.
83
- """
84
- print(
85
- f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")
86
-
87
- if not Path(EVAL_SCRIPT_PATH).exists():
88
- print(
89
- f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
90
- return
91
-
92
- command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
93
-
94
- try:
95
- # Using subprocess.run which is simpler for blocking calls within this thread
96
- process = subprocess.run(
97
- command,
98
- capture_output=True,
99
- text=True,
100
- check=False, # Handle non-zero exit codes manually
101
- timeout=300 # 5-minute timeout for the evaluation script
102
- )
103
-
104
- eval_output = process.stdout.strip()
105
- eval_error = process.stderr.strip()
106
-
107
- print(
108
- f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
109
- if eval_error: # Only print stderr if it's not empty
110
- print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")
111
-
112
- if process.returncode == 0:
113
- print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
114
- else:
115
- print(
116
- f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
117
-
118
- except subprocess.TimeoutExpired:
119
- print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
120
- except FileNotFoundError: # This means 'python' or EVAL_SCRIPT_PATH could not be found by subprocess
121
- print(
122
- f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure 'python' is in PATH and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
123
- except Exception as e:
124
- print(
125
- f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")
126
-
127
- print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
128
-
129
-
130
- def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
131
- """
132
- Handles directory upload, saves files, and starts eval.py in a background thread.
133
- Yields a status message for the UI. The leaderboard updates separately.
134
- """
135
- yield "Processing upload..." # Initial status
136
-
137
- if not uploaded_files_list:
138
- yield "No directory uploaded. Please select a directory."
139
- return
140
-
141
- try:
142
- # Determine original uploaded directory name
143
- first_temp_file_path = Path(uploaded_files_list[0].name)
144
- original_uploaded_dir_name = first_temp_file_path.parent.name
145
-
146
- submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name
147
-
148
- # Handle potential name collision
149
- if submission_dir_path.exists():
150
- timestamp = time.strftime("%Y%m%d-%H%M%S")
151
- descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
152
- submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
153
- status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
154
- original_uploaded_dir_name = descriptive_name_for_log_and_status # Use new name for logging
155
- else:
156
- descriptive_name_for_log_and_status = original_uploaded_dir_name
157
- status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."
158
-
159
- os.makedirs(submission_dir_path, exist_ok=True)
160
- progress(0.1, desc=status_update_msg)
161
-
162
- for i, temp_file_obj in enumerate(progress.tqdm(uploaded_files_list, desc="Copying files")):
163
- temp_file_path = Path(temp_file_obj.name)
164
- file_name_in_dir = temp_file_path.name
165
- target_file_path = submission_dir_path / file_name_in_dir
166
- shutil.copy(str(temp_file_path), str(target_file_path))
167
-
168
- upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
169
- progress(0.8, desc=upload_completion_msg)
170
-
171
- except Exception as e:
172
- yield f"Error during upload: {str(e)}"
173
- return
174
-
175
- # --- Start evaluation in a background thread ---
176
- if not Path(EVAL_SCRIPT_PATH).exists():
177
- yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
178
- return
179
-
180
- # Ensure paths passed to thread are absolute strings, good practice for threads.
181
- abs_submission_path = str(submission_dir_path.resolve())
182
- abs_results_path = str(Path(RESULTS_DIR).resolve())
183
-
184
- eval_thread = threading.Thread(
185
- target=run_evaluation_in_background,
186
- args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
187
- daemon=True # Set as daemon so it exits when main app exits
188
- )
189
- eval_thread.start()
190
-
191
- final_status_msg = (
192
- f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
193
- "The leaderboard will auto-refresh (or use manual refresh)."
194
- )
195
- progress(1.0, desc="Background evaluation initiated.")
196
- yield final_status_msg
197
-
198
-
199
- # --- Create Directories ---
200
  setup_directories()
201
 
202
- # --- Gradio App Definition ---
203
- with gr.Blocks(title="CP-Bench Leaderboard") as demo:
204
- gr.Markdown(
205
- """
206
- # CP-Bench Leaderboard
207
-
208
- This is a leaderboard for the CP-Bench dataset. You can upload your submission directory for evaluation.
209
- """
210
- )
211
-
212
- with gr.Row():
213
- with gr.Column(scale=1): # Upload Column
214
- gr.Markdown("## 📤 Upload Submission")
215
- upload_button = gr.UploadButton(
216
- "Click to Upload Directory for Evaluation",
217
- file_count="directory",
218
- )
219
- upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)
220
-
221
- with gr.Column(scale=3): # Leaderboard Column
222
- gr.Markdown("## 🏆 Results Leaderboard")
223
- leaderboard_df_component = gr.DataFrame(
224
- value=load_leaderboard_data, # Load initial data
225
- label="Leaderboard (auto-refreshes)",
226
- interactive=False,
227
- # every=20 # Auto-refresh leaderboard data every 20 seconds
228
- )
229
- refresh_leaderboard_button = gr.Button("🔄 Refresh Leaderboard Manually")
230
-
231
- # --- Event Handlers ---
232
- upload_button.upload(
233
- fn=handle_upload_and_kickoff_eval,
234
- inputs=[upload_button],
235
- outputs=[upload_status_textbox], # Only one output now for the status message
236
- show_progress="full"
237
- )
238
-
239
- refresh_leaderboard_button.click(
240
- fn=load_leaderboard_data,
241
- inputs=None,
242
- outputs=[leaderboard_df_component]
243
- )
244
-
245
  if __name__ == "__main__":
 
246
  demo.queue().launch()
 
1
+ from src.ui import create_ui
2
+ from src.utils import setup_directories
 
3
4
  setup_directories()
5
6
  if __name__ == "__main__":
7
+ demo = create_ui()
8
  demo.queue().launch()
eval.py DELETED
@@ -1,356 +0,0 @@
1
- # eval.py
2
- import sys
3
- import os
4
- import time
5
- import json
6
- import subprocess
7
- import tempfile
8
- from pathlib import Path
9
- from datasets import load_dataset # Hugging Face datasets library
10
-
11
- # --- Configuration ---
12
-
13
- DATASET_NAME = "kostis-init/CP-Bench"
14
-
15
- # Column names in the Hugging Face dataset for problem identifier and model script
16
- PROBLEM_NAME_COLUMN = "id"
17
- MODEL_CODE_COLUMN = "model"
18
-
19
- # Timeout for running individual model scripts (both generated and modified ground-truth)
20
- SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
21
-
22
-
23
- def extract_json_from_string(text_output: str):
24
- """
25
- Attempts to find and parse the first valid JSON object or array from a string.
26
- Handles cases where JSON is preceded or followed by non-JSON text.
27
- """
28
- idx = 0
29
- while idx < len(text_output):
30
- # Find the next potential start of a JSON structure
31
- start_brace = text_output.find('{', idx)
32
- start_bracket = text_output.find('[', idx)
33
-
34
- if start_brace == -1 and start_bracket == -1:
35
- # No more '{' or '[' found in the rest of the string
36
- return None
37
-
38
- # Determine the actual starting character for this attempt
39
- if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
40
- json_start_index = start_brace
41
- else:
42
- json_start_index = start_bracket
43
-
44
- potential_json_segment = text_output[json_start_index:]
45
-
46
- try:
47
- # Use raw_decode to parse the first valid JSON object from the segment
48
- decoder = json.JSONDecoder()
49
- json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
50
- # Successfully parsed a JSON object
51
- return json_obj
52
- except json.JSONDecodeError:
53
- # This segment (starting at json_start_index) wasn't a valid JSON.
54
- # Advance the search index past the character that caused the current attempt.
55
- idx = json_start_index + 1
56
-
57
- return None # No valid JSON found in the entire string
58
-
59
-
60
- def run_instance(instance_path_str: str,
61
- timeout: int = SCRIPT_EXECUTION_TIMEOUT): # SCRIPT_EXECUTION_TIMEOUT should be defined
62
- """Run the instance file and robustly capture the JSON output."""
63
- command = [sys.executable, instance_path_str]
64
- instance_name = Path(instance_path_str).name
65
- try:
66
- result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
67
- errors='replace')
68
-
69
- # Check return code first
70
- if result.returncode != 0:
71
- # Log stderr for debugging if the script itself failed
72
- error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
73
- print(f" ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
74
- return None
75
-
76
- # Attempt to extract JSON from stdout
77
- stdout_text = result.stdout
78
- if not stdout_text or not stdout_text.strip():
79
- print(f" ERROR: No stdout from {instance_name}.", flush=True)
80
- return None
81
-
82
- solution = extract_json_from_string(stdout_text)
83
-
84
- if solution is None:
85
- # Be more verbose if JSON extraction fails
86
- abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300] # Show newlines as \n for brevity
87
- print(
88
- f" ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
89
- flush=True)
90
- return None
91
-
92
- return solution
93
-
94
- except subprocess.TimeoutExpired:
95
- print(f" ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
96
- return None
97
- except Exception as e:
98
- print(f" ERROR: Unexpected error running {instance_name}: {e}", flush=True)
99
- return None
100
-
101
-
102
- def add_constraints_as_string(solution):
103
- """Generate constraints as a string to be added to the original script."""
104
- constraints = ""
105
- if solution: # Ensure solution is not None
106
- for key, value in solution.items():
107
- # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
108
- if isinstance(value, str):
109
- constraints += f"\nmodel += ({key} == \"{value}\")"
110
- else:
111
- constraints += f"\nmodel += ({key} == {value})"
112
- return constraints
113
-
114
-
115
- def get_modified_script(script_content, solution):
116
- """Add constraints to the script content and self-consistency checks."""
117
- constraints_str = add_constraints_as_string(solution)
118
- modified_script = f"{script_content}\n{constraints_str}"
119
- modified_script += """
120
-
121
- # --- Self-consistency check appended by eval.py ---
122
- # Print the absolute path of the current directory along with the script name
123
- import os
124
- # print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}") # Optional debug
125
-
126
- # Keep old objective
127
- old_objective_value = None
128
- objective_defined = False
129
- if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
130
- try:
131
- # This block assumes 'model' is the CPMpy model object or similar
132
- # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
133
- # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
134
- # This part might need adjustment based on the specific modeling library used in CP-Bench.
135
- # For now, we'll try to get it and catch errors.
136
- # A more robust way might be to inspect model.objective_
137
- if hasattr(model, '_objective_value'): # cpmpy specific check if objective was set
138
- if model._objective_value is not None: # cpmpy does not have objective_is_min
139
- objective_defined = True
140
- old_objective_value = model.objective_value()
141
-
142
- except Exception as e_obj_check:
143
- # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
144
- pass # Objective might not be set or model not solved yet.
145
-
146
- # Check self-consistency
147
- solved_ok = False
148
- try:
149
- if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
150
- solved_ok = model.solve()
151
- else:
152
- print('ERROR: Model object not found or does not have a solve() method.')
153
- except Exception as e_solve:
154
- print(f'ERROR: Exception during model.solve(): {e_solve}')
155
- solved_ok = False # Ensure it's false on exception
156
-
157
- if not solved_ok:
158
- print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
159
- else:
160
- print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')
161
-
162
- # Check if the objective value is the same
163
- if not objective_defined:
164
- print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
165
- else:
166
- try:
167
- current_objective_value = model.objective_value()
168
- # Handle potential floating point inaccuracies if objectives can be floats
169
- if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
170
- if abs(current_objective_value - old_objective_value) < 1e-6: # Tolerance for float comparison
171
- print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
172
- else:
173
- print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
174
- elif current_objective_value != old_objective_value: # Integer comparison
175
- print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
176
- else:
177
- print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
178
- except Exception as e_obj_final:
179
- print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')
180
-
181
- """
182
- return modified_script
183
-
184
-
185
- # --- Main Evaluation Logic ---
186
- def main(submission_path_str: str, results_base_dir_str: str):
187
- start_time = time.time()
188
- print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
189
- print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
190
- print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)
191
-
192
- submission_path = Path(submission_path_str)
193
- submission_name = submission_path.name
194
- result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
195
- os.makedirs(result_dir_for_submission, exist_ok=True)
196
- summary_file_path = result_dir_for_submission / "summary.txt"
197
-
198
- # Load ground-truth dataset
199
- try:
200
- # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
201
- gt_dataset = load_dataset(DATASET_NAME, split="train")
202
- ground_truth_models = {
203
- item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
204
- for item in gt_dataset
205
- if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
206
- }
207
- if not ground_truth_models:
208
- raise ValueError(
209
- f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
210
- print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
211
- except Exception as e:
212
- print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
213
- with open(summary_file_path, "w") as f:
214
- f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
215
- return 1 # Indicate failure
216
-
217
- # Statistics
218
- total_submitted_models = 0
219
- models_ran_successfully = 0
220
- gt_models_found = 0
221
- consistency_checks_passed = 0
222
- objective_checks_passed = 0 # Includes "NO_OBJECTIVE_DEFINED" as a pass
223
-
224
- with open(summary_file_path, "w") as summary_f:
225
- summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
226
- summary_f.write(
227
- f"Ground-Truth Dataset: {DATASET_NAME}\n")
228
- summary_f.write("-" * 30 + "\n")
229
-
230
- submitted_model_files = list(submission_path.glob('*.py')) # Assuming Python models
231
- if not submitted_model_files:
232
- summary_f.write("No .py model files found in submission.\n")
233
- print("eval.py: No .py model files found in submission.", flush=True)
234
- return 0 # No models to evaluate, but script ran.
235
-
236
- for model_file_path in submitted_model_files:
237
- total_submitted_models += 1
238
- problem_name = model_file_path.stem # Filename without .py extension
239
- print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
240
- summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
241
-
242
- # 1. Run the submitted model to get its solution
243
- summary_f.write(" 1. Running submitted model...\n")
244
- generated_solution = run_instance(str(model_file_path))
245
- if generated_solution is None:
246
- summary_f.write(" - FAILED to run or get valid JSON solution from submitted model.\n")
247
- continue # Move to the next model
248
- models_ran_successfully += 1
249
- summary_f.write(f" - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")
250
-
251
- # 2. Find corresponding ground-truth model
252
- summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
253
- if problem_name not in ground_truth_models:
254
- summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
255
- print(f" WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
256
- continue
257
- gt_models_found += 1
258
- ground_truth_script_content = ground_truth_models[problem_name]
259
- summary_f.write(" - SUCCESS: Found ground-truth model.\n")
260
-
261
- # 3. Modify ground-truth script with solution and run self-consistency check
262
- summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
263
- modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
264
-
265
- consistency_passed_this_model = False
266
- objective_passed_this_model = False
267
-
268
- try:
269
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
270
- tmp_file.write(modified_gt_script)
271
- tmp_file_path_str = tmp_file.name
272
-
273
- # Run the modified ground-truth script
274
- gt_check_result = subprocess.run(
275
- [sys.executable, tmp_file_path_str],
276
- capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT
277
- )
278
- os.unlink(tmp_file_path_str) # Clean up temp file
279
-
280
- # 4. Parse output of modified ground-truth
281
- gt_stdout = gt_check_result.stdout
282
- gt_stderr = gt_check_result.stderr
283
- # summary_f.write(f" Modified GT STDOUT: {gt_stdout[:500]}...\n") # For debugging
284
- if gt_stderr:
285
- summary_f.write(f" Modified GT STDERR: {gt_stderr[:500]}...\n")
286
-
287
- if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
288
- summary_f.write(" - CONSISTENCY: PASSED\n")
289
- consistency_checks_passed += 1
290
- consistency_passed_this_model = True
291
- elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
292
- summary_f.write(" - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
293
- else:
294
- summary_f.write(" - CONSISTENCY: FAILED (Could not determine consistency from output)\n")
295
-
296
- if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
297
- "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
298
- summary_f.write(" - OBJECTIVE: PASSED (Consistent or no objective)\n")
299
- objective_checks_passed += 1
300
- objective_passed_this_model = True
301
- elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
302
- summary_f.write(f" - OBJECTIVE: FAILED (Value changed)\n")
303
- elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
304
- summary_f.write(f" - OBJECTIVE: FAILED (Error accessing final objective)\n")
305
- else:
306
- summary_f.write(" - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")
307
-
308
- except subprocess.TimeoutExpired:
309
- summary_f.write(
310
- f" - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
311
- print(f" ERROR: Timeout running modified GT for {problem_name}", flush=True)
312
- except Exception as e_gt_run:
313
- summary_f.write(
314
- f" - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
315
- print(f" ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
316
-
317
- # Final statistics
318
- summary_f.write("\n" + "=" * 30 + "\n")
319
- summary_f.write("Overall Evaluation Statistics:\n")
320
- summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
321
- summary_f.write(
322
- f" Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
323
- summary_f.write(
324
- f" Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
325
- summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
326
- summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
327
-
328
- # Define an overall score, e.g. number of models that passed both checks against found GT
329
- fully_passed_models = 0
330
- # This needs re-evaluation logic, but for now let's say a score is consistency+objective passes
331
- # This simple score is just the sum of passes, could be more nuanced
332
- overall_score = consistency_checks_passed + objective_checks_passed
333
- summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n") # For Gradio app to parse
334
-
335
- elapsed_time = time.time() - start_time
336
- print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
337
- print(f"eval.py: Summary written to {summary_file_path}", flush=True)
338
- return 0 # Success
339
-
340
-
341
- if __name__ == "__main__":
342
- if len(sys.argv) < 3:
343
- print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
344
- print("Example: python eval.py ./submissions/my_run ./results")
345
- sys.exit(1)
346
-
347
- submission_dir = sys.argv[1]
348
- results_base_dir = sys.argv[2]
349
-
350
- # Simple check if submission_dir exists
351
- if not Path(submission_dir).is_dir():
352
- print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
353
- sys.exit(1)
354
-
355
- exit_code = main(submission_dir, results_base_dir)
356
- sys.exit(exit_code)
 
src/config.py ADDED
@@ -0,0 +1,11 @@
1
+ # File and directory paths
2
+ EVAL_SCRIPT_PATH = "src/eval.py"
3
+ LOCAL_TEMP_SUBMISSIONS_DIR = "../temp_submissions_app"
4
+
5
+ # Hugging Face Dataset Configuration
6
+ DATASET_REPO_ID = "kostis-init/my-storage"
7
+ DS_SUBMISSIONS_PATH = "submissions"
8
+ DS_RESULTS_PATH = "results"
9
+
10
+ # leaderboard
11
+ LDB_COLS = ["Submission Name", "Execution (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
src/eval.py ADDED
@@ -0,0 +1,403 @@
1
+ # eval.py
2
+ import sys
3
+ import os
4
+ import time
5
+ import json
6
+ import subprocess
7
+ import tempfile
8
+ from pathlib import Path
9
+ from datasets import load_dataset # Hugging Face datasets library
10
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download # For user data dataset
11
+ from huggingface_hub.utils import RepositoryNotFoundError
12
+
13
+ # --- Configuration ---
14
+
15
+ GT_DATASET_NAME = "kostis-init/CP-Bench"
16
+
17
+ # Column names in the Hugging Face dataset for problem identifier and model script
18
+ GT_PROBLEM_NAME_COLUMN = "id"
19
+ GT_MODEL_CODE_COLUMN = "model"
20
+
21
+ # Timeout for running individual model scripts (both generated and modified ground-truth)
22
+ SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
23
+
24
+ """Handles evaluation of submissions."""
25
+
26
+ import os
27
+ import sys
28
+ import subprocess
29
+ import threading
30
+ from pathlib import Path
31
+
32
+ from src.config import EVAL_SCRIPT_PATH, DATASET_REPO_ID, DS_RESULTS_PATH
33
+
34
+
35
+ def run_evaluation(submission_path):
36
+
37
+ if not Path(EVAL_SCRIPT_PATH).exists():
38
+ print(f"ERROR: Eval script '{EVAL_SCRIPT_PATH}' not found")
39
+ return
40
+
41
+ print(f"Starting evaluation for: {submission_path}")
42
+
43
+ command = [
44
+ sys.executable,
45
+ EVAL_SCRIPT_PATH,
46
+ DATASET_REPO_ID,
47
+ submission_path,
48
+ DS_RESULTS_PATH
49
+ ]
50
+
51
+ try:
52
+ process = subprocess.run(
53
+ command,
54
+ capture_output=True,
55
+ text=True,
56
+ check=False,
57
+ timeout=600,
58
+ encoding='utf-8',
59
+ )
60
+
61
+ if process.returncode == 0:
62
+ print(f"Evaluation successful for: {submission_path}")
63
+ else:
64
+ print(f"Evaluation failed for: {submission_path}")
65
+ print(f"STDERR: {process.stderr}")
66
+
67
+ except subprocess.TimeoutExpired:
68
+ print(f"Evaluation timed out for: {submission_path}")
69
+ except Exception as e:
70
+ print(f"Error running evaluation: {e}")
71
+
72
+ print(f"Evaluation process complete for: {submission_path}")
73
+
74
+
75
+ def start_background_evaluation(submission_path):
76
+ """Start evaluation in a background thread."""
77
+ thread = threading.Thread(
78
+ target=lambda: run_evaluation(submission_path),
79
+ daemon=True
80
+ )
81
+ thread.start()
82
+ return True
83
+
84
+
85
+ def extract_json_from_string(text_output: str):
86
+ """
87
+ Attempts to find and parse the first valid JSON object or array from a string.
88
+ Handles cases where JSON is preceded or followed by non-JSON text.
89
+ """
90
+ idx = 0
91
+ while idx < len(text_output):
92
+ # Find the next potential start of a JSON structure
93
+ start_brace = text_output.find('{', idx)
94
+ start_bracket = text_output.find('[', idx)
95
+
96
+ if start_brace == -1 and start_bracket == -1:
97
+ # No more '{' or '[' found in the rest of the string
98
+ return None
99
+
100
+ # Determine the actual starting character for this attempt
101
+ if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
102
+ json_start_index = start_brace
103
+ else:
104
+ json_start_index = start_bracket
105
+
106
+ potential_json_segment = text_output[json_start_index:]
107
+
108
+ try:
109
+ # Use raw_decode to parse the first valid JSON object from the segment
110
+ decoder = json.JSONDecoder()
111
+ json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
112
+ # Successfully parsed a JSON object
113
+ return json_obj
114
+ except json.JSONDecodeError:
115
+ # This segment (starting at json_start_index) wasn't a valid JSON.
116
+ # Advance the search index past the character that caused the current attempt.
117
+ idx = json_start_index + 1
118
+
119
+ return None # No valid JSON found in the entire string
120
+
121
+
122
+ def run_instance(instance_path_str: str,
123
+ timeout: int = SCRIPT_EXECUTION_TIMEOUT): # SCRIPT_EXECUTION_TIMEOUT should be defined
124
+ """Run the instance file and robustly capture the JSON output."""
125
+ command = [sys.executable, instance_path_str]
126
+ instance_name = Path(instance_path_str).name
127
+ try:
128
+ result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
129
+ errors='replace')
130
+
131
+ # Check return code first
132
+ if result.returncode != 0:
133
+ # Log stderr for debugging if the script itself failed
134
+ error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
135
+ print(f" ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
136
+ return None
137
+
138
+ # Attempt to extract JSON from stdout
139
+ stdout_text = result.stdout
140
+ if not stdout_text or not stdout_text.strip():
141
+ print(f" ERROR: No stdout from {instance_name}.", flush=True)
142
+ return None
143
+
144
+ solution = extract_json_from_string(stdout_text)
145
+
146
+ if solution is None:
147
+ # Be more verbose if JSON extraction fails
148
+ abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300] # Show newlines as \n for brevity
149
+ print(
150
+ f" ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
151
+ flush=True)
152
+ return None
153
+
154
+ return solution
155
+
156
+ except subprocess.TimeoutExpired:
157
+ print(f" ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
158
+ return None
159
+ except Exception as e:
160
+ print(f" ERROR: Unexpected error running {instance_name}: {e}", flush=True)
161
+ return None
162
+
163
+
164
+ def add_constraints_as_string(solution):
165
+ """Generate constraints as a string to be added to the original script."""
166
+ constraints = ""
167
+ if solution: # Ensure solution is not None
168
+ for key, value in solution.items():
169
+ # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
170
+ if isinstance(value, str):
171
+ constraints += f"\nmodel += ({key} == \"{value}\")"
172
+ else:
173
+ constraints += f"\nmodel += ({key} == {value})"
174
+ return constraints
175
+
176
+
177
+ def get_modified_script(script_content, solution):
178
+ """Add constraints to the script content and self-consistency checks."""
179
+ constraints_str = add_constraints_as_string(solution)
180
+ modified_script = f"{script_content}\n{constraints_str}"
181
+ modified_script += """
182
+ # Print the absolute path of the current directory along with the script name
183
+ import os
184
+ print(os.path.abspath(__file__))
185
+
186
+ # Keep old objective
187
+ old_objective = None
188
+ if hasattr(model, 'objective_is_min') and model.objective_is_min is not None:
189
+ old_objective = model.objective_value()
190
+
191
+ # Check self-consistency
192
+ if not model.solve():
193
+ print('ERROR: The model is unsatisfiable with the self-consistency constraints')
194
+ else:
195
+ print('SUCCESS: Model is consistent')
196
+
197
+ # Check if the objective value is the same
198
+ if old_objective is None:
199
+ print('SUCCESS: No objective defined')
200
+ elif model.objective_value() != old_objective:
201
+ print('ERROR: The objective value has changed')
202
+ else:
203
+ print('SUCCESS: Objective value is consistent')
204
+ """
205
+ return modified_script
206
+
207
+
208
+ # --- Main Evaluation Logic ---
209
+ def main(
210
+ user_dataset_repo_id: str,
211
+ submission_path_in_dataset: str, # e.g., "submissions/uploaded_dir_name"
212
+ results_base_path_in_dataset: str # e.g., "results"
213
+ ):
214
+ start_time = time.time()
215
+ # Infer submission name for logging and result path generation
216
+ submission_name_for_files = Path(submission_path_in_dataset).name
217
+
218
+ print(f"eval.py: Starting evaluation for submission: '{submission_name_for_files}'", flush=True)
219
+ print(f" User Data Repo: {user_dataset_repo_id}", flush=True)
220
+ print(f" Submission to download from: {submission_path_in_dataset}", flush=True)
221
+ print(f" Results to upload to: {results_base_path_in_dataset}/{submission_name_for_files}", flush=True)
222
+
223
+ hf_api = HfApi() # Will use HF_TOKEN from environment
224
+
225
+ # Create a top-level temporary directory for all operations for this eval run
226
+ with tempfile.TemporaryDirectory(prefix="eval_run_") as top_level_temp_dir_str:
227
+ top_level_temp_dir = Path(top_level_temp_dir_str)
228
+ local_submission_dir = top_level_temp_dir / "submissions"
229
+ local_result_dir_for_upload = top_level_temp_dir / "results"
230
+
231
+ os.makedirs(local_submission_dir, exist_ok=True)
232
+ os.makedirs(local_result_dir_for_upload, exist_ok=True)
233
+
234
+ # Path for the summary file within the local temporary result directory
235
+ summary_file_path = local_result_dir_for_upload / "summary.txt"
236
+
237
+ # 1. Download submitted files from HF Dataset
238
+ print(f" Downloading submission files from '{submission_path_in_dataset}' to '{local_submission_dir}'...",
239
+ flush=True)
240
+ try:
241
+ # Download the relevant submission files
242
+ snapshot_download(
243
+ repo_id=user_dataset_repo_id,
244
+ repo_type="dataset",
245
+ local_dir=local_submission_dir,
246
+ allow_patterns=[f"{submission_path_in_dataset}/*"],
247
+ )
248
+ print(f" Downloaded submission files successfully.", flush=True)
249
+
250
+ except Exception as e_download:
251
+ print(f" CRITICAL ERROR - Failed to download submission files: {e_download}", flush=True)
252
+ return 1
253
+
254
+ # 2. Load ground-truth dataset (remains the same)
255
+ print(f" Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
256
+ try:
257
+ gt_dataset = load_dataset(GT_DATASET_NAME, split="train", trust_remote_code=True)
258
+ ground_truth_models = {
259
+ item[GT_PROBLEM_NAME_COLUMN]: item[GT_MODEL_CODE_COLUMN]
260
+ for item in gt_dataset if
261
+ GT_PROBLEM_NAME_COLUMN in item and GT_MODEL_CODE_COLUMN in item and item[GT_MODEL_CODE_COLUMN]
262
+ }
263
+ if not ground_truth_models: raise ValueError("No models in GT dataset.")
264
+ print(f" Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
265
+ except Exception as e_gt:
266
+ print(f" CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
267
+ with open(summary_file_path, "w") as f:
268
+ f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{GT_DATASET_NAME}'.\nError: {e_gt}\n")
269
+ # (Attempt to upload error summary)
270
+ return 1
271
+
272
+ # Statistics
273
+ total_submitted_models = 0
274
+ models_ran_successfully = 0
275
+ consistency_checks_passed = 0
276
+ objective_checks_passed = 0
277
+ all_checks_passed = 0
278
+ gt_models_found = 0
279
+
280
+ with open(summary_file_path, "w", encoding="utf-8") as summary_f:
281
+ summary_f.write(f"Evaluation Summary for Submission: {submission_name_for_files}\n")
282
+ summary_f.write(f"User Data Repo: {user_dataset_repo_id}\n")
283
+ summary_f.write(f"Submission Path in Dataset: {submission_path_in_dataset}\n")
284
+ summary_f.write(f"Ground-Truth Dataset: {GT_DATASET_NAME}\n")
285
+ summary_f.write("-" * 30 + "\n")
286
+
287
+ # Iterate through downloaded submitted models
288
+ submitted_model_files = list((local_submission_dir / submission_path_in_dataset).rglob('*.py'))
289
+ if not submitted_model_files:
290
+ summary_f.write("No .py model files found in downloaded submission.\n")
291
+ print(" No .py model files found in downloaded submission.", flush=True)
292
+
293
+ for model_file_path in submitted_model_files:
294
+ total_submitted_models += 1
295
+ problem_name = model_file_path.stem
296
+ print(f"\n Processing downloaded model: {model_file_path.name}", flush=True)
297
+ summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
298
+
299
+ summary_f.write(" 1. Running submitted model...\n")
300
+ generated_solution = run_instance(str(model_file_path))
301
+ if generated_solution is None:
302
+ summary_f.write(" - FAILED to run or get valid JSON solution from submitted model.\n")
303
+ continue
304
+ models_ran_successfully += 1
305
+ summary_f.write(f" - SUCCESS: Got solution.\n")
306
+
307
+ summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
308
+ if problem_name not in ground_truth_models:
309
+ summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
310
+ continue
311
+ gt_models_found += 1
312
+ ground_truth_script_content = ground_truth_models[problem_name]
313
+ summary_f.write(" - SUCCESS: Found ground-truth model.\n")
314
+
315
+ summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
316
+ modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
317
+
318
+ try:
319
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
320
+ dir=top_level_temp_dir) as tmp_file:
321
+ tmp_file.write(modified_gt_script)
322
+ tmp_file_path_str = tmp_file.name
323
+
324
+ gt_check_result = subprocess.run(
325
+ [sys.executable, tmp_file_path_str],
326
+ capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT, encoding='utf-8',
327
+ )
328
+ os.unlink(tmp_file_path_str)
329
+
330
+ gt_stdout = gt_check_result.stdout
331
+ # ... (parse EVAL_OUTPUT tags for consistency and objective)
332
+ if "SUCCESS: Model is consistent" in gt_stdout:
333
+ summary_f.write(" - CONSISTENCY: PASSED\n")
334
+ consistency_checks_passed += 1
335
+ else:
336
+ summary_f.write(
337
+ " - CONSISTENCY: FAILED (Details in logs or stdout)\n")
338
+
339
+ if "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout:
340
+ summary_f.write(" - OBJECTIVE: PASSED\n")
341
+ objective_checks_passed += 1
342
+ else:
343
+ summary_f.write(" - OBJECTIVE: FAILED (Details in logs or stdout)\n")
344
+
345
+ if "SUCCESS: Model is consistent" in gt_stdout and ("SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
346
+ summary_f.write(" - SELF-CONSISTENCY CHECK: PASSED fully\n")
347
+ all_checks_passed += 1
348
+
349
+ except Exception as e_gt_run:
350
+ summary_f.write(f" - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")
351
+
352
+ # Final statistics (write to summary_f)
353
+ summary_f.write("\n" + "=" * 30 + "\n")
354
+ summary_f.write("Overall Evaluation Statistics:\n")
355
+ summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
356
+ summary_f.write(f" Models That Ran Successfully: {models_ran_successfully}/{total_submitted_models}\n")
357
+ summary_f.write(f" Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully}\n")
358
+ summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
359
+ summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
360
+ summary_f.write("=" * 30 + "\n")
361
+ summary_f.write("Final Evaluation Summary:\n")
362
+ summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
363
+ summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
364
+ summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
365
+ summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
366
+ summary_f.write("-" * 30 + "\n")
367
+
368
+ # 4. Upload the entire local_result_dir_for_upload to HF Dataset
369
+ # This directory contains summary.txt and could contain other result files.
370
+ result_path_on_hub = f"{results_base_path_in_dataset}/{submission_name_for_files}"
371
+ print(f" Uploading results from '{local_result_dir_for_upload}' to '{result_path_on_hub}' on dataset...",
372
+ flush=True)
373
+ try:
374
+ hf_api.upload_folder(
375
+ folder_path=str(local_result_dir_for_upload),
376
+ path_in_repo=result_path_on_hub,
377
+ repo_id=user_dataset_repo_id,
378
+ repo_type="dataset",
379
+ commit_message=f"Evaluation results for {submission_name_for_files}"
380
+ )
381
+ print(" Results uploaded successfully.", flush=True)
382
+ except Exception as e_upload:
383
+ print(f" CRITICAL ERROR: Failed to upload results: {e_upload}", flush=True)
384
+ # The summary.txt was written locally, but upload failed.
385
+
386
+ elapsed_time = time.time() - start_time
387
+ print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
388
+ return 0
389
+
390
+
391
+ if __name__ == "__main__":
392
+ if len(sys.argv) < 4:
393
+ print(
394
+ "Usage: python eval.py <user_dataset_repo_id> <submission_path_in_dataset> <results_base_path_in_dataset>")
395
+ print("Example: python eval.py your-username/my-storage submissions/run123 results")
396
+ sys.exit(1)
397
+
398
+ arg_user_dataset_repo_id = sys.argv[1]
399
+ arg_submission_path_in_dataset = sys.argv[2]
400
+ arg_results_base_path_in_dataset = sys.argv[3]
401
+
402
+ exit_code = main(arg_user_dataset_repo_id, arg_submission_path_in_dataset, arg_results_base_path_in_dataset)
403
+ sys.exit(exit_code)
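The self-consistency check that get_modified_script appends assumes each ground-truth program exposes a CPMpy-style model object supporting += constraints, solve(), and objective_value(). A standalone sketch of the same idea on a hypothetical toy model (not a benchmark instance); in src/eval.py the pinned values come from the submitted model's JSON output rather than from the model itself:

# Sketch: pin decision variables to a candidate solution and re-solve.
from cpmpy import Model, intvar

x = intvar(0, 10, name="x")
y = intvar(0, 10, name="y")
model = Model(x + y <= 10)
model.maximize(x + 2 * y)

assert model.solve()
old_objective = model.objective_value()

candidate = {"x": int(x.value()), "y": int(y.value())}  # stands in for the submitted solution
model += (x == candidate["x"])
model += (y == candidate["y"])

if not model.solve():
    print("ERROR: The model is unsatisfiable with the self-consistency constraints")
elif model.objective_value() != old_objective:
    print("ERROR: The objective value has changed")
else:
    print("SUCCESS: Model is consistent and the objective value is unchanged")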
src/hf_utils.py ADDED
@@ -0,0 +1,128 @@
1
+ """Utilities for interacting with the Hugging Face Hub."""
2
+
3
+ import os
4
+ import shutil
5
+ from pathlib import Path
6
+ import pandas as pd
7
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
8
+ from huggingface_hub.utils import RepositoryNotFoundError, HFValidationError
9
+
10
+ from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
11
+
12
+ # Initialize HfApi
13
+ try:
14
+ HF_API = HfApi()
15
+ print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
16
+ except Exception as e:
17
+ print(f"Failed to initialize HfApi: {e}")
18
+ HF_API = None
19
+
20
+
21
+ def load_leaderboard_data():
22
+ """Load leaderboard data from Hugging Face Dataset."""
23
+ if not HF_API:
24
+ return pd.DataFrame(columns=LDB_COLS)
25
+
26
+ leaderboard_entries = []
27
+ processed_result_dirs = set()
28
+
29
+ try:
30
+ # List all files in the results path of the dataset
31
+ repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
32
+
33
+ # Find all summary files
34
+ summary_files = [
35
+ f for f in repo_files
36
+ if f.endswith("summary.txt") and f.startswith(DS_RESULTS_PATH + "/")
37
+ ]
38
+ summary_files.sort(reverse=True)
39
+
40
+ for file_path in summary_files:
41
+ dir_name = Path(file_path).parent.name
42
+ if dir_name in processed_result_dirs:
43
+ continue
44
+
45
+ processed_result_dirs.add(dir_name)
46
+ entry = {LDB_COLS[0]: dir_name, LDB_COLS[1]: 'N/A', LDB_COLS[2]: 'N/A', LDB_COLS[3]: 'N/A', LDB_COLS[4]: 0}
47
+
48
+ # Download summary file
49
+ temp_dir = os.path.join("temp_hf_downloads", dir_name)
50
+ local_summary_path = hf_hub_download(
51
+ repo_id=DATASET_REPO_ID,
52
+ filename=file_path,
53
+ repo_type="dataset",
54
+ local_dir=temp_dir,
55
+ )
56
+
57
+ # Count files
58
+ files_in_result_dir = [
59
+ f for f in repo_files
60
+ if f.startswith(f"{DS_RESULTS_PATH}/{dir_name}/") and not f.endswith("/")
61
+ ]
62
+
63
+ # Parse score from summary
64
+ if Path(local_summary_path).exists():
65
+ with open(local_summary_path, "r", encoding="utf-8") as f:
66
+ for line in f:
67
+ if 'Execution perc' in line:
68
+ entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
69
+ if 'Consistency perc' in line:
70
+ entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
71
+ if 'Final Solution Accuracy' in line:
72
+ entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
73
+ if 'Total Submitted Models Parsed' in line:
74
+ entry[LDB_COLS[4]] = int(line.split(":")[1].strip())
75
+ os.remove(local_summary_path)
76
+
77
+ leaderboard_entries.append(entry)
78
+
79
+ except Exception as e:
80
+ print(f"Error loading leaderboard data: {e}")
81
+
82
+ finally:
83
+ # Clean up
84
+ if Path("temp_hf_downloads").exists():
85
+ shutil.rmtree("temp_hf_downloads", ignore_errors=True)
86
+
87
+ if not leaderboard_entries:
88
+ return pd.DataFrame(columns=LDB_COLS)
89
+
90
+ return pd.DataFrame(leaderboard_entries)
91
+
92
+
93
+ def upload_submission(uploaded_files, dir_name):
94
+ """Upload submission to Hugging Face Dataset."""
95
+ if not HF_API:
96
+ return False, "Hugging Face API not initialized"
97
+
98
+ try:
99
+ submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
100
+
101
+ for file in uploaded_files:
102
+ file_name = os.path.basename(file.name)
103
+ HF_API.upload_file(
104
+ path_or_fileobj=file,
105
+ path_in_repo=f"{submission_path}/{file_name}",
106
+ repo_id=DATASET_REPO_ID,
107
+ repo_type="dataset",
108
+ commit_message=f"Upload submission: {dir_name}"
109
+ )
110
+
111
+ return True, submission_path
112
+ except Exception as e:
113
+ return False, f"Upload error: {str(e)}"
114
+
115
+
116
+ def check_name_exists(submission_name):
117
+ if not HF_API:
118
+ return False
119
+
120
+ try:
121
+ repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
122
+ for file_path in repo_files:
123
+ if file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}"):
124
+ return True
125
+ except Exception as e:
126
+ print(f"Error checking name existence: {e}")
127
+
128
+ return False
src/ui.py ADDED
@@ -0,0 +1,83 @@
1
+ import gradio as gr
2
+ from pathlib import Path
3
+
4
+ from src.hf_utils import load_leaderboard_data, upload_submission, check_name_exists
5
+ from src.eval import start_background_evaluation
6
+
7
+
8
+ def handle_upload(submission_name, uploaded_files, progress=gr.Progress()):
9
+ """Handle file upload and start evaluation."""
10
+ if not uploaded_files or len(uploaded_files) == 0:
11
+ return "No directory uploaded or directory is empty; please try again."
12
+
13
+ # normalize the submission name
14
+ submission_name = submission_name.strip().replace(" ", "_").lower()
15
+ # keep only alphanumeric characters and underscores, restrict to 30 characters
16
+ submission_name = "".join(
17
+ c for c in submission_name if c.isalnum() or c == "_"
18
+ )[:30]
19
+
20
+ if not submission_name or submission_name.strip() == "":
21
+ return "Submission name is required"
22
+
23
+ if check_name_exists(submission_name):
24
+ return f"Submission name '{submission_name}' already exists. Please choose a different name."
25
+
26
+ try:
27
+ progress(0.3, "Uploading to Hugging Face...")
28
+
29
+ # Upload the directory to Hugging Face
30
+ success, result = upload_submission(uploaded_files, submission_name)
31
+ if not success:
32
+ return f"Upload failed: {result}"
33
+
34
+ progress(0.7, "Starting evaluation...")
35
+
36
+ # Start evaluation
37
+ start_background_evaluation(result)
38
+
39
+ progress(1.0, "Process complete")
40
+ return f"Upload complete. Evaluation started for: {submission_name}. Refresh the leaderboard to see results. Do not worry if the leaderboard does not update immediately; it may take some time for the results to appear."
41
+
42
+ except Exception as e:
43
+ return f"Error processing upload: {str(e)}"
44
+
45
+
46
+ def create_ui():
47
+ """Create and return Gradio UI."""
48
+ with gr.Blocks(title="CP-Bench Leaderboard") as demo:
49
+ gr.Markdown("# CP-Bench Leaderboard")
50
+
51
+ with gr.Row():
52
+ with gr.Column(scale=1):
53
+ gr.Markdown("## 📤 Upload Submission")
54
+
55
+ submission_name = gr.Textbox(
56
+ label="Submission Name (required)",
57
+ placeholder="Enter a unique name for your submission",
58
+ interactive=True,
59
+ info="This name will appear on the leaderboard"
60
+ )
61
+ upload_button = gr.UploadButton("Click to Upload Directory", file_count="directory")
62
+ status_box = gr.Textbox(label="Status", interactive=False)
63
+
64
+ with gr.Column(scale=3):
65
+ gr.Markdown("## 🏆 Results Leaderboard")
66
+ leaderboard = gr.DataFrame(value=load_leaderboard_data, label="Leaderboard", interactive=False)
67
+ refresh_button = gr.Button("🔄 Refresh Leaderboard")
68
+
69
+ # Event handlers
70
+ upload_button.upload(
71
+ fn=handle_upload,
72
+ inputs=[submission_name, upload_button],
73
+ outputs=[status_box],
74
+ show_progress="full",
75
+ )
76
+
77
+ refresh_button.click(
78
+ fn=load_leaderboard_data,
79
+ inputs=None,
80
+ outputs=[leaderboard]
81
+ )
82
+
83
+ return demo
src/utils.py ADDED
@@ -0,0 +1,7 @@
1
+ import os
2
+
3
+ from src.config import LOCAL_TEMP_SUBMISSIONS_DIR
4
+
5
+
6
+ def setup_directories():
7
+ os.makedirs(LOCAL_TEMP_SUBMISSIONS_DIR, exist_ok=True)