Commit 180f9fe
Parent(s): 3267617

add extra hf dataset for persistent storage of submissions and results

Files changed:
- app.py          +3 -241
- eval.py         +0 -356
- src/config.py   +11 -0
- src/eval.py     +403 -0
- src/hf_utils.py +128 -0
- src/ui.py       +83 -0
- src/utils.py    +7 -0
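In outline, this commit replaces the Space-local submissions/ and results/ folders with paths inside a Hugging Face dataset repo. A minimal sketch of that storage round trip, mirroring the calls the new modules make and assuming the repo id and paths from src/config.py plus an HF_TOKEN with write access (the local directory names are placeholders, not part of the commit):

    from huggingface_hub import HfApi, snapshot_download

    api = HfApi()  # authenticates via the HF_TOKEN environment variable

    # Persist an uploaded submission under submissions/<name> in the dataset repo
    api.upload_folder(
        folder_path="some_local_submission_dir",   # placeholder local path
        path_in_repo="submissions/my_run",         # DS_SUBMISSIONS_PATH + submission name
        repo_id="kostis-init/my-storage",          # DATASET_REPO_ID
        repo_type="dataset",
    )

    # The evaluator later pulls the same files back down before running them
    snapshot_download(
        repo_id="kostis-init/my-storage",
        repo_type="dataset",
        local_dir="eval_workdir",                  # placeholder working directory
        allow_patterns=["submissions/my_run/*"],
    )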
app.py CHANGED
@@ -1,246 +1,8 @@
-
-
-import os
-import shutil
-from pathlib import Path
-import subprocess  # For running eval.py
-import time
-import threading  # For background tasks
-import sys
+from src.ui import create_ui
+from src.utils import setup_directories
 
-# --- Configuration ---
-SUBMISSIONS_DIR = "submissions"
-RESULTS_DIR = "results"
-EVAL_SCRIPT_PATH = "eval.py"
-
-
-# --- Helper Functions ---
-
-def setup_directories():
-    """Creates the submissions and results directories if they don't exist."""
-    os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
-    os.makedirs(RESULTS_DIR, exist_ok=True)
-    if not os.listdir(RESULTS_DIR):  # Add a placeholder if results is empty
-        initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
-        if not initial_result_demo_path.exists():
-            os.makedirs(initial_result_demo_path, exist_ok=True)
-            with open(initial_result_demo_path / "summary.txt", "w") as f:
-                f.write("This is a placeholder initial result.\nScore: 0\n")
-            print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
-
-
-def load_leaderboard_data():
-    """
-    Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
-    Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
-    """
-    if not os.path.exists(RESULTS_DIR):
-        return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
-
-    result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
-
-    leaderboard_entries = []
-    # Sort by modification time of the directory (newest first)
-    # This requires getting mtime for each directory.
-    sorted_result_dirs = sorted(
-        result_dirs,
-        key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
-        reverse=True
-    )
-
-    for dir_name in sorted_result_dirs:
-        entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
-        result_dir_path = Path(RESULTS_DIR) / dir_name
-
-        try:
-            entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
-        except Exception:
-            pass  # Directory might have been removed during scan
-
-        summary_file = result_dir_path / "summary.txt"
-        if summary_file.exists():
-            try:
-                with open(summary_file, "r") as f:
-                    for line in f:
-                        if line.lower().startswith("score:"):
-                            entry["Score"] = line.split(":", 1)[1].strip()
-                            break
-            except Exception as e:
-                print(f"Error parsing summary for {dir_name}: {e}")
-
-        leaderboard_entries.append(entry)
-
-    if not leaderboard_entries:
-        return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
-
-    return pd.DataFrame(leaderboard_entries)
-
-
-def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
-    """
-    This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
-    Outputs from eval.py will go to the console where app.py is running.
-    """
-    print(
-        f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")
-
-    if not Path(EVAL_SCRIPT_PATH).exists():
-        print(
-            f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
-        return
-
-    command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
-
-    try:
-        # Using subprocess.run which is simpler for blocking calls within this thread
-        process = subprocess.run(
-            command,
-            capture_output=True,
-            text=True,
-            check=False,  # Handle non-zero exit codes manually
-            timeout=300  # 5-minute timeout for the evaluation script
-        )
-
-        eval_output = process.stdout.strip()
-        eval_error = process.stderr.strip()
-
-        print(
-            f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
-        if eval_error:  # Only print stderr if it's not empty
-            print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")
-
-        if process.returncode == 0:
-            print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
-        else:
-            print(
-                f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
-
-    except subprocess.TimeoutExpired:
-        print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
-    except FileNotFoundError:  # This means 'python' or EVAL_SCRIPT_PATH could not be found by subprocess
-        print(
-            f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure 'python' is in PATH and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
-    except Exception as e:
-        print(
-            f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")
-
-    print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
-
-
-def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
-    """
-    Handles directory upload, saves files, and starts eval.py in a background thread.
-    Yields a status message for the UI. The leaderboard updates separately.
-    """
-    yield "Processing upload..."  # Initial status
-
-    if not uploaded_files_list:
-        yield "No directory uploaded. Please select a directory."
-        return
-
-    try:
-        # Determine original uploaded directory name
-        first_temp_file_path = Path(uploaded_files_list[0].name)
-        original_uploaded_dir_name = first_temp_file_path.parent.name
-
-        submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name
-
-        # Handle potential name collision
-        if submission_dir_path.exists():
-            timestamp = time.strftime("%Y%m%d-%H%M%S")
-            descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
-            submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
-            status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
-            original_uploaded_dir_name = descriptive_name_for_log_and_status  # Use new name for logging
-        else:
-            descriptive_name_for_log_and_status = original_uploaded_dir_name
-            status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."
-
-        os.makedirs(submission_dir_path, exist_ok=True)
-        progress(0.1, desc=status_update_msg)
-
-        for i, temp_file_obj in enumerate(progress.tqdm(uploaded_files_list, desc="Copying files")):
-            temp_file_path = Path(temp_file_obj.name)
-            file_name_in_dir = temp_file_path.name
-            target_file_path = submission_dir_path / file_name_in_dir
-            shutil.copy(str(temp_file_path), str(target_file_path))
-
-        upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
-        progress(0.8, desc=upload_completion_msg)
-
-    except Exception as e:
-        yield f"Error during upload: {str(e)}"
-        return
-
-    # --- Start evaluation in a background thread ---
-    if not Path(EVAL_SCRIPT_PATH).exists():
-        yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
-        return
-
-    # Ensure paths passed to thread are absolute strings, good practice for threads.
-    abs_submission_path = str(submission_dir_path.resolve())
-    abs_results_path = str(Path(RESULTS_DIR).resolve())
-
-    eval_thread = threading.Thread(
-        target=run_evaluation_in_background,
-        args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
-        daemon=True  # Set as daemon so it exits when main app exits
-    )
-    eval_thread.start()
-
-    final_status_msg = (
-        f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
-        "The leaderboard will auto-refresh (or use manual refresh)."
-    )
-    progress(1.0, desc="Background evaluation initiated.")
-    yield final_status_msg
-
-
-# --- Create Directories ---
 setup_directories()
 
-# --- Gradio App Definition ---
-with gr.Blocks(title="CP-Bench Leaderboard") as demo:
-    gr.Markdown(
-        """
-        # CP-Bench Leaderboard
-
-        This is a leaderboard for the CP-Bench dataset. You can upload your submission directory for evaluation.
-        """
-    )
-
-    with gr.Row():
-        with gr.Column(scale=1):  # Upload Column
-            gr.Markdown("## 🤗 Upload Submission")
-            upload_button = gr.UploadButton(
-                "Click to Upload Directory for Evaluation",
-                file_count="directory",
-            )
-            upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)
-
-        with gr.Column(scale=3):  # Leaderboard Column
-            gr.Markdown("## 🏆 Results Leaderboard")
-            leaderboard_df_component = gr.DataFrame(
-                value=load_leaderboard_data,  # Load initial data
-                label="Leaderboard (auto-refreshes)",
-                interactive=False,
-                # every=20  # Auto-refresh leaderboard data every 20 seconds
-            )
-            refresh_leaderboard_button = gr.Button("🔄 Refresh Leaderboard Manually")
-
-    # --- Event Handlers ---
-    upload_button.upload(
-        fn=handle_upload_and_kickoff_eval,
-        inputs=[upload_button],
-        outputs=[upload_status_textbox],  # Only one output now for the status message
-        show_progress="full"
-    )
-
-    refresh_leaderboard_button.click(
-        fn=load_leaderboard_data,
-        inputs=None,
-        outputs=[leaderboard_df_component]
-    )
-
 if __name__ == "__main__":
+    demo = create_ui()
     demo.queue().launch()
eval.py DELETED
@@ -1,356 +0,0 @@
-# eval.py
-import sys
-import os
-import time
-import json
-import subprocess
-import tempfile
-from pathlib import Path
-from datasets import load_dataset  # Hugging Face datasets library
-
-# --- Configuration ---
-
-DATASET_NAME = "kostis-init/CP-Bench"
-
-# Column names in the Hugging Face dataset for problem identifier and model script
-PROBLEM_NAME_COLUMN = "id"
-MODEL_CODE_COLUMN = "model"
-
-# Timeout for running individual model scripts (both generated and modified ground-truth)
-SCRIPT_EXECUTION_TIMEOUT = 60  # seconds
-
-
-def extract_json_from_string(text_output: str):
-    """
-    Attempts to find and parse the first valid JSON object or array from a string.
-    Handles cases where JSON is preceded or followed by non-JSON text.
-    """
-    idx = 0
-    while idx < len(text_output):
-        # Find the next potential start of a JSON structure
-        start_brace = text_output.find('{', idx)
-        start_bracket = text_output.find('[', idx)
-
-        if start_brace == -1 and start_bracket == -1:
-            # No more '{' or '[' found in the rest of the string
-            return None
-
-        # Determine the actual starting character for this attempt
-        if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
-            json_start_index = start_brace
-        else:
-            json_start_index = start_bracket
-
-        potential_json_segment = text_output[json_start_index:]
-
-        try:
-            # Use raw_decode to parse the first valid JSON object from the segment
-            decoder = json.JSONDecoder()
-            json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
-            # Successfully parsed a JSON object
-            return json_obj
-        except json.JSONDecodeError:
-            # This segment (starting at json_start_index) wasn't a valid JSON.
-            # Advance the search index past the character that caused the current attempt.
-            idx = json_start_index + 1
-
-    return None  # No valid JSON found in the entire string
-
-
-def run_instance(instance_path_str: str,
-                 timeout: int = SCRIPT_EXECUTION_TIMEOUT):  # SCRIPT_EXECUTION_TIMEOUT should be defined
-    """Run the instance file and robustly capture the JSON output."""
-    command = [sys.executable, instance_path_str]
-    instance_name = Path(instance_path_str).name
-    try:
-        result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
-                                errors='replace')
-
-        # Check return code first
-        if result.returncode != 0:
-            # Log stderr for debugging if the script itself failed
-            error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
-            print(f"  ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
-            return None
-
-        # Attempt to extract JSON from stdout
-        stdout_text = result.stdout
-        if not stdout_text or not stdout_text.strip():
-            print(f"  ERROR: No stdout from {instance_name}.", flush=True)
-            return None
-
-        solution = extract_json_from_string(stdout_text)
-
-        if solution is None:
-            # Be more verbose if JSON extraction fails
-            abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300]  # Show newlines as \n for brevity
-            print(
-                f"  ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
-                flush=True)
-            return None
-
-        return solution
-
-    except subprocess.TimeoutExpired:
-        print(f"  ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
-        return None
-    except Exception as e:
-        print(f"  ERROR: Unexpected error running {instance_name}: {e}", flush=True)
-        return None
-
-
-def add_constraints_as_string(solution):
-    """Generate constraints as a string to be added to the original script."""
-    constraints = ""
-    if solution:  # Ensure solution is not None
-        for key, value in solution.items():
-            # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
-            if isinstance(value, str):
-                constraints += f"\nmodel += ({key} == \"{value}\")"
-            else:
-                constraints += f"\nmodel += ({key} == {value})"
-    return constraints
-
-
-def get_modified_script(script_content, solution):
-    """Add constraints to the script content and self-consistency checks."""
-    constraints_str = add_constraints_as_string(solution)
-    modified_script = f"{script_content}\n{constraints_str}"
-    modified_script += """
-
-# --- Self-consistency check appended by eval.py ---
-# Print the absolute path of the current directory along with the script name
-import os
-# print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}")  # Optional debug
-
-# Keep old objective
-old_objective_value = None
-objective_defined = False
-if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
-    try:
-        # This block assumes 'model' is the CPMpy model object or similar
-        # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
-        # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
-        # This part might need adjustment based on the specific modeling library used in CP-Bench.
-        # For now, we'll try to get it and catch errors.
-        # A more robust way might be to inspect model.objective_
-        if hasattr(model, '_objective_value'):  # cpmpy specific check if objective was set
-            if model._objective_value is not None:  # cpmpy does not have objective_is_min
-                objective_defined = True
-                old_objective_value = model.objective_value()
-
-    except Exception as e_obj_check:
-        # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
-        pass  # Objective might not be set or model not solved yet.
-
-# Check self-consistency
-solved_ok = False
-try:
-    if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
-        solved_ok = model.solve()
-    else:
-        print('ERROR: Model object not found or does not have a solve() method.')
-except Exception as e_solve:
-    print(f'ERROR: Exception during model.solve(): {e_solve}')
-    solved_ok = False  # Ensure it's false on exception
-
-if not solved_ok:
-    print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
-else:
-    print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')
-
-# Check if the objective value is the same
-if not objective_defined:
-    print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
-else:
-    try:
-        current_objective_value = model.objective_value()
-        # Handle potential floating point inaccuracies if objectives can be floats
-        if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
-            if abs(current_objective_value - old_objective_value) < 1e-6:  # Tolerance for float comparison
-                print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
-            else:
-                print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
-        elif current_objective_value != old_objective_value:  # Integer comparison
-            print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
-        else:
-            print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
-    except Exception as e_obj_final:
-        print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')
-
-"""
-    return modified_script
-
-
-# --- Main Evaluation Logic ---
-def main(submission_path_str: str, results_base_dir_str: str):
-    start_time = time.time()
-    print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
-    print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
-    print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)
-
-    submission_path = Path(submission_path_str)
-    submission_name = submission_path.name
-    result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
-    os.makedirs(result_dir_for_submission, exist_ok=True)
-    summary_file_path = result_dir_for_submission / "summary.txt"
-
-    # Load ground-truth dataset
-    try:
-        # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
-        gt_dataset = load_dataset(DATASET_NAME, split="train")
-        ground_truth_models = {
-            item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
-            for item in gt_dataset
-            if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
-        }
-        if not ground_truth_models:
-            raise ValueError(
-                f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
-        print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
-    except Exception as e:
-        print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
-        with open(summary_file_path, "w") as f:
-            f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
-        return 1  # Indicate failure
-
-    # Statistics
-    total_submitted_models = 0
-    models_ran_successfully = 0
-    gt_models_found = 0
-    consistency_checks_passed = 0
-    objective_checks_passed = 0  # Includes "NO_OBJECTIVE_DEFINED" as a pass
-
-    with open(summary_file_path, "w") as summary_f:
-        summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
-        summary_f.write(
-            f"Ground-Truth Dataset: {DATASET_NAME}\n")
-        summary_f.write("-" * 30 + "\n")
-
-        submitted_model_files = list(submission_path.glob('*.py'))  # Assuming Python models
-        if not submitted_model_files:
-            summary_f.write("No .py model files found in submission.\n")
-            print("eval.py: No .py model files found in submission.", flush=True)
-            return 0  # No models to evaluate, but script ran.
-
-        for model_file_path in submitted_model_files:
-            total_submitted_models += 1
-            problem_name = model_file_path.stem  # Filename without .py extension
-            print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
-            summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
-
-            # 1. Run the submitted model to get its solution
-            summary_f.write("  1. Running submitted model...\n")
-            generated_solution = run_instance(str(model_file_path))
-            if generated_solution is None:
-                summary_f.write("    - FAILED to run or get valid JSON solution from submitted model.\n")
-                continue  # Move to the next model
-            models_ran_successfully += 1
-            summary_f.write(f"    - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")
-
-            # 2. Find corresponding ground-truth model
-            summary_f.write(f"  2. Checking against ground-truth for '{problem_name}'...\n")
-            if problem_name not in ground_truth_models:
-                summary_f.write(f"    - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
-                print(f"  WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
-                continue
-            gt_models_found += 1
-            ground_truth_script_content = ground_truth_models[problem_name]
-            summary_f.write("    - SUCCESS: Found ground-truth model.\n")
-
-            # 3. Modify ground-truth script with solution and run self-consistency check
-            summary_f.write("  3. Performing self-consistency check on ground-truth model...\n")
-            modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
-
-            consistency_passed_this_model = False
-            objective_passed_this_model = False
-
-            try:
-                with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
-                    tmp_file.write(modified_gt_script)
-                    tmp_file_path_str = tmp_file.name
-
-                # Run the modified ground-truth script
-                gt_check_result = subprocess.run(
-                    [sys.executable, tmp_file_path_str],
-                    capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT
-                )
-                os.unlink(tmp_file_path_str)  # Clean up temp file
-
-                # 4. Parse output of modified ground-truth
-                gt_stdout = gt_check_result.stdout
-                gt_stderr = gt_check_result.stderr
-                # summary_f.write(f"    Modified GT STDOUT: {gt_stdout[:500]}...\n")  # For debugging
-                if gt_stderr:
-                    summary_f.write(f"    Modified GT STDERR: {gt_stderr[:500]}...\n")
-
-                if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
-                    summary_f.write("    - CONSISTENCY: PASSED\n")
-                    consistency_checks_passed += 1
-                    consistency_passed_this_model = True
-                elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
-                    summary_f.write("    - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
-                else:
-                    summary_f.write("    - CONSISTENCY: FAILED (Could not determine consistency from output)\n")
-
-                if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
-                        "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
-                    summary_f.write("    - OBJECTIVE: PASSED (Consistent or no objective)\n")
-                    objective_checks_passed += 1
-                    objective_passed_this_model = True
-                elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
-                    summary_f.write(f"    - OBJECTIVE: FAILED (Value changed)\n")
-                elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
-                    summary_f.write(f"    - OBJECTIVE: FAILED (Error accessing final objective)\n")
-                else:
-                    summary_f.write("    - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")
-
-            except subprocess.TimeoutExpired:
-                summary_f.write(
-                    f"    - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
-                print(f"  ERROR: Timeout running modified GT for {problem_name}", flush=True)
-            except Exception as e_gt_run:
-                summary_f.write(
-                    f"    - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
-                print(f"  ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
-
-        # Final statistics
-        summary_f.write("\n" + "=" * 30 + "\n")
-        summary_f.write("Overall Evaluation Statistics:\n")
-        summary_f.write(f"  Total Submitted Models Parsed: {total_submitted_models}\n")
-        summary_f.write(
-            f"  Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
-        summary_f.write(
-            f"  Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
-        summary_f.write(f"  Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
-        summary_f.write(f"  Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
-
-        # Define an overall score, e.g. number of models that passed both checks against found GT
-        fully_passed_models = 0
-        # This needs re-evaluation logic, but for now let's say a score is consistency+objective passes
-        # This simple score is just the sum of passes, could be more nuanced
-        overall_score = consistency_checks_passed + objective_checks_passed
-        summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n")  # For Gradio app to parse
-
-    elapsed_time = time.time() - start_time
-    print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
-    print(f"eval.py: Summary written to {summary_file_path}", flush=True)
-    return 0  # Success
-
-
-if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
-        print("Example: python eval.py ./submissions/my_run ./results")
-        sys.exit(1)
-
-    submission_dir = sys.argv[1]
-    results_base_dir = sys.argv[2]
-
-    # Simple check if submission_dir exists
-    if not Path(submission_dir).is_dir():
-        print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
-        sys.exit(1)
-
-    exit_code = main(submission_dir, results_base_dir)
-    sys.exit(exit_code)
src/config.py ADDED
@@ -0,0 +1,11 @@
+# File and directory paths
+EVAL_SCRIPT_PATH = "src/eval.py"
+LOCAL_TEMP_SUBMISSIONS_DIR = "../temp_submissions_app"
+
+# Hugging Face Dataset Configuration
+DATASET_REPO_ID = "kostis-init/my-storage"
+DS_SUBMISSIONS_PATH = "submissions"
+DS_RESULTS_PATH = "results"
+
+# leaderboard
+LDB_COLS = ["Submission Name", "Execution (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
src/eval.py ADDED
@@ -0,0 +1,403 @@
+# eval.py
+import sys
+import os
+import time
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+from datasets import load_dataset  # Hugging Face datasets library
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download  # For user data dataset
+from huggingface_hub.utils import RepositoryNotFoundError
+
+# --- Configuration ---
+
+GT_DATASET_NAME = "kostis-init/CP-Bench"
+
+# Column names in the Hugging Face dataset for problem identifier and model script
+GT_PROBLEM_NAME_COLUMN = "id"
+GT_MODEL_CODE_COLUMN = "model"
+
+# Timeout for running individual model scripts (both generated and modified ground-truth)
+SCRIPT_EXECUTION_TIMEOUT = 60  # seconds
+
+"""Handles evaluation of submissions."""
+
+import os
+import sys
+import subprocess
+import threading
+from pathlib import Path
+
+from src.config import EVAL_SCRIPT_PATH, DATASET_REPO_ID, DS_RESULTS_PATH
+
+
+def run_evaluation(submission_path):
+
+    if not Path(EVAL_SCRIPT_PATH).exists():
+        print(f"ERROR: Eval script '{EVAL_SCRIPT_PATH}' not found")
+        return
+
+    print(f"Starting evaluation for: {submission_path}")
+
+    command = [
+        sys.executable,
+        EVAL_SCRIPT_PATH,
+        DATASET_REPO_ID,
+        submission_path,
+        DS_RESULTS_PATH
+    ]
+
+    try:
+        process = subprocess.run(
+            command,
+            capture_output=True,
+            text=True,
+            check=False,
+            timeout=600,
+            encoding='utf-8',
+        )
+
+        if process.returncode == 0:
+            print(f"Evaluation successful for: {submission_path}")
+        else:
+            print(f"Evaluation failed for: {submission_path}")
+            print(f"STDERR: {process.stderr}")
+
+    except subprocess.TimeoutExpired:
+        print(f"Evaluation timed out for: {submission_path}")
+    except Exception as e:
+        print(f"Error running evaluation: {e}")
+
+    print(f"Evaluation process complete for: {submission_path}")
+
+
+def start_background_evaluation(submission_path):
+    """Start evaluation in a background thread."""
+    thread = threading.Thread(
+        target=lambda: run_evaluation(submission_path),
+        daemon=True
+    )
+    thread.start()
+    return True
+
+
+def extract_json_from_string(text_output: str):
+    """
+    Attempts to find and parse the first valid JSON object or array from a string.
+    Handles cases where JSON is preceded or followed by non-JSON text.
+    """
+    idx = 0
+    while idx < len(text_output):
+        # Find the next potential start of a JSON structure
+        start_brace = text_output.find('{', idx)
+        start_bracket = text_output.find('[', idx)
+
+        if start_brace == -1 and start_bracket == -1:
+            # No more '{' or '[' found in the rest of the string
+            return None
+
+        # Determine the actual starting character for this attempt
+        if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
+            json_start_index = start_brace
+        else:
+            json_start_index = start_bracket
+
+        potential_json_segment = text_output[json_start_index:]
+
+        try:
+            # Use raw_decode to parse the first valid JSON object from the segment
+            decoder = json.JSONDecoder()
+            json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
+            # Successfully parsed a JSON object
+            return json_obj
+        except json.JSONDecodeError:
+            # This segment (starting at json_start_index) wasn't a valid JSON.
+            # Advance the search index past the character that caused the current attempt.
+            idx = json_start_index + 1
+
+    return None  # No valid JSON found in the entire string
+
+
+def run_instance(instance_path_str: str,
+                 timeout: int = SCRIPT_EXECUTION_TIMEOUT):  # SCRIPT_EXECUTION_TIMEOUT should be defined
+    """Run the instance file and robustly capture the JSON output."""
+    command = [sys.executable, instance_path_str]
+    instance_name = Path(instance_path_str).name
+    try:
+        result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
+                                errors='replace')
+
+        # Check return code first
+        if result.returncode != 0:
+            # Log stderr for debugging if the script itself failed
+            error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
+            print(f"  ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
+            return None
+
+        # Attempt to extract JSON from stdout
+        stdout_text = result.stdout
+        if not stdout_text or not stdout_text.strip():
+            print(f"  ERROR: No stdout from {instance_name}.", flush=True)
+            return None
+
+        solution = extract_json_from_string(stdout_text)
+
+        if solution is None:
+            # Be more verbose if JSON extraction fails
+            abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300]  # Show newlines as \n for brevity
+            print(
+                f"  ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
+                flush=True)
+            return None
+
+        return solution
+
+    except subprocess.TimeoutExpired:
+        print(f"  ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
+        return None
+    except Exception as e:
+        print(f"  ERROR: Unexpected error running {instance_name}: {e}", flush=True)
+        return None
+
+
+def add_constraints_as_string(solution):
+    """Generate constraints as a string to be added to the original script."""
+    constraints = ""
+    if solution:  # Ensure solution is not None
+        for key, value in solution.items():
+            # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
+            if isinstance(value, str):
+                constraints += f"\nmodel += ({key} == \"{value}\")"
+            else:
+                constraints += f"\nmodel += ({key} == {value})"
+    return constraints
+
+
+def get_modified_script(script_content, solution):
+    """Add constraints to the script content and self-consistency checks."""
+    constraints_str = add_constraints_as_string(solution)
+    modified_script = f"{script_content}\n{constraints_str}"
+    modified_script += """
+# Print the absolute path of the current directory along with the script name
+import os
+print(os.path.abspath(__file__))
+
+# Keep old objective
+old_objective = None
+if hasattr(model, 'objective_is_min') and model.objective_is_min is not None:
+    old_objective = model.objective_value()
+
+# Check self-consistency
+if not model.solve():
+    print('ERROR: The model is unsatisfiable with the self-consistency constraints')
+else:
+    print('SUCCESS: Model is consistent')
+
+# Check if the objective value is the same
+if old_objective is None:
+    print('SUCCESS: No objective defined')
+elif model.objective_value() != old_objective:
+    print('ERROR: The objective value has changed')
+else:
+    print('SUCCESS: Objective value is consistent')
+"""
+    return modified_script
+
+
+# --- Main Evaluation Logic ---
+def main(
+        user_dataset_repo_id: str,
+        submission_path_in_dataset: str,  # e.g., "submissions/uploaded_dir_name"
+        results_base_path_in_dataset: str  # e.g., "results"
+):
+    start_time = time.time()
+    # Infer submission name for logging and result path generation
+    submission_name_for_files = Path(submission_path_in_dataset).name
+
+    print(f"eval.py: Starting evaluation for submission: '{submission_name_for_files}'", flush=True)
+    print(f"  User Data Repo: {user_dataset_repo_id}", flush=True)
+    print(f"  Submission to download from: {submission_path_in_dataset}", flush=True)
+    print(f"  Results to upload to: {results_base_path_in_dataset}/{submission_name_for_files}", flush=True)
+
+    hf_api = HfApi()  # Will use HF_TOKEN from environment
+
+    # Create a top-level temporary directory for all operations for this eval run
+    with tempfile.TemporaryDirectory(prefix="eval_run_") as top_level_temp_dir_str:
+        top_level_temp_dir = Path(top_level_temp_dir_str)
+        local_submission_dir = top_level_temp_dir / "submissions"
+        local_result_dir_for_upload = top_level_temp_dir / "results"
+
+        os.makedirs(local_submission_dir, exist_ok=True)
+        os.makedirs(local_result_dir_for_upload, exist_ok=True)
+
+        # Path for the summary file within the local temporary result directory
+        summary_file_path = local_result_dir_for_upload / "summary.txt"
+
+        # 1. Download submitted files from HF Dataset
+        print(f"  Downloading submission files from '{submission_path_in_dataset}' to '{local_submission_dir}'...",
+              flush=True)
+        try:
+            # Download the relevant submission files
+            snapshot_download(
+                repo_id=user_dataset_repo_id,
+                repo_type="dataset",
+                local_dir=local_submission_dir,
+                allow_patterns=[f"{submission_path_in_dataset}/*"],
+            )
+            print(f"  Downloaded submission files successfully.", flush=True)
+
+        except Exception as e_download:
+            print(f"  CRITICAL ERROR - Failed to download submission files: {e_download}", flush=True)
+            return 1
+
+        # 2. Load ground-truth dataset (remains the same)
+        print(f"  Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
+        try:
+            gt_dataset = load_dataset(GT_DATASET_NAME, split="train", trust_remote_code=True)
+            ground_truth_models = {
+                item[GT_PROBLEM_NAME_COLUMN]: item[GT_MODEL_CODE_COLUMN]
+                for item in gt_dataset if
+                GT_PROBLEM_NAME_COLUMN in item and GT_MODEL_CODE_COLUMN in item and item[GT_MODEL_CODE_COLUMN]
+            }
+            if not ground_truth_models: raise ValueError("No models in GT dataset.")
+            print(f"  Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
+        except Exception as e_gt:
+            print(f"  CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
+            with open(summary_file_path, "w") as f:
+                f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{GT_DATASET_NAME}'.\nError: {e_gt}\n")
+            # (Attempt to upload error summary)
+            return 1
+
+        # Statistics
+        total_submitted_models = 0
+        models_ran_successfully = 0
+        consistency_checks_passed = 0
+        objective_checks_passed = 0
+        all_checks_passed = 0
+        gt_models_found = 0
+
+        with open(summary_file_path, "w", encoding="utf-8") as summary_f:
+            summary_f.write(f"Evaluation Summary for Submission: {submission_name_for_files}\n")
+            summary_f.write(f"User Data Repo: {user_dataset_repo_id}\n")
+            summary_f.write(f"Submission Path in Dataset: {submission_path_in_dataset}\n")
+            summary_f.write(f"Ground-Truth Dataset: {GT_DATASET_NAME}\n")
+            summary_f.write("-" * 30 + "\n")
+
+            # Iterate through downloaded submitted models
+            submitted_model_files = list((local_submission_dir / submission_path_in_dataset).rglob('*.py'))
+            if not submitted_model_files:
+                summary_f.write("No .py model files found in downloaded submission.\n")
+                print("  No .py model files found in downloaded submission.", flush=True)
+
+            for model_file_path in submitted_model_files:
+                total_submitted_models += 1
+                problem_name = model_file_path.stem
+                print(f"\n  Processing downloaded model: {model_file_path.name}", flush=True)
+                summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
+
+                summary_f.write("  1. Running submitted model...\n")
+                generated_solution = run_instance(str(model_file_path))
+                if generated_solution is None:
+                    summary_f.write("    - FAILED to run or get valid JSON solution from submitted model.\n")
+                    continue
+                models_ran_successfully += 1
+                summary_f.write(f"    - SUCCESS: Got solution.\n")
+
+                summary_f.write(f"  2. Checking against ground-truth for '{problem_name}'...\n")
+                if problem_name not in ground_truth_models:
+                    summary_f.write(f"    - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
+                    continue
+                gt_models_found += 1
+                ground_truth_script_content = ground_truth_models[problem_name]
+                summary_f.write("    - SUCCESS: Found ground-truth model.\n")
+
+                summary_f.write("  3. Performing self-consistency check on ground-truth model...\n")
+                modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
+
+                try:
+                    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
+                                                     dir=top_level_temp_dir) as tmp_file:
+                        tmp_file.write(modified_gt_script)
+                        tmp_file_path_str = tmp_file.name
+
+                    gt_check_result = subprocess.run(
+                        [sys.executable, tmp_file_path_str],
+                        capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT, encoding='utf-8',
+                    )
+                    os.unlink(tmp_file_path_str)
+
+                    gt_stdout = gt_check_result.stdout
+                    # ... (parse EVAL_OUTPUT tags for consistency and objective)
+                    if "SUCCESS: Model is consistent" in gt_stdout:
+                        summary_f.write("    - CONSISTENCY: PASSED\n")
+                        consistency_checks_passed += 1
+                    else:
+                        summary_f.write(
+                            "    - CONSISTENCY: FAILED (Details in logs or stdout)\n")
+
+                    if "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout:
+                        summary_f.write("    - OBJECTIVE: PASSED\n")
+                        objective_checks_passed += 1
+                    else:
+                        summary_f.write("    - OBJECTIVE: FAILED (Details in logs or stdout)\n")
+
+                    if "SUCCESS: Model is consistent" in gt_stdout and ("SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
+                        summary_f.write("    - SELF-CONSISTENCY CHECK: PASSED fully\n")
+                        all_checks_passed += 1
+
+                except Exception as e_gt_run:
+                    summary_f.write(f"    - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")
+
+            # Final statistics (write to summary_f)
+            summary_f.write("\n" + "=" * 30 + "\n")
+            summary_f.write("Overall Evaluation Statistics:\n")
+            summary_f.write(f"  Total Submitted Models Parsed: {total_submitted_models}\n")
+            summary_f.write(f"  Models That Ran Successfully: {models_ran_successfully}/{total_submitted_models}\n")
+            summary_f.write(f"  Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully}\n")
+            summary_f.write(f"  Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
+            summary_f.write(f"  Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
+            summary_f.write("=" * 30 + "\n")
+            summary_f.write("Final Evaluation Summary:\n")
+            summary_f.write(f"  Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
+            summary_f.write(f"  Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
+            summary_f.write(f"  Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
+            summary_f.write(f"  Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
+            summary_f.write("-" * 30 + "\n")
+
+        # 4. Upload the entire local_result_dir_for_upload to HF Dataset
+        # This directory contains summary.txt and could contain other result files.
+        result_path_on_hub = f"{results_base_path_in_dataset}/{submission_name_for_files}"
+        print(f"  Uploading results from '{local_result_dir_for_upload}' to '{result_path_on_hub}' on dataset...",
+              flush=True)
+        try:
+            hf_api.upload_folder(
+                folder_path=str(local_result_dir_for_upload),
+                path_in_repo=result_path_on_hub,
+                repo_id=user_dataset_repo_id,
+                repo_type="dataset",
+                commit_message=f"Evaluation results for {submission_name_for_files}"
+            )
+            print("  Results uploaded successfully.", flush=True)
+        except Exception as e_upload:
+            print(f"  CRITICAL ERROR: Failed to upload results: {e_upload}", flush=True)
+            # The summary.txt was written locally, but upload failed.
+
+    elapsed_time = time.time() - start_time
+    print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print(
+            "Usage: python eval.py <user_dataset_repo_id> <submission_path_in_dataset> <results_base_path_in_dataset>")
+        print("Example: python eval.py your-username/my-storage submissions/run123 results")
+        sys.exit(1)
+
+    arg_user_dataset_repo_id = sys.argv[1]
+    arg_submission_path_in_dataset = sys.argv[2]
+    arg_results_base_path_in_dataset = sys.argv[3]
+
+    exit_code = main(arg_user_dataset_repo_id, arg_submission_path_in_dataset, arg_results_base_path_in_dataset)
+    sys.exit(exit_code)
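The self-consistency code that get_modified_script() appends assumes CPMpy-style models (the comments in the removed eval.py say as much). Purely as an illustration of what that injected check does, here is the same idea written directly against a toy CPMpy model; the variables, objective, and submitted solution below are invented for the example:

    # Sketch only: pin a submitted solution onto a ground-truth model and re-check it.
    from cpmpy import Model, intvar

    x = intvar(0, 10, name="x")
    y = intvar(0, 10, name="y")
    model = Model(x + y <= 10)
    model.maximize(x + 2 * y)

    model.solve()
    old_objective = model.objective_value()        # objective of the untouched ground-truth model

    submitted_solution = {"x": 0, "y": 10}          # hypothetical JSON printed by a submission
    variables = {"x": x, "y": y}
    for name, value in submitted_solution.items():  # add_constraints_as_string() does this textually
        model += (variables[name] == value)

    if not model.solve():                           # consistency: the pinned solution must stay feasible
        print("ERROR: The model is unsatisfiable with the self-consistency constraints")
    elif model.objective_value() != old_objective:  # objective: the optimum must be unchanged
        print("ERROR: The objective value has changed")
    else:
        print("SUCCESS: solution is consistent and objective is preserved")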
src/hf_utils.py ADDED
@@ -0,0 +1,128 @@
+"""Utilities for interacting with the Hugging Face Hub."""
+
+import os
+import shutil
+from pathlib import Path
+import pandas as pd
+from huggingface_hub import HfApi, hf_hub_download, list_repo_files
+from huggingface_hub.utils import RepositoryNotFoundError, HFValidationError
+
+from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
+
+# Initialize HfApi
+try:
+    HF_API = HfApi()
+    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
+except Exception as e:
+    print(f"Failed to initialize HfApi: {e}")
+    HF_API = None
+
+
+def load_leaderboard_data():
+    """Load leaderboard data from Hugging Face Dataset."""
+    if not HF_API:
+        return pd.DataFrame(columns=LDB_COLS)
+
+    leaderboard_entries = []
+    processed_result_dirs = set()
+
+    try:
+        # List all files in the results path of the dataset
+        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
+
+        # Find all summary files
+        summary_files = [
+            f for f in repo_files
+            if f.endswith("summary.txt") and f.startswith(DS_RESULTS_PATH + "/")
+        ]
+        summary_files.sort(reverse=True)
+
+        for file_path in summary_files:
+            dir_name = Path(file_path).parent.name
+            if dir_name in processed_result_dirs:
+                continue
+
+            processed_result_dirs.add(dir_name)
+            entry = {LDB_COLS[0]: dir_name, LDB_COLS[1]: 'N/A', LDB_COLS[2]: 'N/A', LDB_COLS[3]: 'N/A', LDB_COLS[4]: 0}
+
+            # Download summary file
+            temp_dir = os.path.join("temp_hf_downloads", dir_name)
+            local_summary_path = hf_hub_download(
+                repo_id=DATASET_REPO_ID,
+                filename=file_path,
+                repo_type="dataset",
+                local_dir=temp_dir,
+            )
+
+            # Count files
+            files_in_result_dir = [
+                f for f in repo_files
+                if f.startswith(f"{DS_RESULTS_PATH}/{dir_name}/") and not f.endswith("/")
+            ]
+
+            # Parse score from summary
+            if Path(local_summary_path).exists():
+                with open(local_summary_path, "r", encoding="utf-8") as f:
+                    for line in f:
+                        if 'Execution perc' in line:
+                            entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
+                        if 'Consistency perc' in line:
+                            entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
+                        if 'Final Solution Accuracy' in line:
+                            entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
+                        if 'Total Submitted Models Parsed' in line:
+                            entry[LDB_COLS[4]] = int(line.split(":")[1].strip())
+                os.remove(local_summary_path)
+
+            leaderboard_entries.append(entry)
+
+    except Exception as e:
+        print(f"Error loading leaderboard data: {e}")
+
+    finally:
+        # Clean up
+        if Path("temp_hf_downloads").exists():
+            shutil.rmtree("temp_hf_downloads", ignore_errors=True)
+
+    if not leaderboard_entries:
+        return pd.DataFrame(columns=LDB_COLS)
+
+    return pd.DataFrame(leaderboard_entries)
+
+
+def upload_submission(uploaded_files, dir_name):
+    """Upload submission to Hugging Face Dataset."""
+    if not HF_API:
+        return False, "Hugging Face API not initialized"
+
+    try:
+        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
+
+        for file in uploaded_files:
+            file_name = os.path.basename(file.name)
+            HF_API.upload_file(
+                path_or_fileobj=file,
+                path_in_repo=f"{submission_path}/{file_name}",
+                repo_id=DATASET_REPO_ID,
+                repo_type="dataset",
+                commit_message=f"Upload submission: {dir_name}"
+            )
+
+        return True, submission_path
+    except Exception as e:
+        return False, f"Upload error: {str(e)}"
+
+
+def check_name_exists(submission_name):
+    if not HF_API:
+        return False
+
+    try:
+        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
+        for file_path in repo_files:
+            if file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}"):
+                return True
+    except Exception as e:
+        print(f"Error checking name existence: {e}")
+
+    return False
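Worth noting how tightly the two new modules are coupled: load_leaderboard_data() recognises a result only by the exact phrases that src/eval.py writes into summary.txt. A small self-contained sketch of that hand-off, with invented numbers and only a subset of the parsed fields:

    # The lines src/eval.py writes (numbers made up) and how src/hf_utils.py reads them back.
    summary_text = """Final Evaluation Summary:
      Execution perc: 80.00%
      Final Solution Accuracy perc: 55.00%
      Total Submitted Models Parsed: 20
    """

    entry = {}
    for line in summary_text.splitlines():
        if 'Execution perc' in line:
            entry["Execution (%)"] = float(line.split(":")[1].strip().replace("%", ""))
        if 'Final Solution Accuracy' in line:
            entry["Final Solution Accuracy (%)"] = float(line.split(":")[1].strip().replace("%", ""))
        if 'Total Submitted Models Parsed' in line:
            entry["# of Models submitted"] = int(line.split(":")[1].strip())

    print(entry)  # {'Execution (%)': 80.0, 'Final Solution Accuracy (%)': 55.0, '# of Models submitted': 20}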
src/ui.py ADDED
@@ -0,0 +1,83 @@
+import gradio as gr
+from pathlib import Path
+
+from src.hf_utils import load_leaderboard_data, upload_submission, check_name_exists
+from src.eval import start_background_evaluation
+
+
+def handle_upload(submission_name, uploaded_files, progress=gr.Progress()):
+    """Handle file upload and start evaluation."""
+    if not uploaded_files or len(uploaded_files) == 0:
+        return "No directory uploaded or directory is empty, please try again."
+
+    # normalize the submission name
+    submission_name = submission_name.strip().replace(" ", "_").lower()
+    # keep only alphanumeric characters and underscores, restrict to 30 characters
+    submission_name = "".join(
+        c for c in submission_name if c.isalnum() or c == "_"
+    )[:30]
+
+    if not submission_name or submission_name.strip() == "":
+        return "Submission name is required"
+
+    if check_name_exists(submission_name):
+        return f"Submission name '{submission_name}' already exists. Please choose a different name."
+
+    try:
+        progress(0.3, "Uploading to Hugging Face...")
+
+        # Upload the directory to Hugging Face
+        success, result = upload_submission(uploaded_files, submission_name)
+        if not success:
+            return f"Upload failed: {result}"
+
+        progress(0.7, "Starting evaluation...")
+
+        # Start evaluation
+        start_background_evaluation(result)
+
+        progress(1.0, "Process complete")
+        return f"Upload complete. Evaluation started for: {submission_name}. Refresh the leaderboard to see results. Do not worry if the leaderboard does not update immediately; it may take some time for the results to appear."
+
+    except Exception as e:
+        return f"Error processing upload: {str(e)}"
+
+
+def create_ui():
+    """Create and return Gradio UI."""
+    with gr.Blocks(title="CP-Bench Leaderboard") as demo:
+        gr.Markdown("# CP-Bench Leaderboard")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("## 🤗 Upload Submission")
+
+                submission_name = gr.Textbox(
+                    label="Submission Name (required)",
+                    placeholder="Enter a unique name for your submission",
+                    interactive=True,
+                    info="This name will appear on the leaderboard"
+                )
+                upload_button = gr.UploadButton("Click to Upload Directory", file_count="directory")
+                status_box = gr.Textbox(label="Status", interactive=False)
+
+            with gr.Column(scale=3):
+                gr.Markdown("## 🏆 Results Leaderboard")
+                leaderboard = gr.DataFrame(value=load_leaderboard_data, label="Leaderboard", interactive=False)
+                refresh_button = gr.Button("🔄 Refresh Leaderboard")
+
+        # Event handlers
+        upload_button.upload(
+            fn=handle_upload,
+            inputs=[submission_name, upload_button],
+            outputs=[status_box],
+            show_progress="full",
+        )
+
+        refresh_button.click(
+            fn=load_leaderboard_data,
+            inputs=None,
+            outputs=[leaderboard]
+        )
+
+    return demo
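For reference, the name normalisation in handle_upload() reduces an arbitrary display name to the directory-safe form used as the path component on the Hub; a quick worked example (the sample name is invented):

    name = "My Run #1 (LLM baseline)"
    name = name.strip().replace(" ", "_").lower()
    name = "".join(c for c in name if c.isalnum() or c == "_")[:30]
    print(name)  # my_run_1_llm_baseline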
src/utils.py ADDED
@@ -0,0 +1,7 @@
+import os
+
+from src.config import LOCAL_TEMP_SUBMISSIONS_DIR
+
+
+def setup_directories():
+    os.makedirs(LOCAL_TEMP_SUBMISSIONS_DIR, exist_ok=True)