Commit b5712a3
Parent(s): 083c72d

replace with simpler app

Files changed:
- app.py           +230 -221
- backup_app_.py   +320 -0
- eval.py          +356 -0
- requirements.txt +2 -1
app.py CHANGED
@@ -1,236 +1,245 @@
--- a/app.py (old)
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- ...
- )
- ...
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     ...  (same body as init_leaderboard() in backup_app_.py below)
  )

- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
- ...
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
- ...
-                 elem_classes="markdown-text",
-             )
              upload_button = gr.UploadButton(
-                 ...
-                 size="lg",
                  file_count="directory",
-                 elem_id="upload-button",
-                 file_types=["text"],
              )
-             ...
-                     shutil.move(directory_path, "submissions/")
-                     return f"Directory {directory_path} uploaded successfully!"
-                 else:
-                     return "No directory uploaded."
-
-             upload_button.upload(fn=upload_directory, inputs=upload_button, outputs=None)
-
-         with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
-             ...  (evaluation-queue accordions and model submission form, same as the corresponding block in backup_app_.py below)
      )

- ...
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )

- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
+++ b/app.py (new)
  import gradio as gr
  import pandas as pd
+ import os
+ import shutil
+ from pathlib import Path
+ import subprocess  # For running eval.py
+ import time
+ import threading  # For background tasks
+ import sys
+
+ # --- Configuration ---
+ SUBMISSIONS_DIR = "submissions"
+ RESULTS_DIR = "results"
+ EVAL_SCRIPT_PATH = "eval.py"
+
+
+ # --- Helper Functions ---
+
+ def setup_directories():
+     """Creates the submissions and results directories if they don't exist."""
+     os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
+     os.makedirs(RESULTS_DIR, exist_ok=True)
+     if not os.listdir(RESULTS_DIR):  # Add a placeholder if results is empty
+         initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
+         if not initial_result_demo_path.exists():
+             os.makedirs(initial_result_demo_path, exist_ok=True)
+             with open(initial_result_demo_path / "summary.txt", "w") as f:
+                 f.write("This is a placeholder initial result.\nScore: 0\n")
+             print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
+
+
+ def load_leaderboard_data():
+     """
+     Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
+     Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
+     """
+     if not os.path.exists(RESULTS_DIR):
+         return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
+
+     result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
+
+     leaderboard_entries = []
+     # Sort by modification time of the directory (newest first)
+     # This requires getting mtime for each directory.
+     sorted_result_dirs = sorted(
+         result_dirs,
+         key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
+         reverse=True
  )

+     for dir_name in sorted_result_dirs:
+         entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
+         result_dir_path = Path(RESULTS_DIR) / dir_name
+
+         try:
+             entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
+         except Exception:
+             pass  # Directory might have been removed during scan
+
+         summary_file = result_dir_path / "summary.txt"
+         if summary_file.exists():
+             try:
+                 with open(summary_file, "r") as f:
+                     for line in f:
+                         if line.lower().startswith("score:"):
+                             entry["Score"] = line.split(":", 1)[1].strip()
+                             break
+             except Exception as e:
+                 print(f"Error parsing summary for {dir_name}: {e}")
+
+         leaderboard_entries.append(entry)
+
+     if not leaderboard_entries:
+         return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
+
+     return pd.DataFrame(leaderboard_entries)
+
+
+ def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
+     """
+     This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
+     Outputs from eval.py will go to the console where app.py is running.
+     """
+     print(
+         f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")
+
+     if not Path(EVAL_SCRIPT_PATH).exists():
+         print(
+             f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
+         return
+
+     command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
+
+     try:
+         # Using subprocess.run which is simpler for blocking calls within this thread
+         process = subprocess.run(
+             command,
+             capture_output=True,
+             text=True,
+             check=False,  # Handle non-zero exit codes manually
+             timeout=300  # 5-minute timeout for the evaluation script
+         )
+
+         eval_output = process.stdout.strip()
+         eval_error = process.stderr.strip()
+
+         print(
+             f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
+         if eval_error:  # Only print stderr if it's not empty
+             print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")
+
+         if process.returncode == 0:
+             print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
+         else:
+             print(
+                 f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
+
+     except subprocess.TimeoutExpired:
+         print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
+     except FileNotFoundError:  # This means 'python' or EVAL_SCRIPT_PATH could not be found by subprocess
+         print(
+             f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure 'python' is in PATH and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
+     except Exception as e:
+         print(
+             f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")
+
+     print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
+
+
+ def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
+     """
+     Handles directory upload, saves files, and starts eval.py in a background thread.
+     Yields a status message for the UI. The leaderboard updates separately.
+     """
+     yield "Processing upload..."  # Initial status
+
+     if not uploaded_files_list:
+         yield "No directory uploaded. Please select a directory."
+         return
+
+     try:
+         # Determine original uploaded directory name
+         first_temp_file_path = Path(uploaded_files_list[0].name)
+         original_uploaded_dir_name = first_temp_file_path.parent.name
+
+         submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name
+
+         # Handle potential name collision
+         if submission_dir_path.exists():
+             timestamp = time.strftime("%Y%m%d-%H%M%S")
+             descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
+             submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
+             status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
+             original_uploaded_dir_name = descriptive_name_for_log_and_status  # Use new name for logging
+         else:
+             descriptive_name_for_log_and_status = original_uploaded_dir_name
+             status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."
+
+         os.makedirs(submission_dir_path, exist_ok=True)
+         progress(0.1, desc=status_update_msg)
+
+         for i, temp_file_obj in enumerate(progress.tqdm(uploaded_files_list, desc="Copying files")):
+             temp_file_path = Path(temp_file_obj.name)
+             file_name_in_dir = temp_file_path.name
+             target_file_path = submission_dir_path / file_name_in_dir
+             shutil.copy(str(temp_file_path), str(target_file_path))
+
+         upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
+         progress(0.8, desc=upload_completion_msg)
+
+     except Exception as e:
+         yield f"Error during upload: {str(e)}"
+         return
+
+     # --- Start evaluation in a background thread ---
+     if not Path(EVAL_SCRIPT_PATH).exists():
+         yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
+         return
+
+     # Ensure paths passed to thread are absolute strings, good practice for threads.
+     abs_submission_path = str(submission_dir_path.resolve())
+     abs_results_path = str(Path(RESULTS_DIR).resolve())
+
+     eval_thread = threading.Thread(
+         target=run_evaluation_in_background,
+         args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
+         daemon=True  # Set as daemon so it exits when main app exits
+     )
+     eval_thread.start()

+     final_status_msg = (
+         f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
+         "The leaderboard will auto-refresh (or use manual refresh)."
+     )
+     progress(1.0, desc="Background evaluation initiated.")
+     yield final_status_msg


+ # --- Create Directories ---
+ setup_directories()

+ # --- Gradio App Definition ---
+ with gr.Blocks(title="Background Submission, Evaluation, and Leaderboard") as demo:
+     gr.Markdown("# Background Submission, Evaluation & Results")
+     gr.Markdown(
+         f"Upload submissions (directories) to **'{SUBMISSIONS_DIR}'**. "
+         f"The evaluation script (`{EVAL_SCRIPT_PATH}`) will process them in the background. "
+         f"Results appear in **'{RESULTS_DIR}'**. The leaderboard auto-refreshes."
+     )

+     with gr.Row():
+         with gr.Column(scale=1):  # Upload Column
+             gr.Markdown("## Upload & Evaluate Submission")
              upload_button = gr.UploadButton(
+                 "Click to Upload Directory for Evaluation",
                  file_count="directory",
              )
+             upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)
+
+         with gr.Column(scale=2):  # Leaderboard Column
+             gr.Markdown("## Results Leaderboard")
+             leaderboard_df_component = gr.DataFrame(
+                 value=load_leaderboard_data,  # Load initial data
+                 label="Leaderboard (auto-refreshes)",
+                 interactive=False,
+                 # every=20  # Auto-refresh leaderboard data every 20 seconds
              )
+             refresh_leaderboard_button = gr.Button("Refresh Leaderboard Manually")
+
+     # --- Event Handlers ---
+     upload_button.upload(
+         fn=handle_upload_and_kickoff_eval,
+         inputs=[upload_button],
+         outputs=[upload_status_textbox],  # Only one output now for the status message
+         show_progress="full"
+     )

+     refresh_leaderboard_button.click(
+         fn=load_leaderboard_data,
+         inputs=None,
+         outputs=[leaderboard_df_component]
+     )

+ if __name__ == "__main__":
+     demo.queue().launch()
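For orientation, here is a minimal sketch (not part of the commit) of how the new pieces hand off to one another. The directory name "my_run" and the file names are hypothetical; the paths follow the SUBMISSIONS_DIR/RESULTS_DIR defaults above, and the "Score:" line is the one load_leaderboard_data() parses.

# Hypothetical end-to-end walk-through; "my_run" is an invented submission name.
import subprocess, sys
from pathlib import Path

# 1. handle_upload_and_kickoff_eval() copies the uploaded directory to:
#       submissions/my_run/<one CP model per .py file>
# 2. run_evaluation_in_background() then effectively runs:
subprocess.run([sys.executable, "eval.py", "submissions/my_run", "results"], check=True)
# 3. eval.py writes results/my_run_result/summary.txt, whose "Score:" line is what
#    load_leaderboard_data() shows on the leaderboard:
summary = Path("results") / "my_run_result" / "summary.txt"
for line in summary.read_text().splitlines():
    if line.lower().startswith("score:"):
        print("leaderboard score:", line.split(":", 1)[1].strip())
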
backup_app_.py ADDED
@@ -0,0 +1,320 @@
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import shutil  # For file operations
from pathlib import Path  # For path manipulations


from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()


LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )




# --- Function to handle the uploaded directory ---
def save_uploaded_models(files):
    if files:
        saved_paths = []
        # With file_count="directory", Gradio passes a list of temporary file objects
        # whose .name attributes are full paths such as
        # /tmp/gradio/<hash>/<uploaded_dir_name>/model1.txt.
        # The uploaded directory's name is therefore the parent of each temp file,
        # and we recreate that directory under UPLOAD_DIR before copying the files in.

        if not isinstance(files, list):
            files = [files]  # Ensure it's a list

        if files:
            # Infer the uploaded directory name from the first temp file path:
            # /tmp/.../uploaded_dir_name/file.txt -> "uploaded_dir_name"
            temp_file_path = Path(files[0].name if hasattr(files[0], 'name') else files[0])
            uploaded_dir_name = temp_file_path.parent.name

            destination_folder_path = Path(UPLOAD_DIR) / uploaded_dir_name  # NOTE: UPLOAD_DIR is never defined in this file
            os.makedirs(destination_folder_path, exist_ok=True)  # NOTE: nor is os imported above

            for uploaded_file_obj in files:
                # Path to the temporary file
                temp_path_str = uploaded_file_obj.name
                temp_path = Path(temp_path_str)

                # Original filename relative to the uploaded directory
                original_filename = temp_path.name  # e.g., "model1.txt"

                destination_file_path = destination_folder_path / original_filename

                try:
                    shutil.copy(temp_path_str, destination_file_path)
                    saved_paths.append(str(destination_file_path))
                except Exception as e:
                    print(f"Error copying {temp_path_str} to {destination_file_path}: {e}")
                    return f"Error saving files: {e}"

        if saved_paths:
            return f"Successfully uploaded and saved models to: {destination_folder_path}"
        else:
            return "No files were saved."
    return "No files uploaded."




# demo = gr.Blocks(css=custom_css)
demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Simple Submit here!", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(
                "## Submit your generated models here!",
                elem_classes="markdown-text",
            )
            upload_button = gr.UploadButton(
                label="Upload your generated models (only directories accepted)",
                size="lg",
                file_count="directory",
                elem_id="upload-button",
            )
            # Add an output component to display the result of the upload
            upload_status = gr.Textbox(label="Upload Status", interactive=False)

            # Connect the upload_button to the save_uploaded_models function
            upload_button.upload(save_uploaded_models, upload_button, upload_status)



        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name")
                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                        model_type = gr.Dropdown(
                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                            label="Model type",
                            multiselect=False,
                            value=None,
                            interactive=True,
                        )

                    with gr.Column():
                        precision = gr.Dropdown(
                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )
                        weight_type = gr.Dropdown(
                            choices=[i.value.name for i in WeightType],
                            label="Weights type",
                            multiselect=False,
                            value="Original",
                            interactive=True,
                        )
                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
                submit_button.click(
                    add_new_eval,
                    [
                        model_name_textbox,
                        base_model_name_textbox,
                        revision_name_textbox,
                        precision,
                        weight_type,
                        model_type,
                    ],
                    submission_result,
                )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
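As noted in the comments above, backup_app_.py calls os.makedirs without importing os and builds paths from an undefined UPLOAD_DIR, so it does not run on its own. A minimal patch is sketched below; neither line appears in the commit, and the real intended value of UPLOAD_DIR is unknown (the "submissions" folder is only an assumption based on the rest of the commit).

# Hypothetical additions to the top of backup_app_.py (not in the commit):
import os                   # needed for os.makedirs() in save_uploaded_models()
UPLOAD_DIR = "submissions"  # assumption: uploads were meant to land in the submissions folder
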
eval.py ADDED
@@ -0,0 +1,356 @@
# eval.py
import sys
import os
import time
import json
import subprocess
import tempfile
from pathlib import Path
from datasets import load_dataset  # Hugging Face datasets library

# --- Configuration ---

DATASET_NAME = "kostis-init/CP-Bench"

# Column names in the Hugging Face dataset for problem identifier and model script
PROBLEM_NAME_COLUMN = "id"
MODEL_CODE_COLUMN = "model"

# Timeout for running individual model scripts (both generated and modified ground-truth)
SCRIPT_EXECUTION_TIMEOUT = 60  # seconds


def extract_json_from_string(text_output: str):
    """
    Attempts to find and parse the first valid JSON object or array from a string.
    Handles cases where JSON is preceded or followed by non-JSON text.
    """
    idx = 0
    while idx < len(text_output):
        # Find the next potential start of a JSON structure
        start_brace = text_output.find('{', idx)
        start_bracket = text_output.find('[', idx)

        if start_brace == -1 and start_bracket == -1:
            # No more '{' or '[' found in the rest of the string
            return None

        # Determine the actual starting character for this attempt
        if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
            json_start_index = start_brace
        else:
            json_start_index = start_bracket

        potential_json_segment = text_output[json_start_index:]

        try:
            # Use raw_decode to parse the first valid JSON object from the segment
            decoder = json.JSONDecoder()
            json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
            # Successfully parsed a JSON object
            return json_obj
        except json.JSONDecodeError:
            # This segment (starting at json_start_index) wasn't a valid JSON.
            # Advance the search index past the character that caused the current attempt.
            idx = json_start_index + 1

    return None  # No valid JSON found in the entire string


def run_instance(instance_path_str: str,
                 timeout: int = SCRIPT_EXECUTION_TIMEOUT):
    """Run the instance file and robustly capture the JSON output."""
    command = [sys.executable, instance_path_str]
    instance_name = Path(instance_path_str).name
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
                                errors='replace')

        # Check return code first
        if result.returncode != 0:
            # Log stderr for debugging if the script itself failed
            error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
            print(f"  ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
            return None

        # Attempt to extract JSON from stdout
        stdout_text = result.stdout
        if not stdout_text or not stdout_text.strip():
            print(f"  ERROR: No stdout from {instance_name}.", flush=True)
            return None

        solution = extract_json_from_string(stdout_text)

        if solution is None:
            # Be more verbose if JSON extraction fails
            abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300]  # Show newlines as \n for brevity
            print(
                f"  ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
                flush=True)
            return None

        return solution

    except subprocess.TimeoutExpired:
        print(f"  ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
        return None
    except Exception as e:
        print(f"  ERROR: Unexpected error running {instance_name}: {e}", flush=True)
        return None


def add_constraints_as_string(solution):
    """Generate constraints as a string to be added to the original script."""
    constraints = ""
    if solution:  # Ensure solution is not None
        for key, value in solution.items():
            # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
            if isinstance(value, str):
                constraints += f"\nmodel += ({key} == \"{value}\")"
            else:
                constraints += f"\nmodel += ({key} == {value})"
    return constraints


def get_modified_script(script_content, solution):
    """Add constraints to the script content and self-consistency checks."""
    constraints_str = add_constraints_as_string(solution)
    modified_script = f"{script_content}\n{constraints_str}"
    modified_script += """

# --- Self-consistency check appended by eval.py ---
# Print the absolute path of the current directory along with the script name
import os
# print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}")  # Optional debug

# Keep old objective
old_objective_value = None
objective_defined = False
if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
    try:
        # This block assumes 'model' is the CPMpy model object or similar
        # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
        # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
        # This part might need adjustment based on the specific modeling library used in CP-Bench.
        # For now, we'll try to get it and catch errors.
        # A more robust way might be to inspect model.objective_
        if hasattr(model, '_objective_value'):  # cpmpy specific check if objective was set
            if model._objective_value is not None:  # cpmpy does not have objective_is_min
                objective_defined = True
                old_objective_value = model.objective_value()

    except Exception as e_obj_check:
        # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
        pass  # Objective might not be set or model not solved yet.

# Check self-consistency
solved_ok = False
try:
    if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
        solved_ok = model.solve()
    else:
        print('ERROR: Model object not found or does not have a solve() method.')
except Exception as e_solve:
    print(f'ERROR: Exception during model.solve(): {e_solve}')
    solved_ok = False  # Ensure it's false on exception

if not solved_ok:
    print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
else:
    print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')

# Check if the objective value is the same
if not objective_defined:
    print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
else:
    try:
        current_objective_value = model.objective_value()
        # Handle potential floating point inaccuracies if objectives can be floats
        if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
            if abs(current_objective_value - old_objective_value) < 1e-6:  # Tolerance for float comparison
                print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
            else:
                print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
        elif current_objective_value != old_objective_value:  # Integer comparison
            print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
        else:
            print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
    except Exception as e_obj_final:
        print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')

"""
    return modified_script


# --- Main Evaluation Logic ---
def main(submission_path_str: str, results_base_dir_str: str):
    start_time = time.time()
    print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
    print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
    print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)

    submission_path = Path(submission_path_str)
    submission_name = submission_path.name
    result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
    os.makedirs(result_dir_for_submission, exist_ok=True)
    summary_file_path = result_dir_for_submission / "summary.txt"

    # Load ground-truth dataset
    try:
        # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
        gt_dataset = load_dataset(DATASET_NAME, split="train")
        ground_truth_models = {
            item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
            for item in gt_dataset
            if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
        }
        if not ground_truth_models:
            raise ValueError(
                f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
        print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
    except Exception as e:
        print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
        with open(summary_file_path, "w") as f:
            f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
        return 1  # Indicate failure

    # Statistics
    total_submitted_models = 0
    models_ran_successfully = 0
    gt_models_found = 0
    consistency_checks_passed = 0
    objective_checks_passed = 0  # Includes "NO_OBJECTIVE_DEFINED" as a pass

    with open(summary_file_path, "w") as summary_f:
        summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
        summary_f.write(
            f"Ground-Truth Dataset: {DATASET_NAME}\n")
        summary_f.write("-" * 30 + "\n")

        submitted_model_files = list(submission_path.glob('*.py'))  # Assuming Python models
        if not submitted_model_files:
            summary_f.write("No .py model files found in submission.\n")
            print("eval.py: No .py model files found in submission.", flush=True)
            return 0  # No models to evaluate, but script ran.

        for model_file_path in submitted_model_files:
            total_submitted_models += 1
            problem_name = model_file_path.stem  # Filename without .py extension
            print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
            summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")

            # 1. Run the submitted model to get its solution
            summary_f.write("  1. Running submitted model...\n")
            generated_solution = run_instance(str(model_file_path))
            if generated_solution is None:
                summary_f.write("     - FAILED to run or get valid JSON solution from submitted model.\n")
                continue  # Move to the next model
            models_ran_successfully += 1
            summary_f.write(f"     - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")

            # 2. Find corresponding ground-truth model
            summary_f.write(f"  2. Checking against ground-truth for '{problem_name}'...\n")
            if problem_name not in ground_truth_models:
                summary_f.write(f"     - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
                print(f"  WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
                continue
            gt_models_found += 1
            ground_truth_script_content = ground_truth_models[problem_name]
            summary_f.write("     - SUCCESS: Found ground-truth model.\n")

            # 3. Modify ground-truth script with solution and run self-consistency check
            summary_f.write("  3. Performing self-consistency check on ground-truth model...\n")
            modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)

            consistency_passed_this_model = False
            objective_passed_this_model = False

            try:
                with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
                    tmp_file.write(modified_gt_script)
                    tmp_file_path_str = tmp_file.name

                # Run the modified ground-truth script
                gt_check_result = subprocess.run(
                    [sys.executable, tmp_file_path_str],
                    capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT
                )
                os.unlink(tmp_file_path_str)  # Clean up temp file

                # 4. Parse output of modified ground-truth
                gt_stdout = gt_check_result.stdout
                gt_stderr = gt_check_result.stderr
                # summary_f.write(f"     Modified GT STDOUT: {gt_stdout[:500]}...\n")  # For debugging
                if gt_stderr:
                    summary_f.write(f"     Modified GT STDERR: {gt_stderr[:500]}...\n")

                if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
                    summary_f.write("     - CONSISTENCY: PASSED\n")
                    consistency_checks_passed += 1
                    consistency_passed_this_model = True
                elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
                    summary_f.write("     - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
                else:
                    summary_f.write("     - CONSISTENCY: FAILED (Could not determine consistency from output)\n")

                if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
                        "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
                    summary_f.write("     - OBJECTIVE: PASSED (Consistent or no objective)\n")
                    objective_checks_passed += 1
                    objective_passed_this_model = True
                elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
                    summary_f.write(f"     - OBJECTIVE: FAILED (Value changed)\n")
                elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
                    summary_f.write(f"     - OBJECTIVE: FAILED (Error accessing final objective)\n")
                else:
                    summary_f.write("     - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")

            except subprocess.TimeoutExpired:
                summary_f.write(
                    f"     - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
                print(f"  ERROR: Timeout running modified GT for {problem_name}", flush=True)
            except Exception as e_gt_run:
                summary_f.write(
                    f"     - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
                print(f"  ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)

        # Final statistics
        summary_f.write("\n" + "=" * 30 + "\n")
        summary_f.write("Overall Evaluation Statistics:\n")
        summary_f.write(f"  Total Submitted Models Parsed: {total_submitted_models}\n")
        summary_f.write(
            f"  Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
        summary_f.write(
            f"  Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
        summary_f.write(f"  Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
        summary_f.write(f"  Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")

        # Define an overall score, e.g. number of models that passed both checks against found GT
        fully_passed_models = 0
        # This needs re-evaluation logic, but for now let's say a score is consistency+objective passes
        # This simple score is just the sum of passes, could be more nuanced
        overall_score = consistency_checks_passed + objective_checks_passed
        summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n")  # For the Gradio app to parse

    elapsed_time = time.time() - start_time
    print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
    print(f"eval.py: Summary written to {summary_file_path}", flush=True)
    return 0  # Success


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
        print("Example: python eval.py ./submissions/my_run ./results")
        sys.exit(1)

    submission_dir = sys.argv[1]
    results_base_dir = sys.argv[2]

    # Simple check if submission_dir exists
    if not Path(submission_dir).is_dir():
        print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
        sys.exit(1)

    exit_code = main(submission_dir, results_base_dir)
    sys.exit(exit_code)
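To make the self-consistency check concrete: for a hypothetical submitted solution {"x": 2, "y": 3}, add_constraints_as_string() produces the constraint lines that get_modified_script() appends to the ground-truth model before re-solving it, and main() then looks for the EVAL_OUTPUT markers in the re-run's stdout.

# Worked illustration of the constraint injection above; the solution dict is made up.
solution = {"x": 2, "y": 3}
constraints = ""
for key, value in solution.items():
    constraints += f"\nmodel += ({key} == {value})"
print(constraints)
# Text appended to the ground-truth script:
#   model += (x == 2)
#   model += (y == 3)
# After re-solving, the appended check prints markers such as
#   EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS
#   EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED
# which main() searches for in the captured stdout.
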
requirements.txt CHANGED
@@ -13,4 +13,5 @@ python-dateutil
  tqdm
  transformers
  tokenizers>=0.15.0
- sentencepiece
+ sentencepiece
+ cpmpy
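cpmpy is the new runtime dependency for the evaluation path: both the submitted model files and the CP-Bench ground-truth scripts are expected to define a CPMpy `model`, which the block appended by eval.py re-solves. Below is a minimal sketch of a submission file that run_instance() could evaluate; the variables, bounds, and constraints are invented for illustration.

# my_problem.py -- hypothetical submitted model; prints its solution as JSON,
# which is the format run_instance()/extract_json_from_string() expect on stdout.
import json
from cpmpy import Model, intvar

x = intvar(0, 10, name="x")
y = intvar(0, 10, name="y")
model = Model([x + y == 7, x > y])

if model.solve():
    print(json.dumps({"x": int(x.value()), "y": int(y.value())}))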