kostis-init committed
Commit b5712a3 · 1 Parent(s): 083c72d

replace with simpler app

Files changed (4)
  1. app.py +230 -221
  2. backup_app_.py +320 -0
  3. eval.py +356 -0
  4. requirements.txt +2 -1
app.py CHANGED
@@ -1,236 +1,245 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
-
- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
  )

- # demo = gr.Blocks(css=custom_css)
- demo = gr.Blocks()

- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Simple Submit here!", elem_id="llm-benchmark-tab-table", id=4):
-             gr.Markdown(
-                 "## Submit your generated models here!",
-                 elem_classes="markdown-text",
-             )
              upload_button = gr.UploadButton(
-                 label="Upload your generated models (only directories accepted)",
-                 size="lg",
                  file_count="directory",
-                 elem_id="upload-button",
-                 file_types=["text"],
              )
-
-             # when the directory is uploaded, we need to save it under the submissions folder
-             def upload_directory(directory):
-                 # Save the directory to the EVAL_REQUESTS_PATH
-                 directory_path = directory.name
-                 if directory_path:
-                     # Move the uploaded directory to the desired location
-                     import shutil
-
-                     shutil.move(directory_path, "submissions/")
-                     return f"Directory {directory_path} uploaded successfully!"
-                 else:
-                     return "No directory uploaded."
-
-             upload_button.upload(fn=upload_directory, inputs=upload_button, outputs=None)
-
-
-         with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
              )

-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )

- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()

  import gradio as gr
  import pandas as pd
+ import os
+ import shutil
+ from pathlib import Path
+ import subprocess  # For running eval.py
+ import time
+ import threading  # For background tasks
+ import sys
+
+ # --- Configuration ---
+ SUBMISSIONS_DIR = "submissions"
+ RESULTS_DIR = "results"
+ EVAL_SCRIPT_PATH = "eval.py"
+
+
+ # --- Helper Functions ---
+
+ def setup_directories():
+     """Creates the submissions and results directories if they don't exist."""
+     os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
+     os.makedirs(RESULTS_DIR, exist_ok=True)
+     if not os.listdir(RESULTS_DIR):  # Add a placeholder if results is empty
+         initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
+         if not initial_result_demo_path.exists():
+             os.makedirs(initial_result_demo_path, exist_ok=True)
+             with open(initial_result_demo_path / "summary.txt", "w") as f:
+                 f.write("This is a placeholder initial result.\nScore: 0\n")
+             print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
+
+
+ def load_leaderboard_data():
+     """
+     Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
+     Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
+     """
+     if not os.path.exists(RESULTS_DIR):
+         return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
+
+     result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
+
+     leaderboard_entries = []
+     # Sort by modification time of the directory (newest first)
+     # This requires getting mtime for each directory.
+     sorted_result_dirs = sorted(
+         result_dirs,
+         key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
+         reverse=True
+     )
+
+     for dir_name in sorted_result_dirs:
+         entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
+         result_dir_path = Path(RESULTS_DIR) / dir_name
+
+         try:
+             entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
+         except Exception:
+             pass  # Directory might have been removed during scan
+
+         summary_file = result_dir_path / "summary.txt"
+         if summary_file.exists():
+             try:
+                 with open(summary_file, "r") as f:
+                     for line in f:
+                         if line.lower().startswith("score:"):
+                             entry["Score"] = line.split(":", 1)[1].strip()
+                             break
+             except Exception as e:
+                 print(f"Error parsing summary for {dir_name}: {e}")
+
+         leaderboard_entries.append(entry)
+
+     if not leaderboard_entries:
+         return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
+
+     return pd.DataFrame(leaderboard_entries)
+
+
+ def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
+     """
+     This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
+     Outputs from eval.py will go to the console where app.py is running.
+     """
+     print(
+         f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")
+
+     if not Path(EVAL_SCRIPT_PATH).exists():
+         print(
+             f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
+         return
+
+     command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
+
+     try:
+         # Using subprocess.run which is simpler for blocking calls within this thread
+         process = subprocess.run(
+             command,
+             capture_output=True,
+             text=True,
+             check=False,  # Handle non-zero exit codes manually
+             timeout=300  # 5-minute timeout for the evaluation script
+         )
+
+         eval_output = process.stdout.strip()
+         eval_error = process.stderr.strip()
+
+         print(
+             f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
+         if eval_error:  # Only print stderr if it's not empty
+             print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")
+
+         if process.returncode == 0:
+             print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
+         else:
+             print(
+                 f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
+
+     except subprocess.TimeoutExpired:
+         print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
+     except FileNotFoundError:  # This means 'python' or EVAL_SCRIPT_PATH could not be found by subprocess
+         print(
+             f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure 'python' is in PATH and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
+     except Exception as e:
+         print(
+             f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")
+
+     print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
+
+
+ def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
+     """
+     Handles directory upload, saves files, and starts eval.py in a background thread.
+     Yields a status message for the UI. The leaderboard updates separately.
+     """
+     yield "Processing upload..."  # Initial status
+
+     if not uploaded_files_list:
+         yield "No directory uploaded. Please select a directory."
+         return
+
+     try:
+         # Determine original uploaded directory name
+         first_temp_file_path = Path(uploaded_files_list[0].name)
+         original_uploaded_dir_name = first_temp_file_path.parent.name
+
+         submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name
+
+         # Handle potential name collision
+         if submission_dir_path.exists():
+             timestamp = time.strftime("%Y%m%d-%H%M%S")
+             descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
+             submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
+             status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
+             original_uploaded_dir_name = descriptive_name_for_log_and_status  # Use new name for logging
+         else:
+             descriptive_name_for_log_and_status = original_uploaded_dir_name
+             status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."
+
+         os.makedirs(submission_dir_path, exist_ok=True)
+         progress(0.1, desc=status_update_msg)
+
+         for i, temp_file_obj in enumerate(progress.tqdm(uploaded_files_list, desc="Copying files")):
+             temp_file_path = Path(temp_file_obj.name)
+             file_name_in_dir = temp_file_path.name
+             target_file_path = submission_dir_path / file_name_in_dir
+             shutil.copy(str(temp_file_path), str(target_file_path))
+
+         upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
+         progress(0.8, desc=upload_completion_msg)
+
+     except Exception as e:
+         yield f"Error during upload: {str(e)}"
+         return
+
+     # --- Start evaluation in a background thread ---
+     if not Path(EVAL_SCRIPT_PATH).exists():
+         yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
+         return
+
+     # Ensure paths passed to thread are absolute strings, good practice for threads.
+     abs_submission_path = str(submission_dir_path.resolve())
+     abs_results_path = str(Path(RESULTS_DIR).resolve())
+
+     eval_thread = threading.Thread(
+         target=run_evaluation_in_background,
+         args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
+         daemon=True  # Set as daemon so it exits when main app exits
+     )
+     eval_thread.start()
+
+     final_status_msg = (
+         f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
+         "The leaderboard will auto-refresh (or use manual refresh)."
+     )
+     progress(1.0, desc="Background evaluation initiated.")
+     yield final_status_msg
+
+
+ # --- Create Directories ---
+ setup_directories()
+
+ # --- Gradio App Definition ---
+ with gr.Blocks(title="Background Submission, Evaluation, and Leaderboard") as demo:
+     gr.Markdown("# Background Submission, Evaluation & Results")
+     gr.Markdown(
+         f"Upload submissions (directories) to **'{SUBMISSIONS_DIR}'**. "
+         f"The evaluation script (`{EVAL_SCRIPT_PATH}`) will process them in the background. "
+         f"Results appear in **'{RESULTS_DIR}'**. The leaderboard auto-refreshes."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):  # Upload Column
+             gr.Markdown("## 📤 Upload & Evaluate Submission")
              upload_button = gr.UploadButton(
+                 "Click to Upload Directory for Evaluation",
                  file_count="directory",
              )
+             upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)
+
+         with gr.Column(scale=2):  # Leaderboard Column
+             gr.Markdown("## 🏆 Results Leaderboard")
+             leaderboard_df_component = gr.DataFrame(
+                 value=load_leaderboard_data,  # Load initial data
+                 label="Leaderboard (auto-refreshes)",
+                 interactive=False,
+                 # every=20  # Auto-refresh leaderboard data every 20 seconds
              )
+             refresh_leaderboard_button = gr.Button("🔄 Refresh Leaderboard Manually")
+
+     # --- Event Handlers ---
+     upload_button.upload(
+         fn=handle_upload_and_kickoff_eval,
+         inputs=[upload_button],
+         outputs=[upload_status_textbox],  # Only one output now for the status message
+         show_progress="full"
+     )
+
+     refresh_leaderboard_button.click(
+         fn=load_leaderboard_data,
+         inputs=None,
+         outputs=[leaderboard_df_component]
+     )
+
+ if __name__ == "__main__":
+     demo.queue().launch()
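A minimal sketch of driving the same pipeline without the UI, assuming a submission directory submissions/my_run already exists (the directory name is illustrative; this mirrors the subprocess call in run_evaluation_in_background and the Score parsing in load_leaderboard_data, and is not part of the commit itself):

# run_eval_locally.py -- illustrative sketch, mirrors app.py's background call
import subprocess
import sys
from pathlib import Path

submission = Path("submissions") / "my_run"   # hypothetical submission directory
results = Path("results")

# Same command the background thread builds: python eval.py <submission> <results>
proc = subprocess.run(
    [sys.executable, "eval.py", str(submission.resolve()), str(results.resolve())],
    capture_output=True, text=True, timeout=300,
)
print(proc.stdout)

# eval.py writes results/<submission>_result/summary.txt with a "Score: ..." line,
# which load_leaderboard_data() later surfaces on the leaderboard.
summary = results / f"{submission.name}_result" / "summary.txt"
if summary.exists():
    for line in summary.read_text().splitlines():
        if line.lower().startswith("score:"):
            print("Parsed score:", line.split(":", 1)[1].strip())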
 
 
backup_app_.py ADDED
@@ -0,0 +1,320 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+ import shutil  # For file operations
+ from pathlib import Path  # For path manipulations
+
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+
+
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+ def init_leaderboard(dataframe):
+     if dataframe is None or dataframe.empty:
+         raise ValueError("Leaderboard DataFrame is empty or None.")
+     return Leaderboard(
+         value=dataframe,
+         datatype=[c.type for c in fields(AutoEvalColumn)],
+         select_columns=SelectColumns(
+             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+             label="Select Columns to Display:",
+         ),
+         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+         filter_columns=[
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+             ColumnFilter(
+                 AutoEvalColumn.params.name,
+                 type="slider",
+                 min=0.01,
+                 max=150,
+                 label="Select the number of parameters (B)",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+             ),
+         ],
+         bool_checkboxgroup_label="Hide models",
+         interactive=False,
+     )
+
+
+ # --- Function to handle the uploaded directory ---
+ def save_uploaded_models(files):
+     if files:
+         saved_paths = []
+         # 'files' will be a list of temporary file paths when file_count="directory"
+         # The actual files are in a temporary directory.
+         # We want to recreate the structure within UPLOAD_DIR.
+
+         # Assuming 'files' contains full paths to files within a single uploaded directory
+         # We need to determine the base name of the uploaded directory.
+         # Gradio often provides a list of file objects. Each object has a .name attribute (path).
+         # Example: if user uploads "my_run_1" containing "model.txt" and "config.json"
+         # files might be like: ['/tmp/gradio/somerandomhash/my_run_1/model.txt', '/tmp/gradio/somerandomhash/my_run_1/config.json']
+         # Or it might be a list of tempfile._TemporaryFileWrapper objects.
+
+         if not isinstance(files, list):
+             files = [files]  # Ensure it's a list
+
+         # Let's assume `files` is a list of `tempfile._TemporaryFileWrapper` or similar
+         # where `file_obj.name` gives the temporary path to each file.
+
+         # Get the common parent directory from the temporary paths if possible,
+         # or derive the uploaded folder name from one of the paths.
+         # This part can be tricky depending on exactly how Gradio passes directory uploads.
+         # A robust way is to create a unique sub-directory for each upload.
+
+         # Let's get the name of the directory the user uploaded.
+         # With file_count="directory", `files` is a list of file paths.
+         # We can infer the uploaded directory name from the first file path.
+         if files:
+             first_file_path = Path(files[0].name if hasattr(files[0], 'name') else files[0])
+             # The uploaded directory name would be the parent of the files if Gradio flattens it,
+             # or the parent of the temp directory housing the uploaded folder.
+             # For simplicity, let's try to get the original uploaded folder name.
+             # Gradio's `UploadButton` usually puts uploaded directories into a subdirectory
+             # within the temp space that has the same name as the original uploaded directory.
+             # e.g., if user uploads "my_models_run1", files might be in /tmp/somehash/my_models_run1/file1.txt
+
+             # A common approach: find the common prefix of all file paths,
+             # then determine the uploaded directory's name from that.
+             # However, Gradio's behavior is that `files` is a list of file objects,
+             # each with a `.name` attribute that is the full path to a temporary file.
+             # These temporary files are often placed inside a directory that *itself*
+             # represents the uploaded directory structure.
+
+             # Let's assume the user uploaded a directory named "user_uploaded_dir"
+             # And it contains "model1.txt" and "model2.txt"
+             # `files` might be `[<temp_file_obj_for_model1>, <temp_file_obj_for_model2>]`
+             # `files[0].name` might be `/tmp/gradio_guid/user_uploaded_dir/model1.txt`
+
+             # We need to extract "user_uploaded_dir"
+             # And then recreate this structure under UPLOAD_DIR.
+
+             # Assuming the first file gives us a good representation of the path structure.
+             temp_file_path = Path(files[0].name if hasattr(files[0], 'name') else files[0])
+             # The uploaded directory's name is usually the second to last part of the temp path
+             # e.g. /tmp/tmpxyz/uploaded_dir_name/file.txt -> "uploaded_dir_name"
+             uploaded_dir_name = temp_file_path.parent.name
+
+             destination_folder_path = Path(UPLOAD_DIR) / uploaded_dir_name
+             os.makedirs(destination_folder_path, exist_ok=True)
+
+             for uploaded_file_obj in files:
+                 # Get the path to the temporary file
+                 temp_path_str = uploaded_file_obj.name
+                 temp_path = Path(temp_path_str)
+
+                 # Get the original filename (relative to the uploaded directory)
+                 # This should be just the filename itself if Gradio preserves the structure
+                 # correctly inside the temp directory for the uploaded folder.
+                 original_filename = temp_path.name  # e.g., "model1.txt"
+
+                 destination_file_path = destination_folder_path / original_filename
+
+                 try:
+                     shutil.copy(temp_path_str, destination_file_path)
+                     saved_paths.append(str(destination_file_path))
+                 except Exception as e:
+                     print(f"Error copying {temp_path_str} to {destination_file_path}: {e}")
+                     return f"Error saving files: {e}"
+
+         if saved_paths:
+             return f"Successfully uploaded and saved models to: {destination_folder_path}"
+         else:
+             return "No files were saved."
+     return "No files uploaded."
+
+
+ # demo = gr.Blocks(css=custom_css)
+ demo = gr.Blocks()
+
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Simple Submit here!", elem_id="llm-benchmark-tab-table", id=4):
+             gr.Markdown(
+                 "## Submit your generated models here!",
+                 elem_classes="markdown-text",
+             )
+             upload_button = gr.UploadButton(
+                 label="Upload your generated models (only directories accepted)",
+                 size="lg",
+                 file_count="directory",
+                 elem_id="upload-button",
+             )
+             # Add an output component to display the result of the upload
+             upload_status = gr.Textbox(label="Upload Status", interactive=False)
+
+             # Connect the upload_button to the save_uploaded_models function
+             upload_button.upload(save_uploaded_models, upload_button, upload_status)
+
+
+         with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             submit_button = gr.Button("Submit Eval")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()
eval.py ADDED
@@ -0,0 +1,356 @@
+ # eval.py
+ import sys
+ import os
+ import time
+ import json
+ import subprocess
+ import tempfile
+ from pathlib import Path
+ from datasets import load_dataset  # Hugging Face datasets library
+
+ # --- Configuration ---
+
+ DATASET_NAME = "kostis-init/CP-Bench"
+
+ # Column names in the Hugging Face dataset for problem identifier and model script
+ PROBLEM_NAME_COLUMN = "id"
+ MODEL_CODE_COLUMN = "model"
+
+ # Timeout for running individual model scripts (both generated and modified ground-truth)
+ SCRIPT_EXECUTION_TIMEOUT = 60  # seconds
+
+
+ def extract_json_from_string(text_output: str):
+     """
+     Attempts to find and parse the first valid JSON object or array from a string.
+     Handles cases where JSON is preceded or followed by non-JSON text.
+     """
+     idx = 0
+     while idx < len(text_output):
+         # Find the next potential start of a JSON structure
+         start_brace = text_output.find('{', idx)
+         start_bracket = text_output.find('[', idx)
+
+         if start_brace == -1 and start_bracket == -1:
+             # No more '{' or '[' found in the rest of the string
+             return None
+
+         # Determine the actual starting character for this attempt
+         if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
+             json_start_index = start_brace
+         else:
+             json_start_index = start_bracket
+
+         potential_json_segment = text_output[json_start_index:]
+
+         try:
+             # Use raw_decode to parse the first valid JSON object from the segment
+             decoder = json.JSONDecoder()
+             json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
+             # Successfully parsed a JSON object
+             return json_obj
+         except json.JSONDecodeError:
+             # This segment (starting at json_start_index) wasn't a valid JSON.
+             # Advance the search index past the character that caused the current attempt.
+             idx = json_start_index + 1
+
+     return None  # No valid JSON found in the entire string
+
+
+ def run_instance(instance_path_str: str,
+                  timeout: int = SCRIPT_EXECUTION_TIMEOUT):  # SCRIPT_EXECUTION_TIMEOUT should be defined
+     """Run the instance file and robustly capture the JSON output."""
+     command = [sys.executable, instance_path_str]
+     instance_name = Path(instance_path_str).name
+     try:
+         result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
+                                 errors='replace')
+
+         # Check return code first
+         if result.returncode != 0:
+             # Log stderr for debugging if the script itself failed
+             error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
+             print(f"  ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
+             return None
+
+         # Attempt to extract JSON from stdout
+         stdout_text = result.stdout
+         if not stdout_text or not stdout_text.strip():
+             print(f"  ERROR: No stdout from {instance_name}.", flush=True)
+             return None
+
+         solution = extract_json_from_string(stdout_text)
+
+         if solution is None:
+             # Be more verbose if JSON extraction fails
+             abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300]  # Show newlines as \n for brevity
+             print(
+                 f"  ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
+                 flush=True)
+             return None
+
+         return solution
+
+     except subprocess.TimeoutExpired:
+         print(f"  ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
+         return None
+     except Exception as e:
+         print(f"  ERROR: Unexpected error running {instance_name}: {e}", flush=True)
+         return None
+
+
+ def add_constraints_as_string(solution):
+     """Generate constraints as a string to be added to the original script."""
+     constraints = ""
+     if solution:  # Ensure solution is not None
+         for key, value in solution.items():
+             # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
+             if isinstance(value, str):
+                 constraints += f"\nmodel += ({key} == \"{value}\")"
+             else:
+                 constraints += f"\nmodel += ({key} == {value})"
+     return constraints
+
+
+ def get_modified_script(script_content, solution):
+     """Add constraints to the script content and self-consistency checks."""
+     constraints_str = add_constraints_as_string(solution)
+     modified_script = f"{script_content}\n{constraints_str}"
+     modified_script += """
+
+ # --- Self-consistency check appended by eval.py ---
+ # Print the absolute path of the current directory along with the script name
+ import os
+ # print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}")  # Optional debug
+
+ # Keep old objective
+ old_objective_value = None
+ objective_defined = False
+ if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
+     try:
+         # This block assumes 'model' is the CPMpy model object or similar
+         # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
+         # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
+         # This part might need adjustment based on the specific modeling library used in CP-Bench.
+         # For now, we'll try to get it and catch errors.
+         # A more robust way might be to inspect model.objective_
+         if hasattr(model, '_objective_value'):  # cpmpy specific check if objective was set
+             if model._objective_value is not None:  # cpmpy does not have objective_is_min
+                 objective_defined = True
+                 old_objective_value = model.objective_value()
+
+     except Exception as e_obj_check:
+         # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
+         pass  # Objective might not be set or model not solved yet.
+
+ # Check self-consistency
+ solved_ok = False
+ try:
+     if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
+         solved_ok = model.solve()
+     else:
+         print('ERROR: Model object not found or does not have a solve() method.')
+ except Exception as e_solve:
+     print(f'ERROR: Exception during model.solve(): {e_solve}')
+     solved_ok = False  # Ensure it's false on exception
+
+ if not solved_ok:
+     print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
+ else:
+     print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')
+
+ # Check if the objective value is the same
+ if not objective_defined:
+     print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
+ else:
+     try:
+         current_objective_value = model.objective_value()
+         # Handle potential floating point inaccuracies if objectives can be floats
+         if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
+             if abs(current_objective_value - old_objective_value) < 1e-6:  # Tolerance for float comparison
+                 print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
+             else:
+                 print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
+         elif current_objective_value != old_objective_value:  # Integer comparison
+             print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
+         else:
+             print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
+     except Exception as e_obj_final:
+         print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')
+
+ """
+     return modified_script
+
+
+ # --- Main Evaluation Logic ---
+ def main(submission_path_str: str, results_base_dir_str: str):
+     start_time = time.time()
+     print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
+     print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
+     print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)
+
+     submission_path = Path(submission_path_str)
+     submission_name = submission_path.name
+     result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
+     os.makedirs(result_dir_for_submission, exist_ok=True)
+     summary_file_path = result_dir_for_submission / "summary.txt"
+
+     # Load ground-truth dataset
+     try:
+         # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
+         gt_dataset = load_dataset(DATASET_NAME, split="train")
+         ground_truth_models = {
+             item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
+             for item in gt_dataset
+             if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
+         }
+         if not ground_truth_models:
+             raise ValueError(
+                 f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
+         print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
+     except Exception as e:
+         print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
+         with open(summary_file_path, "w") as f:
+             f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
+         return 1  # Indicate failure
+
+     # Statistics
+     total_submitted_models = 0
+     models_ran_successfully = 0
+     gt_models_found = 0
+     consistency_checks_passed = 0
+     objective_checks_passed = 0  # Includes "NO_OBJECTIVE_DEFINED" as a pass
+
+     with open(summary_file_path, "w") as summary_f:
+         summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
+         summary_f.write(
+             f"Ground-Truth Dataset: {DATASET_NAME}\n")
+         summary_f.write("-" * 30 + "\n")
+
+         submitted_model_files = list(submission_path.glob('*.py'))  # Assuming Python models
+         if not submitted_model_files:
+             summary_f.write("No .py model files found in submission.\n")
+             print("eval.py: No .py model files found in submission.", flush=True)
+             return 0  # No models to evaluate, but script ran.
+
+         for model_file_path in submitted_model_files:
+             total_submitted_models += 1
+             problem_name = model_file_path.stem  # Filename without .py extension
+             print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
+             summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
+
+             # 1. Run the submitted model to get its solution
+             summary_f.write("  1. Running submitted model...\n")
+             generated_solution = run_instance(str(model_file_path))
+             if generated_solution is None:
+                 summary_f.write("    - FAILED to run or get valid JSON solution from submitted model.\n")
+                 continue  # Move to the next model
+             models_ran_successfully += 1
+             summary_f.write(f"    - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")
+
+             # 2. Find corresponding ground-truth model
+             summary_f.write(f"  2. Checking against ground-truth for '{problem_name}'...\n")
+             if problem_name not in ground_truth_models:
+                 summary_f.write(f"    - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
+                 print(f"  WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
+                 continue
+             gt_models_found += 1
+             ground_truth_script_content = ground_truth_models[problem_name]
+             summary_f.write("    - SUCCESS: Found ground-truth model.\n")
+
+             # 3. Modify ground-truth script with solution and run self-consistency check
+             summary_f.write("  3. Performing self-consistency check on ground-truth model...\n")
+             modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
+
+             consistency_passed_this_model = False
+             objective_passed_this_model = False
+
+             try:
+                 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
+                     tmp_file.write(modified_gt_script)
+                     tmp_file_path_str = tmp_file.name
+
+                 # Run the modified ground-truth script
+                 gt_check_result = subprocess.run(
+                     [sys.executable, tmp_file_path_str],
+                     capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT
+                 )
+                 os.unlink(tmp_file_path_str)  # Clean up temp file
+
+                 # 4. Parse output of modified ground-truth
+                 gt_stdout = gt_check_result.stdout
+                 gt_stderr = gt_check_result.stderr
+                 # summary_f.write(f"    Modified GT STDOUT: {gt_stdout[:500]}...\n")  # For debugging
+                 if gt_stderr:
+                     summary_f.write(f"    Modified GT STDERR: {gt_stderr[:500]}...\n")
+
+                 if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
+                     summary_f.write("    - CONSISTENCY: PASSED\n")
+                     consistency_checks_passed += 1
+                     consistency_passed_this_model = True
+                 elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
+                     summary_f.write("    - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
+                 else:
+                     summary_f.write("    - CONSISTENCY: FAILED (Could not determine consistency from output)\n")
+
+                 if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
+                         "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
+                     summary_f.write("    - OBJECTIVE: PASSED (Consistent or no objective)\n")
+                     objective_checks_passed += 1
+                     objective_passed_this_model = True
+                 elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
+                     summary_f.write(f"    - OBJECTIVE: FAILED (Value changed)\n")
+                 elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
+                     summary_f.write(f"    - OBJECTIVE: FAILED (Error accessing final objective)\n")
+                 else:
+                     summary_f.write("    - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")
+
+             except subprocess.TimeoutExpired:
+                 summary_f.write(
+                     f"    - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
+                 print(f"  ERROR: Timeout running modified GT for {problem_name}", flush=True)
+             except Exception as e_gt_run:
+                 summary_f.write(
+                     f"    - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
+                 print(f"  ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
+
+         # Final statistics
+         summary_f.write("\n" + "=" * 30 + "\n")
+         summary_f.write("Overall Evaluation Statistics:\n")
+         summary_f.write(f"  Total Submitted Models Parsed: {total_submitted_models}\n")
+         summary_f.write(
+             f"  Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
+         summary_f.write(
+             f"  Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
+         summary_f.write(f"  Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
+         summary_f.write(f"  Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
+
+         # Define an overall score, e.g. number of models that passed both checks against found GT
+         fully_passed_models = 0
+         # This needs re-evaluation logic, but for now let's say a score is consistency+objective passes
+         # This simple score is just the sum of passes, could be more nuanced
+         overall_score = consistency_checks_passed + objective_checks_passed
+         summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n")  # For Gradio app to parse
+
+     elapsed_time = time.time() - start_time
+     print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
+     print(f"eval.py: Summary written to {summary_file_path}", flush=True)
+     return 0  # Success
+
+
+ if __name__ == "__main__":
+     if len(sys.argv) < 3:
+         print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
+         print("Example: python eval.py ./submissions/my_run ./results")
+         sys.exit(1)
+
+     submission_dir = sys.argv[1]
+     results_base_dir = sys.argv[2]
+
+     # Simple check if submission_dir exists
+     if not Path(submission_dir).is_dir():
+         print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
+         sys.exit(1)
+
+     exit_code = main(submission_dir, results_base_dir)
+     sys.exit(exit_code)
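For context, eval.py treats each *.py file in the submission directory as a standalone model: it must print its solution as a single JSON object on stdout (extract_json_from_string picks up the first one), and the keys must match variable names in the corresponding CP-Bench ground-truth model so the appended `model += (name == value)` constraints are well formed, with the filename stem matching the dataset's `id` column. A minimal sketch of such a file, assuming CPMpy (added to requirements.txt below); the problem id and variables are illustrative, not from the commit:

# submissions/my_run/my_problem.py -- illustrative submission sketch.
# The filename stem ("my_problem") must match the `id` column in kostis-init/CP-Bench.
import json

from cpmpy import Model, intvar

# Toy decision variables; a real submission models the CP-Bench problem of the same id.
x = intvar(0, 10, name="x")
y = intvar(0, 10, name="y")

model = Model(x + y <= 10)
model.maximize(x + 2 * y)

if model.solve():
    # eval.py extracts the first JSON object from stdout and turns each key/value
    # pair into a `model += (key == value)` constraint on the ground-truth model.
    print(json.dumps({"x": int(x.value()), "y": int(y.value())}))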
requirements.txt CHANGED
@@ -13,4 +13,5 @@ python-dateutil
  tqdm
  transformers
  tokenizers>=0.15.0
- sentencepiece
+ sentencepiece
+ cpmpy