kostis-init committed
Commit b5712a3 · 1 Parent(s): 083c72d

replace with simpler app

Files changed (4)
  1. app.py +230 -221
  2. backup_app_.py +320 -0
  3. eval.py +356 -0
  4. requirements.txt +2 -1
app.py CHANGED
@@ -1,236 +1,245 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
-
- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
  )

- # demo = gr.Blocks(css=custom_css)
- demo = gr.Blocks()

- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Simple Submit here!", elem_id="llm-benchmark-tab-table", id=4):
-             gr.Markdown(
-                 "## Submit your generated models here!",
-                 elem_classes="markdown-text",
-             )
              upload_button = gr.UploadButton(
-                 label="Upload your generated models (only directories accepted)",
-                 size="lg",
                  file_count="directory",
-                 elem_id="upload-button",
-                 file_types=["text"],
              )
-
-             # when the directory is uploaded, we need to save it under the submissions folder
-             def upload_directory(directory):
-                 # Save the directory to the EVAL_REQUESTS_PATH
-                 directory_path = directory.name
-                 if directory_path:
-                     # Move the uploaded directory to the desired location
-                     import shutil
-
-                     shutil.move(directory_path, "submissions/")
-                     return f"Directory {directory_path} uploaded successfully!"
-                 else:
-                     return "No directory uploaded."
-
-             upload_button.upload(fn=upload_directory, inputs=upload_button, outputs=None)
-
-
-         with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
              )

-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )

- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()

  import gradio as gr
  import pandas as pd
+ import os
+ import shutil
+ from pathlib import Path
+ import subprocess  # For running eval.py
+ import time
+ import threading  # For background tasks
+ import sys
+
+ # --- Configuration ---
+ SUBMISSIONS_DIR = "submissions"
+ RESULTS_DIR = "results"
+ EVAL_SCRIPT_PATH = "eval.py"
+
+
+ # --- Helper Functions ---
+
+ def setup_directories():
+     """Creates the submissions and results directories if they don't exist."""
+     os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
+     os.makedirs(RESULTS_DIR, exist_ok=True)
+     if not os.listdir(RESULTS_DIR):  # Add a placeholder if results is empty
+         initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
+         if not initial_result_demo_path.exists():
+             os.makedirs(initial_result_demo_path, exist_ok=True)
+             with open(initial_result_demo_path / "summary.txt", "w") as f:
+                 f.write("This is a placeholder initial result.\nScore: 0\n")
+             print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
+
+
+ def load_leaderboard_data():
+     """
+     Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
+     Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
+     """
+     if not os.path.exists(RESULTS_DIR):
+         return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
+
+     result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
+
+     leaderboard_entries = []
+     # Sort by modification time of the directory (newest first)
+     # This requires getting mtime for each directory.
+     sorted_result_dirs = sorted(
+         result_dirs,
+         key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
+         reverse=True
+     )
+
+     for dir_name in sorted_result_dirs:
+         entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
+         result_dir_path = Path(RESULTS_DIR) / dir_name
+
+         try:
+             entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
+         except Exception:
+             pass  # Directory might have been removed during scan
+
+         summary_file = result_dir_path / "summary.txt"
+         if summary_file.exists():
+             try:
+                 with open(summary_file, "r") as f:
+                     for line in f:
+                         if line.lower().startswith("score:"):
+                             entry["Score"] = line.split(":", 1)[1].strip()
+                             break
+             except Exception as e:
+                 print(f"Error parsing summary for {dir_name}: {e}")
+
+         leaderboard_entries.append(entry)
+
+     if not leaderboard_entries:
+         return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
+
+     return pd.DataFrame(leaderboard_entries)
+
+
+ def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
+     """
+     This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
+     Outputs from eval.py will go to the console where app.py is running.
+     """
+     print(
+         f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")
+
+     if not Path(EVAL_SCRIPT_PATH).exists():
+         print(
+             f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
+         return
+
+     command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
+
+     try:
+         # Using subprocess.run which is simpler for blocking calls within this thread
+         process = subprocess.run(
+             command,
+             capture_output=True,
+             text=True,
+             check=False,  # Handle non-zero exit codes manually
+             timeout=300  # 5-minute timeout for the evaluation script
+         )
+
+         eval_output = process.stdout.strip()
+         eval_error = process.stderr.strip()
+
+         print(
+             f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
+         if eval_error:  # Only print stderr if it's not empty
+             print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")
+
+         if process.returncode == 0:
+             print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
+         else:
+             print(
+                 f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
+
+     except subprocess.TimeoutExpired:
+         print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
+     except FileNotFoundError:  # This means 'python' or EVAL_SCRIPT_PATH could not be found by subprocess
+         print(
+             f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure 'python' is in PATH and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
+     except Exception as e:
+         print(
+             f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")
+
+     print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
+
+
+ def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
+     """
+     Handles directory upload, saves files, and starts eval.py in a background thread.
+     Yields a status message for the UI. The leaderboard updates separately.
+     """
+     yield "Processing upload..."  # Initial status
+
+     if not uploaded_files_list:
+         yield "No directory uploaded. Please select a directory."
+         return
+
+     try:
+         # Determine original uploaded directory name
+         first_temp_file_path = Path(uploaded_files_list[0].name)
+         original_uploaded_dir_name = first_temp_file_path.parent.name
+
+         submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name
+
+         # Handle potential name collision
+         if submission_dir_path.exists():
+             timestamp = time.strftime("%Y%m%d-%H%M%S")
+             descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
+             submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
+             status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
+             original_uploaded_dir_name = descriptive_name_for_log_and_status  # Use new name for logging
+         else:
+             descriptive_name_for_log_and_status = original_uploaded_dir_name
+             status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."
+
+         os.makedirs(submission_dir_path, exist_ok=True)
+         progress(0.1, desc=status_update_msg)
+
+         for i, temp_file_obj in enumerate(progress.tqdm(uploaded_files_list, desc="Copying files")):
+             temp_file_path = Path(temp_file_obj.name)
+             file_name_in_dir = temp_file_path.name
+             target_file_path = submission_dir_path / file_name_in_dir
+             shutil.copy(str(temp_file_path), str(target_file_path))
+
+         upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
+         progress(0.8, desc=upload_completion_msg)
+
+     except Exception as e:
+         yield f"Error during upload: {str(e)}"
+         return
+
+     # --- Start evaluation in a background thread ---
+     if not Path(EVAL_SCRIPT_PATH).exists():
+         yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
+         return
+
+     # Ensure paths passed to thread are absolute strings, good practice for threads.
+     abs_submission_path = str(submission_dir_path.resolve())
+     abs_results_path = str(Path(RESULTS_DIR).resolve())
+
+     eval_thread = threading.Thread(
+         target=run_evaluation_in_background,
+         args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
+         daemon=True  # Set as daemon so it exits when main app exits
+     )
+     eval_thread.start()
+
+     final_status_msg = (
+         f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
+         "The leaderboard will auto-refresh (or use manual refresh)."
+     )
+     progress(1.0, desc="Background evaluation initiated.")
+     yield final_status_msg
+
+
+ # --- Create Directories ---
+ setup_directories()
+
+ # --- Gradio App Definition ---
+ with gr.Blocks(title="Background Submission, Evaluation, and Leaderboard") as demo:
+     gr.Markdown("# Background Submission, Evaluation & Results")
+     gr.Markdown(
+         f"Upload submissions (directories) to **'{SUBMISSIONS_DIR}'**. "
+         f"The evaluation script (`{EVAL_SCRIPT_PATH}`) will process them in the background. "
+         f"Results appear in **'{RESULTS_DIR}'**. The leaderboard auto-refreshes."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):  # Upload Column
+             gr.Markdown("## 📤 Upload & Evaluate Submission")
              upload_button = gr.UploadButton(
+                 "Click to Upload Directory for Evaluation",
                  file_count="directory",
              )
+             upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)
+
+         with gr.Column(scale=2):  # Leaderboard Column
+             gr.Markdown("## 🏆 Results Leaderboard")
+             leaderboard_df_component = gr.DataFrame(
+                 value=load_leaderboard_data,  # Load initial data
+                 label="Leaderboard (auto-refreshes)",
+                 interactive=False,
+                 # every=20  # Auto-refresh leaderboard data every 20 seconds
              )
+             refresh_leaderboard_button = gr.Button("🔄 Refresh Leaderboard Manually")
+
+     # --- Event Handlers ---
+     upload_button.upload(
+         fn=handle_upload_and_kickoff_eval,
+         inputs=[upload_button],
+         outputs=[upload_status_textbox],  # Only one output now for the status message
+         show_progress="full"
+     )
+
+     refresh_leaderboard_button.click(
+         fn=load_leaderboard_data,
+         inputs=None,
+         outputs=[leaderboard_df_component]
+     )
+
+ if __name__ == "__main__":
+     demo.queue().launch()
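A minimal sketch of driving the same pipeline without the UI, assuming a submission directory submissions/my_run already exists (the directory name is illustrative; this mirrors the subprocess call in run_evaluation_in_background and the Score parsing in load_leaderboard_data, and is not part of the commit itself):

# run_eval_locally.py -- illustrative sketch, mirrors app.py's background call
import subprocess
import sys
from pathlib import Path

submission = Path("submissions") / "my_run"   # hypothetical submission directory
results = Path("results")

# Same command the background thread builds: python eval.py <submission> <results>
proc = subprocess.run(
    [sys.executable, "eval.py", str(submission.resolve()), str(results.resolve())],
    capture_output=True, text=True, timeout=300,
)
print(proc.stdout)

# eval.py writes results/<submission>_result/summary.txt with a "Score: ..." line,
# which load_leaderboard_data() later surfaces on the leaderboard.
summary = results / f"{submission.name}_result" / "summary.txt"
if summary.exists():
    for line in summary.read_text().splitlines():
        if line.lower().startswith("score:"):
            print("Parsed score:", line.split(":", 1)[1].strip())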
 
 
backup_app_.py ADDED
@@ -0,0 +1,320 @@
+ import gradio as gr
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+ import shutil  # For file operations
+ from pathlib import Path  # For path manipulations
+
+
+ from src.about import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+ ### Space initialisation
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+     )
+ except Exception:
+     restart_space()
+
+
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+ def init_leaderboard(dataframe):
+     if dataframe is None or dataframe.empty:
+         raise ValueError("Leaderboard DataFrame is empty or None.")
+     return Leaderboard(
+         value=dataframe,
+         datatype=[c.type for c in fields(AutoEvalColumn)],
+         select_columns=SelectColumns(
+             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+             label="Select Columns to Display:",
+         ),
+         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+         filter_columns=[
+             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+             ColumnFilter(
+                 AutoEvalColumn.params.name,
+                 type="slider",
+                 min=0.01,
+                 max=150,
+                 label="Select the number of parameters (B)",
+             ),
+             ColumnFilter(
+                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+             ),
+         ],
+         bool_checkboxgroup_label="Hide models",
+         interactive=False,
+     )
+
+
+ # --- Function to handle the uploaded directory ---
+ def save_uploaded_models(files):
+     if files:
+         saved_paths = []
+         # 'files' will be a list of temporary file paths when file_count="directory"
+         # The actual files are in a temporary directory.
+         # We want to recreate the structure within UPLOAD_DIR.
+
+         # Assuming 'files' contains full paths to files within a single uploaded directory
+         # We need to determine the base name of the uploaded directory.
+         # Gradio often provides a list of file objects. Each object has a .name attribute (path).
+         # Example: if user uploads "my_run_1" containing "model.txt" and "config.json"
+         # files might be like: ['/tmp/gradio/somerandomhash/my_run_1/model.txt', '/tmp/gradio/somerandomhash/my_run_1/config.json']
+         # Or it might be a list of tempfile._TemporaryFileWrapper objects.
+
+         if not isinstance(files, list):
+             files = [files]  # Ensure it's a list
+
+         # Let's assume `files` is a list of `tempfile._TemporaryFileWrapper` or similar
+         # where `file_obj.name` gives the temporary path to each file.
+
+         # Get the common parent directory from the temporary paths if possible,
+         # or derive the uploaded folder name from one of the paths.
+         # This part can be tricky depending on exactly how Gradio passes directory uploads.
+         # A robust way is to create a unique sub-directory for each upload.
+
+         # Let's get the name of the directory the user uploaded.
+         # With file_count="directory", `files` is a list of file paths.
+         # We can infer the uploaded directory name from the first file path.
+         if files:
+             first_file_path = Path(files[0].name if hasattr(files[0], 'name') else files[0])
+             # The uploaded directory name would be the parent of the files if Gradio flattens it,
+             # or the parent of the temp directory housing the uploaded folder.
+             # For simplicity, let's try to get the original uploaded folder name.
+             # Gradio's `UploadButton` usually puts uploaded directories into a subdirectory
+             # within the temp space that has the same name as the original uploaded directory.
+             # e.g., if user uploads "my_models_run1", files might be in /tmp/somehash/my_models_run1/file1.txt
+
+             # A common approach: find the common prefix of all file paths,
+             # then determine the uploaded directory's name from that.
+             # However, Gradio's behavior is that `files` is a list of file objects,
+             # each with a `.name` attribute that is the full path to a temporary file.
+             # These temporary files are often placed inside a directory that *itself*
+             # represents the uploaded directory structure.
+
+             # Let's assume the user uploaded a directory named "user_uploaded_dir"
+             # And it contains "model1.txt" and "model2.txt"
+             # `files` might be `[<temp_file_obj_for_model1>, <temp_file_obj_for_model2>]`
+             # `files[0].name` might be `/tmp/gradio_guid/user_uploaded_dir/model1.txt`
+
+             # We need to extract "user_uploaded_dir"
+             # And then recreate this structure under UPLOAD_DIR.
+
+             # Assuming the first file gives us a good representation of the path structure.
+             temp_file_path = Path(files[0].name if hasattr(files[0], 'name') else files[0])
+             # The uploaded directory's name is usually the second to last part of the temp path
+             # e.g. /tmp/tmpxyz/uploaded_dir_name/file.txt -> "uploaded_dir_name"
+             uploaded_dir_name = temp_file_path.parent.name
+
+             destination_folder_path = Path(UPLOAD_DIR) / uploaded_dir_name
+             os.makedirs(destination_folder_path, exist_ok=True)
+
+             for uploaded_file_obj in files:
+                 # Get the path to the temporary file
+                 temp_path_str = uploaded_file_obj.name
+                 temp_path = Path(temp_path_str)
+
+                 # Get the original filename (relative to the uploaded directory)
+                 # This should be just the filename itself if Gradio preserves the structure
+                 # correctly inside the temp directory for the uploaded folder.
+                 original_filename = temp_path.name  # e.g., "model1.txt"
+
+                 destination_file_path = destination_folder_path / original_filename
+
+                 try:
+                     shutil.copy(temp_path_str, destination_file_path)
+                     saved_paths.append(str(destination_file_path))
+                 except Exception as e:
+                     print(f"Error copying {temp_path_str} to {destination_file_path}: {e}")
+                     return f"Error saving files: {e}"
+
+         if saved_paths:
+             return f"Successfully uploaded and saved models to: {destination_folder_path}"
+         else:
+             return "No files were saved."
+     return "No files uploaded."
+
+
+ # demo = gr.Blocks(css=custom_css)
+ demo = gr.Blocks()
+
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Simple Submit here!", elem_id="llm-benchmark-tab-table", id=4):
+             gr.Markdown(
+                 "## Submit your generated models here!",
+                 elem_classes="markdown-text",
+             )
+             upload_button = gr.UploadButton(
+                 label="Upload your generated models (only directories accepted)",
+                 size="lg",
+                 file_count="directory",
+                 elem_id="upload-button",
+             )
+             # Add an output component to display the result of the upload
+             upload_status = gr.Textbox(label="Upload Status", interactive=False)
+
+             # Connect the upload_button to the save_uploaded_models function
+             upload_button.upload(save_uploaded_models, upload_button, upload_status)
+
+
+         with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=None,
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             submit_button = gr.Button("Submit Eval")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 lines=20,
+                 elem_id="citation-button",
+                 show_copy_button=True,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+ demo.queue(default_concurrency_limit=40).launch()
eval.py ADDED
@@ -0,0 +1,356 @@
+ # eval.py
+ import sys
+ import os
+ import time
+ import json
+ import subprocess
+ import tempfile
+ from pathlib import Path
+ from datasets import load_dataset  # Hugging Face datasets library
+
+ # --- Configuration ---
+
+ DATASET_NAME = "kostis-init/CP-Bench"
+
+ # Column names in the Hugging Face dataset for problem identifier and model script
+ PROBLEM_NAME_COLUMN = "id"
+ MODEL_CODE_COLUMN = "model"
+
+ # Timeout for running individual model scripts (both generated and modified ground-truth)
+ SCRIPT_EXECUTION_TIMEOUT = 60  # seconds
+
+
+ def extract_json_from_string(text_output: str):
+     """
+     Attempts to find and parse the first valid JSON object or array from a string.
+     Handles cases where JSON is preceded or followed by non-JSON text.
+     """
+     idx = 0
+     while idx < len(text_output):
+         # Find the next potential start of a JSON structure
+         start_brace = text_output.find('{', idx)
+         start_bracket = text_output.find('[', idx)
+
+         if start_brace == -1 and start_bracket == -1:
+             # No more '{' or '[' found in the rest of the string
+             return None
+
+         # Determine the actual starting character for this attempt
+         if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
+             json_start_index = start_brace
+         else:
+             json_start_index = start_bracket
+
+         potential_json_segment = text_output[json_start_index:]
+
+         try:
+             # Use raw_decode to parse the first valid JSON object from the segment
+             decoder = json.JSONDecoder()
+             json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
+             # Successfully parsed a JSON object
+             return json_obj
+         except json.JSONDecodeError:
+             # This segment (starting at json_start_index) wasn't a valid JSON.
+             # Advance the search index past the character that caused the current attempt.
+             idx = json_start_index + 1
+
+     return None  # No valid JSON found in the entire string
+
+
+ def run_instance(instance_path_str: str,
+                  timeout: int = SCRIPT_EXECUTION_TIMEOUT):  # SCRIPT_EXECUTION_TIMEOUT should be defined
+     """Run the instance file and robustly capture the JSON output."""
+     command = [sys.executable, instance_path_str]
+     instance_name = Path(instance_path_str).name
+     try:
+         result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
+                                 errors='replace')
+
+         # Check return code first
+         if result.returncode != 0:
+             # Log stderr for debugging if the script itself failed
+             error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
+             print(f"  ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
+             return None
+
+         # Attempt to extract JSON from stdout
+         stdout_text = result.stdout
+         if not stdout_text or not stdout_text.strip():
+             print(f"  ERROR: No stdout from {instance_name}.", flush=True)
+             return None
+
+         solution = extract_json_from_string(stdout_text)
+
+         if solution is None:
+             # Be more verbose if JSON extraction fails
+             abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300]  # Show newlines as \n for brevity
+             print(
+                 f"  ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
+                 flush=True)
+             return None
+
+         return solution
+
+     except subprocess.TimeoutExpired:
+         print(f"  ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
+         return None
+     except Exception as e:
+         print(f"  ERROR: Unexpected error running {instance_name}: {e}", flush=True)
+         return None
+
+
+ def add_constraints_as_string(solution):
+     """Generate constraints as a string to be added to the original script."""
+     constraints = ""
+     if solution:  # Ensure solution is not None
+         for key, value in solution.items():
+             # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
+             if isinstance(value, str):
+                 constraints += f"\nmodel += ({key} == \"{value}\")"
+             else:
+                 constraints += f"\nmodel += ({key} == {value})"
+     return constraints
+
+
+ def get_modified_script(script_content, solution):
+     """Add constraints to the script content and self-consistency checks."""
+     constraints_str = add_constraints_as_string(solution)
+     modified_script = f"{script_content}\n{constraints_str}"
+     modified_script += """
+
+ # --- Self-consistency check appended by eval.py ---
+ # Print the absolute path of the current directory along with the script name
+ import os
+ # print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}")  # Optional debug
+
+ # Keep old objective
+ old_objective_value = None
+ objective_defined = False
+ if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
+     try:
+         # This block assumes 'model' is the CPMpy model object or similar
+         # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
+         # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
+         # This part might need adjustment based on the specific modeling library used in CP-Bench.
+         # For now, we'll try to get it and catch errors.
+         # A more robust way might be to inspect model.objective_
+         if hasattr(model, '_objective_value'):  # cpmpy specific check if objective was set
+             if model._objective_value is not None:  # cpmpy does not have objective_is_min
+                 objective_defined = True
+                 old_objective_value = model.objective_value()
+
+     except Exception as e_obj_check:
+         # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
+         pass  # Objective might not be set or model not solved yet.
+
+ # Check self-consistency
+ solved_ok = False
+ try:
+     if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
+         solved_ok = model.solve()
+     else:
+         print('ERROR: Model object not found or does not have a solve() method.')
+ except Exception as e_solve:
+     print(f'ERROR: Exception during model.solve(): {e_solve}')
+     solved_ok = False  # Ensure it's false on exception
+
+ if not solved_ok:
+     print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
+ else:
+     print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')
+
+ # Check if the objective value is the same
+ if not objective_defined:
+     print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
+ else:
+     try:
+         current_objective_value = model.objective_value()
+         # Handle potential floating point inaccuracies if objectives can be floats
+         if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
+             if abs(current_objective_value - old_objective_value) < 1e-6:  # Tolerance for float comparison
+                 print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
+             else:
+                 print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
+         elif current_objective_value != old_objective_value:  # Integer comparison
+             print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
+         else:
+             print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
+     except Exception as e_obj_final:
+         print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')
+
+ """
+     return modified_script
+
+
+ # --- Main Evaluation Logic ---
+ def main(submission_path_str: str, results_base_dir_str: str):
+     start_time = time.time()
+     print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
+     print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
+     print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)
+
+     submission_path = Path(submission_path_str)
+     submission_name = submission_path.name
+     result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
+     os.makedirs(result_dir_for_submission, exist_ok=True)
+     summary_file_path = result_dir_for_submission / "summary.txt"
+
+     # Load ground-truth dataset
+     try:
+         # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
+         gt_dataset = load_dataset(DATASET_NAME, split="train")
+         ground_truth_models = {
+             item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
+             for item in gt_dataset
+             if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
+         }
+         if not ground_truth_models:
+             raise ValueError(
+                 f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
+         print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
+     except Exception as e:
+         print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
+         with open(summary_file_path, "w") as f:
+             f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
+         return 1  # Indicate failure
+
+     # Statistics
+     total_submitted_models = 0
+     models_ran_successfully = 0
+     gt_models_found = 0
+     consistency_checks_passed = 0
+     objective_checks_passed = 0  # Includes "NO_OBJECTIVE_DEFINED" as a pass
+
+     with open(summary_file_path, "w") as summary_f:
+         summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
+         summary_f.write(
+             f"Ground-Truth Dataset: {DATASET_NAME}\n")
+         summary_f.write("-" * 30 + "\n")
+
+         submitted_model_files = list(submission_path.glob('*.py'))  # Assuming Python models
+         if not submitted_model_files:
+             summary_f.write("No .py model files found in submission.\n")
+             print("eval.py: No .py model files found in submission.", flush=True)
+             return 0  # No models to evaluate, but script ran.
+
+         for model_file_path in submitted_model_files:
+             total_submitted_models += 1
+             problem_name = model_file_path.stem  # Filename without .py extension
+             print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
+             summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
+
+             # 1. Run the submitted model to get its solution
+             summary_f.write("  1. Running submitted model...\n")
+             generated_solution = run_instance(str(model_file_path))
+             if generated_solution is None:
+                 summary_f.write("    - FAILED to run or get valid JSON solution from submitted model.\n")
+                 continue  # Move to the next model
+             models_ran_successfully += 1
+             summary_f.write(f"    - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")
+
+             # 2. Find corresponding ground-truth model
+             summary_f.write(f"  2. Checking against ground-truth for '{problem_name}'...\n")
+             if problem_name not in ground_truth_models:
+                 summary_f.write(f"    - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
+                 print(f"  WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
+                 continue
+             gt_models_found += 1
+             ground_truth_script_content = ground_truth_models[problem_name]
+             summary_f.write("    - SUCCESS: Found ground-truth model.\n")
+
+             # 3. Modify ground-truth script with solution and run self-consistency check
+             summary_f.write("  3. Performing self-consistency check on ground-truth model...\n")
+             modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
+
+             consistency_passed_this_model = False
+             objective_passed_this_model = False
+
+             try:
+                 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
+                     tmp_file.write(modified_gt_script)
+                     tmp_file_path_str = tmp_file.name
+
+                 # Run the modified ground-truth script
+                 gt_check_result = subprocess.run(
+                     [sys.executable, tmp_file_path_str],
+                     capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT
+                 )
+                 os.unlink(tmp_file_path_str)  # Clean up temp file
+
+                 # 4. Parse output of modified ground-truth
+                 gt_stdout = gt_check_result.stdout
+                 gt_stderr = gt_check_result.stderr
+                 # summary_f.write(f"    Modified GT STDOUT: {gt_stdout[:500]}...\n")  # For debugging
+                 if gt_stderr:
+                     summary_f.write(f"    Modified GT STDERR: {gt_stderr[:500]}...\n")
+
+                 if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
+                     summary_f.write("    - CONSISTENCY: PASSED\n")
+                     consistency_checks_passed += 1
+                     consistency_passed_this_model = True
+                 elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
+                     summary_f.write("    - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
+                 else:
+                     summary_f.write("    - CONSISTENCY: FAILED (Could not determine consistency from output)\n")
+
+                 if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
+                         "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
+                     summary_f.write("    - OBJECTIVE: PASSED (Consistent or no objective)\n")
+                     objective_checks_passed += 1
+                     objective_passed_this_model = True
+                 elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
+                     summary_f.write(f"    - OBJECTIVE: FAILED (Value changed)\n")
+                 elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
+                     summary_f.write(f"    - OBJECTIVE: FAILED (Error accessing final objective)\n")
+                 else:
+                     summary_f.write("    - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")
+
+             except subprocess.TimeoutExpired:
+                 summary_f.write(
+                     f"    - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
+                 print(f"  ERROR: Timeout running modified GT for {problem_name}", flush=True)
+             except Exception as e_gt_run:
+                 summary_f.write(
+                     f"    - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
+                 print(f"  ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
+
+         # Final statistics
+         summary_f.write("\n" + "=" * 30 + "\n")
+         summary_f.write("Overall Evaluation Statistics:\n")
+         summary_f.write(f"  Total Submitted Models Parsed: {total_submitted_models}\n")
+         summary_f.write(
+             f"  Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
+         summary_f.write(
+             f"  Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
+         summary_f.write(f"  Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
+         summary_f.write(f"  Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
+
+         # Define an overall score, e.g. number of models that passed both checks against found GT
+         fully_passed_models = 0
+         # This needs re-evaluation logic, but for now let's say a score is consistency+objective passes
+         # This simple score is just the sum of passes, could be more nuanced
+         overall_score = consistency_checks_passed + objective_checks_passed
+         summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n")  # For Gradio app to parse
+
+     elapsed_time = time.time() - start_time
+     print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
+     print(f"eval.py: Summary written to {summary_file_path}", flush=True)
+     return 0  # Success
+
+
+ if __name__ == "__main__":
+     if len(sys.argv) < 3:
+         print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
+         print("Example: python eval.py ./submissions/my_run ./results")
+         sys.exit(1)
+
+     submission_dir = sys.argv[1]
+     results_base_dir = sys.argv[2]
+
+     # Simple check if submission_dir exists
+     if not Path(submission_dir).is_dir():
+         print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
+         sys.exit(1)
+
+     exit_code = main(submission_dir, results_base_dir)
+     sys.exit(exit_code)
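For context, eval.py treats each *.py file in the submission directory as a standalone model: it must print its solution as a single JSON object on stdout (extract_json_from_string picks up the first one), and the keys must match variable names in the corresponding CP-Bench ground-truth model so the appended `model += (name == value)` constraints are well formed, with the filename stem matching the dataset's `id` column. A minimal sketch of such a file, assuming CPMpy (added to requirements.txt below); the problem id and variables are illustrative, not from the commit:

# submissions/my_run/my_problem.py -- illustrative submission sketch.
# The filename stem ("my_problem") must match the `id` column in kostis-init/CP-Bench.
import json

from cpmpy import Model, intvar

# Toy decision variables; a real submission models the CP-Bench problem of the same id.
x = intvar(0, 10, name="x")
y = intvar(0, 10, name="y")

model = Model(x + y <= 10)
model.maximize(x + 2 * y)

if model.solve():
    # eval.py extracts the first JSON object from stdout and turns each key/value
    # pair into a `model += (key == value)` constraint on the ground-truth model.
    print(json.dumps({"x": int(x.value()), "y": int(y.value())}))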
requirements.txt CHANGED
@@ -13,4 +13,5 @@ python-dateutil
  tqdm
  transformers
  tokenizers>=0.15.0
- sentencepiece
+ sentencepiece
+ cpmpy