import gradio as gr
import pandas as pd
import os
import shutil
from pathlib import Path
import subprocess  # For running eval.py
import time
import threading  # For background tasks
import sys

# --- Configuration ---
SUBMISSIONS_DIR = "submissions"
RESULTS_DIR = "results"
EVAL_SCRIPT_PATH = "eval.py"
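
# The evaluation script is invoked below as:
#     <python interpreter> eval.py <submission_dir> <results_dir>
# and load_leaderboard_data() expects each result directory it produces to contain
# a summary.txt with a "Score: <value>" line. The real eval.py is not part of this
# file; a minimal, hypothetical stand-in that satisfies this contract (a sketch for
# local testing only, with assumed names and output layout) could look like:
#
#     import sys
#     import time
#     from pathlib import Path
#
#     def main():
#         submission_dir, results_dir = sys.argv[1], sys.argv[2]
#         out_dir = Path(results_dir) / f"{Path(submission_dir).name}_{int(time.time())}"
#         out_dir.mkdir(parents=True, exist_ok=True)
#         n_files = len([p for p in Path(submission_dir).iterdir() if p.is_file()])
#         (out_dir / "summary.txt").write_text(f"Evaluated {n_files} file(s).\nScore: 0\n")
#
#     if __name__ == "__main__":
#         main()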


# --- Helper Functions ---

def setup_directories():
    """Creates the submissions and results directories if they don't exist."""
    os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)
    if not os.listdir(RESULTS_DIR):  # Add a placeholder if results is empty
        initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
        if not initial_result_demo_path.exists():
            os.makedirs(initial_result_demo_path, exist_ok=True)
            with open(initial_result_demo_path / "summary.txt", "w") as f:
                f.write("This is a placeholder initial result.\nScore: 0\n")
            print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")


def load_leaderboard_data():
    """
    Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
    Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
    """
    if not os.path.exists(RESULTS_DIR):
        return pd.DataFrame(columns=["Result Directory", "Score", "Files"])

    result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]

    leaderboard_entries = []
    # Sort result directories by modification time (newest first).
    sorted_result_dirs = sorted(
        result_dirs,
        key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
        reverse=True
    )

    for dir_name in sorted_result_dirs:
        entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
        result_dir_path = Path(RESULTS_DIR) / dir_name

        try:
            entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
        except Exception:
            pass  # Directory might have been removed during scan

        summary_file = result_dir_path / "summary.txt"
        if summary_file.exists():
            try:
                with open(summary_file, "r") as f:
                    for line in f:
                        if line.lower().startswith("score:"):
                            entry["Score"] = line.split(":", 1)[1].strip()
                            break
            except Exception as e:
                print(f"Error parsing summary for {dir_name}: {e}")

        leaderboard_entries.append(entry)

    if not leaderboard_entries:
        return pd.DataFrame(columns=["Result Directory", "Score", "Files"])

    return pd.DataFrame(leaderboard_entries)


def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
    """
    This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
    Outputs from eval.py will go to the console where app.py is running.
    """
    print(
        f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")

    if not Path(EVAL_SCRIPT_PATH).exists():
        print(
            f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
        return

    command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]

    try:
        # Using subprocess.run which is simpler for blocking calls within this thread
        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=False,  # Handle non-zero exit codes manually
            timeout=300  # 5-minute timeout for the evaluation script
        )

        eval_output = process.stdout.strip()
        eval_error = process.stderr.strip()

        print(
            f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
        if eval_error:  # Only print stderr if it's not empty
            print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")

        if process.returncode == 0:
            print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
        else:
            print(
                f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")

    except subprocess.TimeoutExpired:
        print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
    except FileNotFoundError:  # The interpreter or the eval script path could not be resolved by subprocess
        print(
            f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure '{sys.executable}' and '{EVAL_SCRIPT_PATH}' are valid paths for '{submission_name_for_log}'.")
    except Exception as e:
        print(
            f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")

    print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")


def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
    """
    Handles directory upload, saves files, and starts eval.py in a background thread.
    Yields a status message for the UI. The leaderboard updates separately.
    """
    yield "Processing upload..."  # Initial status

    if not uploaded_files_list:
        yield "No directory uploaded. Please select a directory."
        return

    try:
        # Determine original uploaded directory name
        first_temp_file_path = Path(uploaded_files_list[0].name)
        original_uploaded_dir_name = first_temp_file_path.parent.name

        submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name

        # Handle potential name collision
        if submission_dir_path.exists():
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
            submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
            status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
            original_uploaded_dir_name = descriptive_name_for_log_and_status  # Use new name for logging
        else:
            descriptive_name_for_log_and_status = original_uploaded_dir_name
            status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."

        os.makedirs(submission_dir_path, exist_ok=True)
        progress(0.1, desc=status_update_msg)

        # Note: files are copied flat by filename into the submission directory;
        # any nested subdirectory structure from the upload is not preserved.
        for temp_file_obj in progress.tqdm(uploaded_files_list, desc="Copying files"):
            temp_file_path = Path(temp_file_obj.name)
            file_name_in_dir = temp_file_path.name
            target_file_path = submission_dir_path / file_name_in_dir
            shutil.copy(str(temp_file_path), str(target_file_path))

        upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
        progress(0.8, desc=upload_completion_msg)

    except Exception as e:
        yield f"Error during upload: {str(e)}"
        return

    # --- Start evaluation in a background thread ---
    if not Path(EVAL_SCRIPT_PATH).exists():
        yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
        return

    # Pass absolute paths (as strings) to the background thread so they don't depend on the working directory.
    abs_submission_path = str(submission_dir_path.resolve())
    abs_results_path = str(Path(RESULTS_DIR).resolve())

    eval_thread = threading.Thread(
        target=run_evaluation_in_background,
        args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
        daemon=True  # Set as daemon so it exits when main app exits
    )
    eval_thread.start()

    final_status_msg = (
        f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
        "The leaderboard will auto-refresh (or use manual refresh)."
    )
    progress(1.0, desc="Background evaluation initiated.")
    yield final_status_msg


# --- Create Directories ---
setup_directories()

# --- Gradio App Definition ---
with gr.Blocks(title="Background Submission, Evaluation, and Leaderboard") as demo:
    gr.Markdown("# Background Submission, Evaluation & Results")
    gr.Markdown(
        f"Upload submissions (directories) to **'{SUBMISSIONS_DIR}'**. "
        f"The evaluation script (`{EVAL_SCRIPT_PATH}`) will process them in the background. "
        f"Results appear in **'{RESULTS_DIR}'**. The leaderboard auto-refreshes."
    )

    with gr.Row():
        with gr.Column(scale=1):  # Upload Column
            gr.Markdown("## πŸ“€ Upload & Evaluate Submission")
            upload_button = gr.UploadButton(
                "Click to Upload Directory for Evaluation",
                file_count="directory",
            )
            upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)

        with gr.Column(scale=2):  # Leaderboard Column
            gr.Markdown("## πŸ† Results Leaderboard")
            leaderboard_df_component = gr.DataFrame(
                value=load_leaderboard_data,  # Load initial data
                label="Leaderboard (auto-refreshes)",
                interactive=False,
                every=20,  # Auto-refresh leaderboard data every 20 seconds (needs the queue enabled at launch)
            )
            refresh_leaderboard_button = gr.Button("πŸ”„ Refresh Leaderboard Manually")

    # --- Event Handlers ---
    upload_button.upload(
        fn=handle_upload_and_kickoff_eval,
        inputs=[upload_button],
        outputs=[upload_status_textbox],  # Single output: the status message
        show_progress="full"
    )

    refresh_leaderboard_button.click(
        fn=load_leaderboard_data,
        inputs=None,
        outputs=[leaderboard_df_component]
    )

if __name__ == "__main__":
    demo.queue().launch()
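
# To run this app locally (assuming Gradio and pandas are installed, e.g. via
# `pip install gradio pandas`): run `python app.py` and open the local URL that
# Gradio prints to the console.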