File size: 16,935 Bytes
22cc60c
 
 
b41aa3c
1007e19
 
 
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
1007e19
7b8bbf4
 
 
 
 
 
 
 
 
 
 
1007e19
 
b41aa3c
f364096
7a7ae1e
1007e19
 
f364096
7b8bbf4
 
 
 
 
 
 
 
b41aa3c
f364096
 
172585c
f364096
 
 
7a7ae1e
f364096
 
 
 
 
 
 
 
 
b41aa3c
 
 
 
f364096
7a7ae1e
 
f364096
 
 
 
 
7b8bbf4
 
 
 
 
7a7ae1e
 
 
7b8bbf4
 
 
7a7ae1e
 
 
172585c
7a7ae1e
 
 
 
 
 
 
 
 
 
172585c
 
7a7ae1e
 
 
 
f364096
 
 
 
7b8bbf4
 
 
172585c
7b8bbf4
 
7a7ae1e
172585c
 
7b8bbf4
b41aa3c
 
 
172585c
b41aa3c
 
 
 
 
 
 
 
22cc60c
1007e19
b41aa3c
1007e19
b41aa3c
7b8bbf4
b41aa3c
7b8bbf4
 
 
 
 
 
 
 
e842409
7b8bbf4
 
 
 
 
e842409
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a7ae1e
172585c
 
b41aa3c
22cc60c
1007e19
172585c
7b8bbf4
1007e19
22cc60c
 
 
1f8bbc4
b41aa3c
172585c
b41aa3c
 
172585c
 
b41aa3c
 
 
f364096
 
7b8bbf4
7a7ae1e
7b8bbf4
7a7ae1e
b41aa3c
1007e19
 
7b8bbf4
7a7ae1e
7b8bbf4
1007e19
b41aa3c
 
 
 
 
 
 
22cc60c
172585c
7b8bbf4
22cc60c
 
7a7ae1e
1007e19
7a7ae1e
172585c
 
1007e19
172585c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007e19
172585c
1007e19
172585c
 
 
 
 
1007e19
 
172585c
1007e19
172585c
 
 
 
7a7ae1e
 
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
22cc60c
b41aa3c
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b41aa3c
 
172585c
7a7ae1e
172585c
1007e19
7b8bbf4
7a7ae1e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
# Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard

# --- Make sure these imports work relative to your file structure ---
# Option 1: If src is a directory in the same folder as your script:
try:
    # Preferred path: pull display text, CSS, and config from the project's
    # `src` package when it is importable.
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        EVALUATION_QUEUE_TEXT, # Keep if used by commented-out submit tab
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,
    )
    from src.display.css_html_js import custom_css # Assuming this exists but might be empty
    from src.envs import REPO_ID # Keep if needed for restart_space or other functions
    from src.submission.submit import add_new_eval # Keep if using the submit tab
    print("Successfully imported from src module.")
# Option 2: If you don't have these files, define placeholders (REMOVE THIS if using Option 1)
except ImportError:
    # Fallback path: `src` is absent (e.g. this file is run standalone), so
    # define stand-in values that keep the app runnable end-to-end.
    print("Warning: Using placeholder values because src module imports failed.")
    CITATION_BUTTON_LABEL="Citation"
    CITATION_BUTTON_TEXT="Please cite us if you use this benchmark..."
    EVALUATION_QUEUE_TEXT="Current evaluation queue:"
    INTRODUCTION_TEXT="Welcome to the MLE-Dojo Benchmark Leaderboard."
    LLM_BENCHMARKS_TEXT="Information about the benchmarks..."
    TITLE="<h1>πŸ† MLE-Dojo Benchmark Leaderboard</h1>"
    custom_css="" # Start with empty CSS if not imported
    REPO_ID="your/space-id" # Replace with actual ID if needed
    # Placeholder submit handler so the (optional) Submit tab can still wire up.
    def add_new_eval(*args): return "Submission placeholder."
# --- End Placeholder Definitions ---


# --- Elo Leaderboard Configuration ---
# Enhanced data with Rank (placeholder), Organizer, License, and URL
# !!! IMPORTANT: Replace placeholder URLs with actual model/project pages. !!!
# Verify organizer and license information for accuracy.
data = [
{'model_name': 'gpt-4o-mini', 'url': 'https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
{'model_name': 'gpt-4o', 'url': 'https://openai.com/index/hello-gpt-4o/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
{'model_name': 'o3-mini', 'url': 'https://openai.com/index/openai-o3-mini/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096}, # Fill details later
{'model_name': 'deepseek-v3', 'url': 'https://api-docs.deepseek.com/news/news1226', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
{'model_name': 'deepseek-r1', 'url': 'https://api-docs.deepseek.com/news/news250120', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
{'model_name': 'gemini-2.0-flash', 'url': 'https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
{'model_name': 'gemini-2.0-pro', 'url': 'https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
{'model_name': 'gemini-2.5-pro', 'url': 'https://deepmind.google/technologies/gemini/pro/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
]

# Create a master DataFrame holding every model's scores; `update_leaderboard`
# slices/sorts views of this frame per selected category.
# Note: Columns 'organizer' and 'license' are created in lowercase here.
master_df = pd.DataFrame(data)

# Define categories for selection (user-facing radio-button choices)
CATEGORIES = ["Overall", "MLE-Lite", "Tabular", "NLP", "CV"] # Overall first
DEFAULT_CATEGORY = "Overall" # Set a default category

# Map user-facing categories to DataFrame column names; every value here
# must be an actual column of `master_df`.
category_to_column = {
    "MLE-Lite": "MLE-Lite_Elo",
    "Tabular": "Tabular_Elo",
    "NLP": "NLP_Elo",
    "CV": "CV_Elo",
    "Overall": "Overall"
}

# --- Helper function to update leaderboard ---
def update_leaderboard(category):
    """Return the leaderboard DataFrame for one category.

    Resolves *category* to an Elo column of ``master_df``, sorts descending,
    assigns 1-based ranks, renders the model name as an HTML link, and
    returns a frame with columns: Rank, Model, Organizer, License, Elo Score.
    Unknown categories fall back to ``DEFAULT_CATEGORY``.
    """
    elo_col = category_to_column.get(category)

    # Guard: unknown category (or stale mapping) -> fall back to the default.
    if elo_col is None or elo_col not in master_df.columns:
        print(f"Warning: Invalid category '{category}' or column '{elo_col}'. Falling back to default.")
        elo_col = category_to_column[DEFAULT_CATEGORY]
        if elo_col not in master_df.columns:
            # Even the default is missing: return an empty frame with the
            # display schema so the UI component still renders.
            print(f"Error: Default column '{elo_col}' also not found.")
            return pd.DataFrame({
                "Rank": [],
                "Model": [],
                "Elo Score": [],
                "Organizer": [],
                "License": []
            })

    # Slice out the display columns (lowercase in master_df) plus the score
    # column, then sort best-first and re-index so ranks line up.
    ranked = (
        master_df[['model_name', 'url', 'organizer', 'license', elo_col]]
        .copy()
        .sort_values(by=elo_col, ascending=False)
        .reset_index(drop=True)
    )
    ranked.insert(0, 'Rank', ranked.index + 1)

    def _as_link(row):
        # Missing URLs degrade to a harmless '#' anchor.
        href = row['url'] if pd.notna(row['url']) else '#'
        return f"<a href='{href}' target='_blank' style='color: #007bff; text-decoration: none;'>{row['model_name']}</a>"

    ranked['Model'] = ranked.apply(_as_link, axis=1)

    # Normalize headers for display: score column -> 'Elo Score', and the
    # lowercase organizer/license columns -> capitalized labels.
    ranked = ranked.rename(columns={elo_col: 'Elo Score',
                                    'organizer': 'Organizer',
                                    'license': 'License'})

    # Final column order matches the gr.Dataframe headers.
    return ranked[["Rank", "Model", "Organizer", "License", "Elo Score"]]

# --- Mock/Placeholder functions/data for other tabs ---
# (If the Submit tab is used, ensure these variables are appropriately populated or handled)
print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")

# Single source of truth for the queue tables' schema. The three empty queue
# DataFrames below are built from EVAL_COLS so the gr.Dataframe headers can
# never drift out of sync with the DataFrame columns (previously the same
# column list was hand-duplicated four times).
EVAL_COLS = ["Model", "Status", "Requested", "Started"]  # Define for the dataframe headers
EVAL_TYPES = ["str", "str", "str", "str"]  # Define for the dataframe types

# Empty placeholder queues: finished / running / pending evaluations.
finished_eval_queue_df = pd.DataFrame(columns=EVAL_COLS)
running_eval_queue_df = pd.DataFrame(columns=EVAL_COLS)
pending_eval_queue_df = pd.DataFrame(columns=EVAL_COLS)

# --- Keep restart function if relevant ---
def restart_space():
    # Make sure REPO_ID is correctly defined/imported if this function is used
    print(f"Attempting to restart space: {REPO_ID}")
    # Replace with your actual space restart mechanism if needed (e.g., HfApi().restart_space(REPO_ID))


# --- Gradio App Definition ---

# ***** FONT SIZE INCREASED HERE *****
# Add CSS rules to make the base font size larger.
# Adjust the '1.2em' value (e.g., to '1.4em', '16px') to change the size.
# The !important flag helps override theme defaults.
# If the imported custom_css already has content, append to it.
font_size_css = """
body {
    font-size: 1.5em !important; /* Increase base font size */
}
/* Optional: Target specific elements if needed */
/*
#leaderboard-table th, #leaderboard-table td {
    font-size: 1em !important; /* Adjust table font size relative to new body size */
    padding: 5px 7px !important; /* Increase padding for better spacing */
}
h1, .markdown-text h1 { font-size: 2.2em !important; } /* Make main title larger */
h2, .markdown-text h2 { font-size: 1.8em !important; } /* Make section titles larger */
button { font-size: 1.1em !important; padding: 8px 16px !important; } /* Slightly larger buttons */
.gr-input, .gr-dropdown, .gr-textbox textarea { font-size: 1em !important; } /* Ensure inputs scale too */
*/
"""
# Append the new CSS to any existing custom_css (imported from src, or the
# empty placeholder string when the src import failed).
custom_css += font_size_css

# Add basic table styling only if the imported CSS doesn't already define a
# `table {` rule, to avoid clobbering project-provided table styles.
if "table {" not in custom_css:
    custom_css += """
table { width: 100%; border-collapse: collapse; margin-top: 10px; margin-bottom: 10px; }
th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; white-space: normal; vertical-align: top; } /* Allow wrapping, top align */
th { background-color: #f2f2f2; font-weight: bold; }
tr:nth-child(even) { background-color: #f9f9f9; }
tr:hover { background-color: #e9e9e9; }
td a { color: #007bff; text-decoration: none; }
td a:hover { text-decoration: underline; }
"""

# Use a theme for better default styling
demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft())

# Build the UI inside the Blocks context: title, intro, a tab with the
# category selector + leaderboard table, an About tab, and a citation
# accordion. The (optional) Submit tab is kept commented out below.
with demo:
    # Use the TITLE variable imported or defined above
    gr.HTML(TITLE)

    # Use the INTRODUCTION_TEXT variable imported or defined above
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… MLE-Dojo Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Column():
                gr.Markdown("## Model Elo Rankings by Category")
                # Radio selector drives which Elo column the table is sorted by.
                category_selector = gr.Radio(
                    choices=CATEGORIES,
                    label="Select Category:",
                    value=DEFAULT_CATEGORY,
                    interactive=True,
                )
                leaderboard_df_component = gr.Dataframe(
                    # Initialize with sorted data for the default category
                    value=update_leaderboard(DEFAULT_CATEGORY),
                    # Headers for DISPLAY should match the *renamed* columns from update_leaderboard
                    headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
                    # Datatype maps to the final df columns: Rank, Model, Organizer, License, Elo Score
                    # ('html' lets the Model column render its <a> link).
                    datatype=["number", "html", "str", "str", "number"],
                    interactive=False,
                    # --- FIX APPLIED: Removed unsupported 'height' argument ---
                    # row_count determines the number of rows to display
                    row_count=(len(master_df), "fixed"), # Display all rows
                    col_count=(5, "fixed"),
                    wrap=True, # Allow text wrapping in cells
                    elem_id="leaderboard-table" # CSS hook for custom styling
                )
                # Link the radio button change to the update function
                category_selector.change(
                    fn=update_leaderboard,
                    inputs=category_selector,
                    outputs=leaderboard_df_component
                )

        with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-about", id=1):
            # Use the LLM_BENCHMARKS_TEXT variable imported or defined above
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # --- Submit Tab (Commented out as in original request) ---
        # Make sure EVALUATION_QUEUE_TEXT and add_new_eval are imported/defined if uncommented
        # with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-submit", id=2):
        #     with gr.Column():
        #          with gr.Row():
        #              gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") # Requires import/definition
        #          with gr.Column():
        #              with gr.Accordion(f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
        #                   finished_eval_table = gr.components.Dataframe(
        #                       value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                  )
        #              with gr.Accordion(f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
        #                   running_eval_table = gr.components.Dataframe(
        #                       value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                  )
        #              with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
        #                  pending_eval_table = gr.components.Dataframe(
        #                      value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                  )
        #     with gr.Row():
        #          gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
        #     with gr.Row():
        #          with gr.Column():
        #              model_name_textbox = gr.Textbox(label="Model name (on Hugging Face Hub)")
        #              revision_name_textbox = gr.Textbox(label="Revision / Commit Hash", placeholder="main")
        #              model_type = gr.Dropdown(choices=["Type A", "Type B", "Type C"], label="Model type", multiselect=False, value=None, interactive=True) # Example choices
        #          with gr.Column():
        #              precision = gr.Dropdown(choices=["float16", "bfloat16", "float32", "int8", "auto"], label="Precision", multiselect=False, value="auto", interactive=True)
        #              weight_type = gr.Dropdown(choices=["Original", "Adapter", "Delta"], label="Weights type", multiselect=False, value="Original", interactive=True)
        #              base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
        #     submit_button = gr.Button("Submit Eval")
        #     submission_result = gr.Markdown()
        #     # Ensure add_new_eval is correctly imported/defined and handles these inputs
        #     submit_button.click(
        #          add_new_eval, # Requires import/definition
        #          [ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, ],
        #          submission_result,
        #      )

    # --- Citation Row (at the bottom, outside Tabs) ---
    with gr.Accordion("πŸ“™ Citation", open=False):
        # Use the CITATION_BUTTON_TEXT and CITATION_BUTTON_LABEL variables imported or defined above
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=10, # Adjust lines if needed for new font size
            elem_id="citation-button",
            show_copy_button=True,
        )


# --- Keep scheduler if relevant ---
# Only start scheduler if the script is run directly.
# Schedules a periodic self-restart of the Space (every 30 minutes) when a
# real REPO_ID is configured; otherwise logs a warning and skips scheduling.
if __name__ == "__main__":
    try:
        scheduler = BackgroundScheduler()
        # Add job only if restart_space is callable (i.e., not a placeholder or failed import)
        if callable(restart_space):
             # Check if REPO_ID seems valid before scheduling
             # ("your/space-id" is the placeholder from the import fallback).
             if REPO_ID and REPO_ID != "your/space-id":
                 scheduler.add_job(restart_space, "interval", seconds=1800) # Restart every 30 mins
                 scheduler.start()
             else:
                 print("Warning: REPO_ID not set or is placeholder; space restart job not scheduled.")
        else:
             print("Warning: restart_space function not available; space restart job not scheduled.")
    except Exception as e:
        # Scheduler failure is non-fatal: the app still launches below.
        print(f"Failed to initialize or start scheduler: {e}")


# --- Launch the app ---
# Ensures the app launches only when the script is run directly
if __name__ == "__main__":
    # Ensure you have installed necessary libraries: pip install gradio pandas apscheduler
    # Make sure your src module files (about.py etc.) are accessible OR use the placeholder definitions above.
    print("Launching Gradio App...")
    # demo.launch() blocks here and serves the UI until interrupted.
    demo.launch()