Leaderboard / app.py
Jerrycool's picture
Update app.py
c13e962 verified
raw
history blame
9.22 kB
import html

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
"""
MLE‑Dojo Benchmark Leaderboard — Dark Elegance Edition
=====================================================
A refined, low‑glare UI with larger table text, richer surface layering, and a
subtle neon accent that pops on dark slate backgrounds.
HOW TO
------
* `pip install gradio pandas apscheduler`
* `python app.py`
* Replace placeholder copy (TITLE …) with your own or keep the try/except.
"""
# ---------------------------------------------------------------------------
# Import app copy (falls back to placeholders if src/ is absent)
# ---------------------------------------------------------------------------
try:
    # Project-provided copy, extra CSS, Space id and submission handler.
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,
    )
    from src.display.css_html_js import custom_css  # optional extra rules
    from src.envs import REPO_ID
    from src.submission.submit import add_new_eval
except ImportError:
    # Any failing import above lands here, so every name is replaced by its
    # stand-in together and the app can still start without the src/ package.
    TITLE = (
        "<h1 class='hero-title gradient-text'>🏆 MLE‑Dojo Benchmark Leaderboard</h1>"
        "<p class='subtitle'>Interactive, reproducible &amp; community‑driven ML‑agent benchmarking</p>"
    )
    INTRODUCTION_TEXT = "Welcome to the **MLE‑Dojo Benchmark Leaderboard** — compare LLM agents across realistic ML engineering tasks."
    LLM_BENCHMARKS_TEXT = "Further details about tasks, metrics and evaluation pipelines."
    CITATION_BUTTON_LABEL = "Citation"
    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark…"

    # No extra stylesheet rules; the dark-theme CSS is appended further down.
    custom_css = ""

    # Sentinel value: the __main__ section only starts the scheduler when
    # REPO_ID differs from this placeholder.
    REPO_ID = "your/space-id"

    def add_new_eval(*_):
        """Stand-in submission handler: ignore all arguments, return a fixed notice."""
        return "Submission placeholder."
# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# Field names, in the order the values appear in each row tuple below.
_FIELDS = (
    "model_name",
    "url",
    "organizer",
    "license",
    "MLE-Lite_Elo",
    "Tabular_Elo",
    "NLP_Elo",
    "CV_Elo",
    "Overall",
)

# One tuple per model: name, reference URL, organizer, license, then the four
# per-category Elo values followed by the "Overall" value.
_ROWS = (
    ("gpt-4o-mini", "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/", "OpenAI", "Proprietary", 753, 839, 758, 754, 778),
    ("gpt-4o", "https://openai.com/index/hello-gpt-4o/", "OpenAI", "Proprietary", 830, 861, 903, 761, 841),
    ("o3-mini", "https://openai.com/index/openai-o3-mini/", "OpenAI", "Proprietary", 1108, 1019, 1056, 1207, 1096),
    ("deepseek-v3", "https://api-docs.deepseek.com/news/news1226", "DeepSeek", "DeepSeek", 1004, 1015, 1028, 1067, 1023),
    ("deepseek-r1", "https://api-docs.deepseek.com/news/news250120", "DeepSeek", "DeepSeek", 1137, 1053, 1103, 1083, 1100),
    ("gemini-2.0-flash", "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash", "Google", "Proprietary", 847, 923, 860, 978, 895),
    ("gemini-2.0-pro", "https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/", "Google", "Proprietary", 1064, 1139, 1028, 973, 1054),
    ("gemini-2.5-pro", "https://deepmind.google/technologies/gemini/pro/", "Google", "Proprietary", 1257, 1150, 1266, 1177, 1214),
)

# Same list-of-dicts shape as before; key order follows _FIELDS.
data = [dict(zip(_FIELDS, row)) for row in _ROWS]

# Single source table that the leaderboard view is sliced and sorted from.
master_df = pd.DataFrame(data)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# Category label shown in the UI -> master_df column holding that category's score.
CATEGORY_MAP = dict(
    [
        ("Overall", "Overall"),
        ("MLE-Lite", "MLE-Lite_Elo"),
        ("Tabular", "Tabular_Elo"),
        ("NLP", "NLP_Elo"),
        ("CV", "CV_Elo"),
    ]
)

# Selectable labels, in the same order as the mapping above.
CATEGORIES = list(CATEGORY_MAP)

# Category used for the initial view and as the fallback for unknown labels.
DEFAULT_CATEGORY = "Overall"
def update_leaderboard(category: str, ascending: bool) -> pd.DataFrame:
    """Build the ranked leaderboard table for one category.

    Args:
        category: A label from ``CATEGORY_MAP``. Unknown labels fall back to
            the column of ``DEFAULT_CATEGORY``.
        ascending: Sort direction for the score column; ``False`` puts the
            highest score at rank 1.

    Returns:
        A new DataFrame with the columns ``Rank``, ``Model``, ``Organizer``,
        ``License`` and ``Elo Score``. ``Model`` holds an HTML anchor linking
        the model name to its URL. ``master_df`` itself is not modified.
    """
    col = CATEGORY_MAP.get(category, CATEGORY_MAP[DEFAULT_CATEGORY])
    ranked = (
        master_df[["model_name", "url", "organizer", "license", col]]
        .sort_values(by=col, ascending=ascending)
        .reset_index(drop=True)
    )
    # The index is 0..n-1 after the reset, so rank is simply index + 1.
    ranked.insert(0, "Rank", ranked.index + 1)
    # Escape both values: the URL sits inside a single-quoted attribute and
    # the name is rendered as HTML text, so quotes, '<' or '&' in either would
    # otherwise break the markup. A plain list also works for an empty table,
    # where a row-wise apply() would not yield a single column.
    ranked["Model"] = [
        f"<a href='{html.escape(str(url))}' target='_blank'>{html.escape(str(name))}</a>"
        for url, name in zip(ranked["url"], ranked["model_name"])
    ]
    ranked = ranked.rename(
        columns={"organizer": "Organizer", "license": "License", col: "Elo Score"}
    )
    return ranked[["Rank", "Model", "Organizer", "License", "Elo Score"]]
# ---------------------------------------------------------------------------
# Dark‑mode CSS & Larger Table Fonts
# ---------------------------------------------------------------------------
# Append the dark-theme rules to whatever custom_css already holds (the rules
# imported from src.display.css_html_js, or the empty placeholder string).
# The combined string is handed to gr.Blocks(css=...) below.
# NOTE(review): CSS only honors @import ahead of other style rules; if the
# imported custom_css is non-empty the font import below may be ignored —
# confirm in the rendered page.
custom_css += """
/* ————— Core Typography ————— */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');
html,body {
font-family: 'Inter', sans-serif !important;
font-size: 18px !important;
line-height: 1.55;
color: #e2e8f0;
background:#0f172a;
}
/* Accent gradient for titles */
.gradient-text {
background:linear-gradient(90deg,#6366f1 0%,#06b6d4 100%);
-webkit-background-clip:text; -webkit-text-fill-color:transparent;
}
/* Markdown headings */
.markdown-text h2{font-weight:600;margin-top:1.3em;color:#f1f5f9;}
/* Radio & checkbox containers */
.gr-radio, .gr-checkbox{background:#1e293b;border-radius:8px;padding:6px 12px;box-shadow:0 1px 3px rgba(0,0,0,.4);}
.gr-radio input:checked+label, .gr-checkbox input:checked+label{color:#38bdf8;}
/* Table Styling */
#leaderboard-table table{width:100%;border-collapse:collapse;background:#1e293b;border-radius:8px;overflow:hidden;}
#leaderboard-table th{background:#334155;font-size:0.9rem;font-weight:600;padding:0.7em;color:#f1f5f9;text-transform:uppercase;letter-spacing:.04em;}
#leaderboard-table td{padding:0.6em;font-size:1.05rem;border-top:1px solid #334155;}
#leaderboard-table tr:nth-child(even){background:#1c2431;}
#leaderboard-table tr:hover{background:#475569;}
/* Links */
a{color:#38bdf8;} a:hover{text-decoration:underline;}
/* Accordion */
.gr-accordion .label{font-weight:600;font-size:1rem;color:#f1f5f9;}
"""
# ---------------------------------------------------------------------------
# Gradio App
# ---------------------------------------------------------------------------
# Soft theme with sky/slate hues; the stylesheet assembled above is applied on top.
_theme = gr.themes.Soft(primary_hue="sky", neutral_hue="slate", font=["Inter",])
app = gr.Blocks(css=custom_css, theme=_theme)

with app:
    # Page header and introduction.
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs():
        with gr.TabItem("🏅 Leaderboard"):
            gr.Markdown("### Model Elo Rankings by Category", elem_classes="markdown-text")

            # Controls: which score column to rank by, and the sort direction.
            with gr.Row():
                category_radio = gr.Radio(CATEGORIES, value=DEFAULT_CATEGORY, label="Category")
                asc_check = gr.Checkbox(label="⬆️ Ascending order", value=False)

            # Read-only table; elem_id ties it to the #leaderboard-table CSS rules.
            board = gr.Dataframe(
                value=update_leaderboard(DEFAULT_CATEGORY, False),
                headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
                datatype=["number", "html", "str", "str", "number"],
                row_count=(len(master_df), "fixed"),
                col_count=(5, "fixed"),
                interactive=False,
                elem_id="leaderboard-table",
            )

            # Changing either control rebuilds the table from both current values.
            for control in (category_radio, asc_check):
                control.change(update_leaderboard, [category_radio, asc_check], board)

        with gr.TabItem("ℹ️ About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # Collapsed by default; the textbox offers a copy button for the citation.
    with gr.Accordion("📖 Citation", open=False):
        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=10, show_copy_button=True)
# ---------------------------------------------------------------------------
# Optional scheduler (HF Space restarts)
# ---------------------------------------------------------------------------
def restart_space():
    """Print a restart notice naming ``REPO_ID``.

    This function only writes the message to stdout; it issues no restart
    request itself.
    """
    notice = f"Restarting space → {REPO_ID}"
    print(notice)
if __name__ == "__main__":
    # The periodic job is only scheduled once REPO_ID has been changed from
    # its placeholder value.
    has_real_repo_id = REPO_ID != "your/space-id"
    if has_real_repo_id:
        scheduler = BackgroundScheduler()
        # Call restart_space every 1800 seconds (30 minutes) in the background.
        scheduler.add_job(restart_space, trigger="interval", seconds=1800)
        scheduler.start()

    print("Launching app…")
    app.launch()