Leaderboard / app.py
Jerrycool's picture
Update app.py
ffb569a verified
raw
history blame
8.1 kB
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
# --- Project imports, with placeholders for running without the `src` package ---
try:
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        EVALUATION_QUEUE_TEXT,
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,  # reassigned later in this file by the hero-banner markup
    )
    from src.display.css_html_js import custom_css
    from src.envs import REPO_ID
    from src.submission.submit import add_new_eval
except ImportError:
    # A failure of any one import above lands here, so every name in the
    # group is (re)bound to a stand-in value. TITLE gets no stand-in because
    # it is assigned unconditionally further down.
    CITATION_BUTTON_LABEL = "Citation"
    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark..."
    EVALUATION_QUEUE_TEXT = "Current evaluation queue:"
    INTRODUCTION_TEXT = "Welcome to the MLE-Dojo Benchmark Leaderboard."
    LLM_BENCHMARKS_TEXT = "Information about the benchmarks..."
    custom_css = ""
    REPO_ID = "your/space-id"

    def add_new_eval(*args):
        """Stand-in submission handler: ignore the arguments, return a fixed message."""
        return "Submission placeholder."
# --- Elo Data ---
# Field names shared by every row of _ELO_ROWS. Their order fixes the key
# order of each dict in `data` and therefore the column order of `master_df`.
_ELO_FIELDS = (
    'model_name', 'url', 'organizer', 'license',
    'MLE-Lite_Elo', 'Tabular_Elo', 'NLP_Elo', 'CV_Elo', 'Overall',
)

# One tuple per model, values positioned to match _ELO_FIELDS.
_ELO_ROWS = (
    ('gpt-4o-mini', 'https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/', 'OpenAI', 'Proprietary', 753, 839, 758, 754, 778),
    ('gpt-4o', 'https://openai.com/index/hello-gpt-4o/', 'OpenAI', 'Proprietary', 830, 861, 903, 761, 841),
    ('o3-mini', 'https://openai.com/index/openai-o3-mini/', 'OpenAI', 'Proprietary', 1108, 1019, 1056, 1207, 1096),
    ('deepseek-v3', 'https://api-docs.deepseek.com/news/news1226', 'DeepSeek', 'DeepSeek', 1004, 1015, 1028, 1067, 1023),
    ('deepseek-r1', 'https://api-docs.deepseek.com/news/news250120', 'DeepSeek', 'DeepSeek', 1137, 1053, 1103, 1083, 1100),
    ('gemini-2.0-flash', 'https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash', 'Google', 'Proprietary', 847, 923, 860, 978, 895),
    ('gemini-2.0-pro', 'https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/', 'Google', 'Proprietary', 1064, 1139, 1028, 973, 1054),
    ('gemini-2.5-pro', 'https://deepmind.google/technologies/gemini/pro/', 'Google', 'Proprietary', 1257, 1150, 1266, 1177, 1214),
)

# Same list-of-dicts shape as a hand-written literal, one dict per model.
data = [dict(zip(_ELO_FIELDS, row)) for row in _ELO_ROWS]
master_df = pd.DataFrame(data)

# Category label shown in the UI -> score column of `master_df`.
category_to_column = {
    "Overall": "Overall",
    "MLE-Lite": "MLE-Lite_Elo",
    "Tabular": "Tabular_Elo",
    "NLP": "NLP_Elo",
    "CV": "CV_Elo",
}
# The selectable categories are exactly the mapping's keys, in declaration order.
CATEGORIES = list(category_to_column)
DEFAULT_CATEGORY = "Overall"
def update_leaderboard(category):
    """Build the ranked leaderboard table for one benchmark category.

    Args:
        category: A key of ``category_to_column``. Any other value
            (including ``None``) falls back to the ``DEFAULT_CATEGORY`` column.

    Returns:
        A new DataFrame with the columns ``Rank``, ``Model``, ``Organizer``,
        ``License`` and ``Elo Score``, sorted by score in descending order.
        ``Rank`` starts at 1 and ``Model`` is an HTML anchor linking the model
        name to its URL. ``master_df`` itself is not modified.
    """
    # Function-scope import so the block does not depend on the file's import list.
    from html import escape

    col = category_to_column.get(category, category_to_column[DEFAULT_CATEGORY])
    ranked = (
        master_df[['model_name', 'url', 'organizer', 'license', col]]
        .sort_values(by=col, ascending=False)
        .reset_index(drop=True)
    )
    ranked.insert(0, 'Rank', ranked.index + 1)

    # Escape both interpolated values: a quote in the URL would otherwise end
    # the single-quoted href early, and '<' or '&' in a name would be parsed
    # as markup. A list comprehension (rather than a row-wise apply) also
    # keeps the assignment valid when the table has no rows.
    ranked['Model'] = [
        f"<a href='{escape(str(url))}' target='_blank'>{escape(str(name))}</a>"
        for url, name in zip(ranked['url'], ranked['model_name'])
    ]

    ranked = ranked.rename(
        columns={col: 'Elo Score', 'organizer': 'Organizer', 'license': 'License'}
    )
    return ranked[['Rank', 'Model', 'Organizer', 'License', 'Elo Score']]
# --- Dark Theme + Custom CSS ---
# CSS only honours an @import that appears before every style rule in the
# sheet. Appending the font import after the stylesheet already held in
# `custom_css` makes browsers ignore it whenever that stylesheet is non-empty,
# so the import is placed in front of the existing rules and the theme rules
# are appended after them.
_FONT_IMPORT_CSS = (
    "@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');\n"
)

# NOTE(review): the "#category-selector input:checked + label" rule assumes the
# radio <input> is a preceding sibling of its <label>; confirm against the DOM
# that the installed Gradio version actually renders for gr.Radio.
_DARK_THEME_CSS = """
body {
font-family: 'Inter', sans-serif;
background-color: #1e1e2f !important;
color: #e0e0f0 !important;
}
/* Hero Section */
.hero-section {
background: linear-gradient(135deg, #6c63ff, #8f94fb);
color: #fff;
padding: 2rem 1rem;
border-radius: .75rem;
margin-bottom: 1.5rem;
text-align: center;
box-shadow: 0 4px 10px rgba(0,0,0,0.3);
}
.hero-section h1 {
margin: 0;
font-size: 2.5rem !important;
font-weight: 700 !important;
}
.hero-section h2 {
margin: .5rem 0 0 !important;
font-size: 1.25rem !important;
font-weight: 400 !important;
opacity: 0.9;
}
/* Tab Buttons */
.tab-buttons button {
border-radius: 20px !important;
padding: 0.5rem 1rem !important;
margin-right: 0.5rem !important;
background: #3a3a4c !important;
color: #e0e0f0 !important;
border: none !important;
transition: background 0.3s !important;
font-weight: 500 !important;
}
.tab-buttons button:hover {
background: #4a4a6f !important;
}
.tab-buttons button[aria-selected="true"] {
background: #6c63ff !important;
color: #fff !important;
}
/* Category Selector Pills */
#category-selector input[type="radio"] { display: none; }
#category-selector label {
display: inline-block;
padding: 0.5rem 1rem;
margin-right: 0.5rem;
border-radius: 999px;
background: #3a3a4c;
color: #e0e0f0;
cursor: pointer;
transition: background 0.3s, color 0.3s;
font-weight: 500;
}
#category-selector input[type="radio"]:checked + label {
background: #6c63ff;
color: #fff;
}
/* Table Styling */
table {
width: 100%;
border: none;
border-radius: .5rem;
overflow: hidden;
box-shadow: 0 2px 4px rgba(0,0,0,0.3);
margin: 1rem 0;
}
th {
background: #6c63ff !important;
color: #fff !important;
}
td, th {
padding: 0.75rem 1rem;
background: #1e1e2f;
color: #e0e0f0;
}
tr:nth-child(even) td {
background: #2a2a3c;
}
tr:hover td {
background: #3c3b52;
}
td a {
color: #9afeff;
text-decoration: none;
}
td a:hover {
text-decoration: underline;
}
"""

custom_css = _FONT_IMPORT_CSS + custom_css + _DARK_THEME_CSS
# --- Override Title with Hero ---
# Hero banner markup shown at the top of the page; styled by the
# `.hero-section` CSS rules. The trophy emoji is written as a \N{...} escape
# so the source stays ASCII and the character cannot be mangled again by a
# wrong file encoding (it had decayed into mojibake).
TITLE = """
<div class="hero-section">
<h1>\N{TROPHY} MLE-Dojo Benchmark Leaderboard</h1>
<h2>Improving LLM Agents for Machine Learning Engineering</h2>
</div>
"""
# --- Build App ---
# `Dark` is not one of Gradio's prebuilt themes, so referencing
# `gr.themes.Dark` directly raises AttributeError while the module is loading.
# Use it only if the installed Gradio provides it; otherwise fall back to the
# default theme and let the custom CSS supply the dark look.
_theme_cls = getattr(gr.themes, "Dark", None) or gr.themes.Default
demo = gr.Blocks(css=custom_css, theme=_theme_cls())

# Emoji in the labels below are written as \N{...} escapes because they had
# decayed into mojibake; the escapes keep those labels encoding-proof.
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("\N{CLIPBOARD} Leaderboard"):
            gr.Markdown("## Model Elo Rankings by Category")
            category_selector = gr.Radio(
                choices=CATEGORIES,
                value=DEFAULT_CATEGORY,
                interactive=True,
                elem_id="category-selector",
            )
            # Initial table for the default category; the row count is pinned
            # to the number of models because every category lists all of them.
            leaderboard_df = gr.Dataframe(
                value=update_leaderboard(DEFAULT_CATEGORY),
                headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
                datatype=["number", "html", "str", "str", "number"],
                interactive=False,
                row_count=(len(master_df), "fixed"),
                col_count=(5, "fixed"),
                wrap=True,
                elem_id="leaderboard-table",
            )
            # Re-rank the table whenever a different category is selected.
            category_selector.change(
                fn=update_leaderboard,
                inputs=category_selector,
                outputs=leaderboard_df,
            )

        with gr.TabItem("ℹ️ About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # NOTE(review): nesting reconstructed from a source whose indentation was
    # lost; the citation accordion is placed below the tabs - confirm.
    with gr.Accordion("\N{ORANGE BOOK} Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=10,
            elem_id="citation-button",
            show_copy_button=True,
        )
# Script entry point: start the Gradio server only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    print("Launching Gradio App...")
    demo.launch()