# Spaces: Running  (page-status text captured alongside the source; not program code)
import html

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
# --- Placeholder Imports / Definitions ---
# Prefer the project's own text, CSS, config and submission modules. When
# they cannot be imported, define local placeholders for every imported name
# so the rest of this module still loads.
try:
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        EVALUATION_QUEUE_TEXT,
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,  # Will override below
    )
    from src.display.css_html_js import custom_css
    from src.envs import REPO_ID
    from src.submission.submit import add_new_eval
except ImportError:
    CITATION_BUTTON_LABEL = "Citation"
    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark..."
    EVALUATION_QUEUE_TEXT = "Current evaluation queue:"
    INTRODUCTION_TEXT = "Welcome to the MLE-Dojo Benchmark Leaderboard."
    LLM_BENCHMARKS_TEXT = "Information about the benchmarks..."
    # Keep both branches defining the same names, so TITLE exists even if
    # nothing reassigns it later.
    TITLE = "MLE-Dojo Benchmark Leaderboard"
    custom_css = ""
    REPO_ID = "your/space-id"

    def add_new_eval(*args, **kwargs):
        """Stand-in for the real submission handler.

        Accepts and ignores any arguments and returns a fixed placeholder
        message.
        """
        return "Submission placeholder."
# --- Elo Data ---
# One record per model: display metadata (name, link, organizer, license)
# plus one Elo score per benchmark category and an "Overall" score.
data = [
    {
        "model_name": "gpt-4o-mini",
        "url": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "organizer": "OpenAI",
        "license": "Proprietary",
        "MLE-Lite_Elo": 753,
        "Tabular_Elo": 839,
        "NLP_Elo": 758,
        "CV_Elo": 754,
        "Overall": 778,
    },
    {
        "model_name": "gpt-4o",
        "url": "https://openai.com/index/hello-gpt-4o/",
        "organizer": "OpenAI",
        "license": "Proprietary",
        "MLE-Lite_Elo": 830,
        "Tabular_Elo": 861,
        "NLP_Elo": 903,
        "CV_Elo": 761,
        "Overall": 841,
    },
    {
        "model_name": "o3-mini",
        "url": "https://openai.com/index/openai-o3-mini/",
        "organizer": "OpenAI",
        "license": "Proprietary",
        "MLE-Lite_Elo": 1108,
        "Tabular_Elo": 1019,
        "NLP_Elo": 1056,
        "CV_Elo": 1207,
        "Overall": 1096,
    },
    {
        "model_name": "deepseek-v3",
        "url": "https://api-docs.deepseek.com/news/news1226",
        "organizer": "DeepSeek",
        "license": "DeepSeek",
        "MLE-Lite_Elo": 1004,
        "Tabular_Elo": 1015,
        "NLP_Elo": 1028,
        "CV_Elo": 1067,
        "Overall": 1023,
    },
    {
        "model_name": "deepseek-r1",
        "url": "https://api-docs.deepseek.com/news/news250120",
        "organizer": "DeepSeek",
        "license": "DeepSeek",
        "MLE-Lite_Elo": 1137,
        "Tabular_Elo": 1053,
        "NLP_Elo": 1103,
        "CV_Elo": 1083,
        "Overall": 1100,
    },
    {
        "model_name": "gemini-2.0-flash",
        "url": "https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash",
        "organizer": "Google",
        "license": "Proprietary",
        "MLE-Lite_Elo": 847,
        "Tabular_Elo": 923,
        "NLP_Elo": 860,
        "CV_Elo": 978,
        "Overall": 895,
    },
    {
        "model_name": "gemini-2.0-pro",
        "url": "https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/",
        "organizer": "Google",
        "license": "Proprietary",
        "MLE-Lite_Elo": 1064,
        "Tabular_Elo": 1139,
        "NLP_Elo": 1028,
        "CV_Elo": 973,
        "Overall": 1054,
    },
    {
        "model_name": "gemini-2.5-pro",
        "url": "https://deepmind.google/technologies/gemini/pro/",
        "organizer": "Google",
        "license": "Proprietary",
        "MLE-Lite_Elo": 1257,
        "Tabular_Elo": 1150,
        "NLP_Elo": 1266,
        "CV_Elo": 1177,
        "Overall": 1214,
    },
]
master_df = pd.DataFrame(data)

# Category label shown in the UI -> master_df column holding its scores.
category_to_column = {
    "Overall": "Overall",
    "MLE-Lite": "MLE-Lite_Elo",
    "Tabular": "Tabular_Elo",
    "NLP": "NLP_Elo",
    "CV": "CV_Elo",
}
# Derived from the mapping (insertion order) so the selectable categories and
# the column lookup cannot drift apart.
CATEGORIES = list(category_to_column)
DEFAULT_CATEGORY = "Overall"
def update_leaderboard(category: str) -> pd.DataFrame:
    """Build the ranked leaderboard table for one category.

    Args:
        category: A key of ``category_to_column``. Any other value falls
            back to the column of ``DEFAULT_CATEGORY``.

    Returns:
        A new DataFrame with the columns ``Rank``, ``Model``, ``Organizer``,
        ``License`` and ``Elo Score``, sorted by score in descending order.
        ``Rank`` starts at 1 and ``Model`` is an HTML link to the model's URL.
        ``master_df`` itself is not modified.
    """
    score_col = category_to_column.get(
        category, category_to_column[DEFAULT_CATEGORY]
    )
    # A stable sort keeps models with equal scores in master_df order, so the
    # ranking is deterministic when scores tie.
    ranked = (
        master_df[["model_name", "url", "organizer", "license", score_col]]
        .sort_values(by=score_col, ascending=False, kind="stable")
        .reset_index(drop=True)
    )
    ranked.insert(0, "Rank", ranked.index + 1)
    # Escape both values: they are interpolated into an HTML attribute and
    # into element text, where a stray quote or "<" would break the markup.
    ranked["Model"] = [
        f"<a href='{html.escape(str(url))}' target='_blank'>"
        f"{html.escape(str(name))}</a>"
        for url, name in zip(ranked["url"], ranked["model_name"])
    ]
    ranked = ranked.rename(
        columns={
            score_col: "Elo Score",
            "organizer": "Organizer",
            "license": "License",
        }
    )
    return ranked[["Rank", "Model", "Organizer", "License", "Elo Score"]]
# --- Dark Theme + Custom CSS ---
# CSS only honours @import rules that come before every style rule, so the
# font import is placed in front of the CSS loaded earlier instead of being
# appended after it (where it would be ignored).
_FONT_IMPORT_CSS = (
    "@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');\n"
)

_DARK_THEME_CSS = """
body {
  font-family: 'Inter', sans-serif;
  background-color: #1e1e2f !important;
  color: #e0e0f0 !important;
}
/* Hero Section */
.hero-section {
  background: linear-gradient(135deg, #6c63ff, #8f94fb);
  color: #fff;
  padding: 2rem 1rem;
  border-radius: .75rem;
  margin-bottom: 1.5rem;
  text-align: center;
  box-shadow: 0 4px 10px rgba(0,0,0,0.3);
}
.hero-section h1 {
  margin: 0;
  font-size: 2.5rem !important;
  font-weight: 700 !important;
}
.hero-section h2 {
  margin: .5rem 0 0 !important;
  font-size: 1.25rem !important;
  font-weight: 400 !important;
  opacity: 0.9;
}
/* Tab Buttons */
.tab-buttons button {
  border-radius: 20px !important;
  padding: 0.5rem 1rem !important;
  margin-right: 0.5rem !important;
  background: #3a3a4c !important;
  color: #e0e0f0 !important;
  border: none !important;
  transition: background 0.3s !important;
  font-weight: 500 !important;
}
.tab-buttons button:hover {
  background: #4a4a6f !important;
}
.tab-buttons button[aria-selected="true"] {
  background: #6c63ff !important;
  color: #fff !important;
}
/* Category Selector Pills */
#category-selector input[type="radio"] { display: none; }
#category-selector label {
  display: inline-block;
  padding: 0.5rem 1rem;
  margin-right: 0.5rem;
  border-radius: 999px;
  background: #3a3a4c;
  color: #e0e0f0;
  cursor: pointer;
  transition: background 0.3s, color 0.3s;
  font-weight: 500;
}
#category-selector input[type="radio"]:checked + label {
  background: #6c63ff;
  color: #fff;
}
/* Same highlight when the radio input is nested inside its label */
#category-selector label:has(input[type="radio"]:checked) {
  background: #6c63ff;
  color: #fff;
}
/* Table Styling */
table {
  width: 100%;
  border: none;
  border-radius: .5rem;
  overflow: hidden;
  box-shadow: 0 2px 4px rgba(0,0,0,0.3);
  margin: 1rem 0;
}
th {
  background: #6c63ff !important;
  color: #fff !important;
}
td, th {
  padding: 0.75rem 1rem;
  background: #1e1e2f;
  color: #e0e0f0;
}
tr:nth-child(even) td {
  background: #2a2a3c;
}
tr:hover td {
  background: #3c3b52;
}
td a {
  color: #9afeff;
  text-decoration: none;
}
td a:hover {
  text-decoration: underline;
}
"""

# NOTE(review): the :has() rule is kept separate from the sibling-selector
# rule so a browser that rejects :has() still applies the original one.
# Confirm against the rendered Radio markup which of the two actually matches.
custom_css = _FONT_IMPORT_CSS + custom_css + _DARK_THEME_CSS
# --- Override Title with Hero ---
# Replaces any previously defined TITLE with the hero banner markup; the
# "hero-section" class is the hook for the banner styling.
# NOTE(review): the leading "π" in the heading looks like a mis-decoded emoji.
# The original character cannot be recovered from this file, so it is left
# untouched -- replace it with the intended symbol.
TITLE = """
<div class="hero-section">
  <h1>π MLE-Dojo Benchmark Leaderboard</h1>
  <h2>Improving LLM Agents for Machine Learning Engineering</h2>
</div>
"""
# --- Build App (dark palette comes from custom_css) ---
# Gradio's documented prebuilt themes do not include one named "Dark", so
# `gr.themes.Dark()` fails with AttributeError. Use it if the installed
# gradio happens to provide it; otherwise fall back to the Base theme.
_theme_cls = getattr(gr.themes, "Dark", gr.themes.Base)
demo = gr.Blocks(css=custom_css, theme=_theme_cls())

# NOTE(review): the "π" prefixes in the labels below look like mis-decoded
# emoji whose original characters cannot be recovered from this file; they
# are left untouched. The About label's prefix decoded unambiguously to the
# information-source emoji and is written with escapes to keep it ASCII-safe.
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("π Leaderboard"):
            gr.Markdown("## Model Elo Rankings by Category")
            category_selector = gr.Radio(
                choices=CATEGORIES,
                value=DEFAULT_CATEGORY,
                interactive=True,
                elem_id="category-selector",
            )
            # Headers and datatypes mirror the five columns returned by
            # update_leaderboard; "html" lets the Model link render.
            leaderboard_df = gr.Dataframe(
                value=update_leaderboard(DEFAULT_CATEGORY),
                headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
                datatype=["number", "html", "str", "str", "number"],
                interactive=False,
                row_count=(len(master_df), "fixed"),
                col_count=(5, "fixed"),
                wrap=True,
                elem_id="leaderboard-table",
            )
            # Re-rank the table whenever a different category is selected.
            category_selector.change(
                fn=update_leaderboard,
                inputs=category_selector,
                outputs=leaderboard_df,
            )

        with gr.TabItem("\u2139\ufe0f About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # NOTE(review): nesting was reconstructed; the accordion is placed below
    # the tabs at page level -- move it into a tab if that was the intent.
    with gr.Accordion("π Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=10,
            elem_id="citation-button",
            show_copy_button=True,
        )

if __name__ == "__main__":
    print("Launching Gradio App...")
    demo.launch()