import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime
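
# Assumed third-party dependencies for this script (not pinned anywhere in the original):
# gradio, pandas, and apscheduler, e.g. `pip install gradio pandas apscheduler`.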
# --- Make sure these imports work relative to your file structure ---
# Option 1: If src is a directory in the same folder as your script:
try:
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        EVALUATION_QUEUE_TEXT,
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,
    )
    from src.display.css_html_js import custom_css
    from src.envs import REPO_ID
    from src.submission.submit import add_new_eval
    print("Successfully imported from src module.")
# Option 2: If you don't have these files, define placeholders
except ImportError:
    print("Warning: Using placeholder values because src module imports failed.")
    CITATION_BUTTON_LABEL = "Citation"
    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark..."
    EVALUATION_QUEUE_TEXT = "Current evaluation queue:"
    INTRODUCTION_TEXT = """
# Welcome to the MLE-Dojo Benchmark Leaderboard
This leaderboard tracks the performance of various AI models across multiple machine learning engineering domains.
Our comprehensive evaluation system uses ELO ratings to provide a fair comparison between different models.
## How to read this leaderboard
- Select a domain category to view specialized rankings
- Higher ELO scores indicate better performance
- Click on any model name to learn more about it
"""
    LLM_BENCHMARKS_TEXT = """
# About the MLE-Dojo Benchmark
## Evaluation Methodology
The MLE-Dojo benchmark evaluates models across various domains including:
- **MLE-Lite**: Basic machine learning engineering tasks
- **Tabular**: Data manipulation, analysis, and modeling with structured data
- **NLP**: Natural language processing tasks including classification, generation, and understanding
- **CV**: Computer vision tasks including image classification, object detection, and generation
Our evaluation uses a sophisticated ELO rating system that considers the relative performance of models against each other.
## Contact
For more information or to submit your model, please contact us at [email protected]
"""
    TITLE = "<h1>🏆 MLE-Dojo Benchmark Leaderboard</h1>"
    custom_css = ""
    REPO_ID = "your/space-id"
    def add_new_eval(*args): return "Submission placeholder."
# --- Elo Leaderboard Configuration ---
# Enhanced data with Rank (placeholder), Organizer, License, and URL
data = [
    {'model_name': 'gpt-4o-mini', 'url': 'https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
    {'model_name': 'gpt-4o', 'url': 'https://openai.com/index/hello-gpt-4o/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
    {'model_name': 'o3-mini', 'url': 'https://openai.com/index/openai-o3-mini/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096},
    {'model_name': 'deepseek-v3', 'url': 'https://api-docs.deepseek.com/news/news1226', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
    {'model_name': 'deepseek-r1', 'url': 'https://api-docs.deepseek.com/news/news250120', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
    {'model_name': 'gemini-2.0-flash', 'url': 'https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
    {'model_name': 'gemini-2.0-pro', 'url': 'https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
    {'model_name': 'gemini-2.5-pro', 'url': 'https://deepmind.google/technologies/gemini/pro/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
]
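# Note: these Elo figures are a hard-coded snapshot. Update the list above (or load it
# from a file / Hub dataset instead) whenever new evaluation results come in.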
# Add organization logos (for visual enhancement)
org_logos = {
    'OpenAI': '🌱',  # You can replace these with actual icon URLs in production
    'DeepSeek': '🐋',
    'Google': '🔍',
    'Default': '🤖'
}
# Create a master DataFrame
master_df = pd.DataFrame(data)

# Add last updated timestamp
last_updated = datetime.now().strftime("%B %d, %Y at %H:%M:%S")
# Define categories with fancy icons
CATEGORIES = [
    ("🏆 Overall", "Overall"),
    ("💡 MLE-Lite", "MLE-Lite"),
    ("📊 Tabular", "Tabular"),
    ("📝 NLP", "NLP"),
    ("👁️ CV", "CV")
]
DEFAULT_CATEGORY = "Overall"

# Map user-facing categories to DataFrame column names
category_to_column = {
    "MLE-Lite": "MLE-Lite_Elo",
    "Tabular": "Tabular_Elo",
    "NLP": "NLP_Elo",
    "CV": "CV_Elo",
    "Overall": "Overall"
}
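# Note: the Radio component passes the full display label (e.g. "🏆 Overall");
# update_leaderboard() strips the leading icon before looking the category up here.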
# --- Helper function to update leaderboard ---
def update_leaderboard(category_label):
    """
    Enhanced function to update the leaderboard with visual improvements
    """
    # Extract the category value from the label if it's a tuple (icon, value)
    if isinstance(category_label, tuple):
        category = category_label[1]
    else:
        # For backward compatibility or direct values
        category = category_label.split(" ")[-1] if " " in category_label else category_label

    score_column = category_to_column.get(category)
    if score_column is None or score_column not in master_df.columns:
        print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
        score_column = category_to_column[DEFAULT_CATEGORY]
        if score_column not in master_df.columns:
            print(f"Error: Default column '{score_column}' also not found.")
            return pd.DataFrame({
                "Rank": [],
                "Model": [],
                "Organizer": [],
                "License": [],
                "Elo Score": []
            })

    # Select base columns + the score column for sorting
    cols_to_select = ['model_name', 'url', 'organizer', 'license', score_column]
    df = master_df[cols_to_select].copy()

    # Sort by the selected 'Elo Score' descending
    df.sort_values(by=score_column, ascending=False, inplace=True)

    # Add Rank with medal emojis for top 3
    df.reset_index(drop=True, inplace=True)

    # Create fancy rank with medals for top positions
    def get_rank_display(idx):
        if idx == 0:
            return "🥇 1"
        elif idx == 1:
            return "🥈 2"
        elif idx == 2:
            return "🥉 3"
        else:
            return f"{idx + 1}"

    df.insert(0, 'Rank', df.index.map(get_rank_display))

    # Add organization icons to model names
    df['Model'] = df.apply(
        lambda row: f"""<div style="display: flex; align-items: center;">
            <span style="font-size: 1.5em; margin-right: 10px;">{org_logos.get(row['organizer'], org_logos['Default'])}</span>
            <a href='{row['url'] if pd.notna(row['url']) else '#'}' target='_blank'
               style='color: #0066cc; text-decoration: none; font-weight: 500; font-size: 1.05em;'>
                {row['model_name']}
            </a>
        </div>""",
        axis=1
    )

    # Format Elo scores with visual indicators
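    # The bar width maps Elo scores from roughly 700 to 1400 onto 5-100% of the
    # 60px track via (score - 700) / 7, clamped so low scores still show a sliver.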
    df['Elo Display'] = df[score_column].apply(
        lambda score: f"""<div style="display: flex; align-items: center;">
            <span style="font-weight: bold; color: {'#1a5fb4' if score >= 1000 else '#2ec27e' if score >= 900 else '#e5a50a' if score >= 800 else '#ff7800'}">
                {score}
            </span>
            <div style="margin-left: 10px; height: 12px; width: 60px; background-color: #eaeaea; border-radius: 6px; overflow: hidden;">
                <div style="height: 100%; width: {min(100, max(5, (score-700)/7))}%; background-color: {'#1a5fb4' if score >= 1000 else '#2ec27e' if score >= 900 else '#e5a50a' if score >= 800 else '#ff7800'};"></div>
            </div>
        </div>"""
    )

    # Rename columns for display
    df.rename(columns={score_column: 'Elo Score'}, inplace=True)
    df.rename(columns={'organizer': 'Organizer', 'license': 'License'}, inplace=True)

    # Select and reorder columns for final display
    final_columns = ["Rank", "Model", "Organizer", "License", "Elo Display"]
    df = df[final_columns]

    # Rename for display
    df.columns = ["Rank", "Model", "Organization", "License", f"Elo Score ({category})"]

    return df
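
# Lightweight self-check (illustrative addition, safe to remove): render the default
# view once and confirm the display columns come back in the expected order.
_default_view = update_leaderboard(DEFAULT_CATEGORY)
assert list(_default_view.columns) == [
    "Rank", "Model", "Organization", "License", f"Elo Score ({DEFAULT_CATEGORY})"
], "update_leaderboard() returned unexpected columns"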
# --- Mock/Placeholder functions/data for other tabs ---
print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
EVAL_COLS = ["Model", "Status", "Requested", "Started"]
EVAL_TYPES = ["str", "str", "str", "str"]

# --- Keep restart function if relevant ---
def restart_space():
    print(f"Attempting to restart space: {REPO_ID}")
    # Replace with your actual space restart mechanism if needed
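    # One possible implementation (an assumption, not part of the original script:
    # requires running on a Hugging Face Space with a write-scoped HF_TOKEN set):
    #
    #     import os
    #     from huggingface_hub import HfApi
    #     HfApi(token=os.environ.get("HF_TOKEN")).restart_space(repo_id=REPO_ID)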
# --- Enhanced CSS for beauty and readability ---
enhanced_css = """
/* Base styling */
:root {
    --primary-color: #1a5fb4;
    --secondary-color: #2ec27e;
    --accent-color: #e5a50a;
    --warning-color: #ff7800;
    --text-color: #333333;
    --background-color: #ffffff;
    --card-background: #f9f9f9;
    --border-color: #e0e0e0;
    --shadow-color: rgba(0, 0, 0, 0.1);
}
/* Typography */
body, .gradio-container {
    font-family: 'Inter', 'Segoe UI', Roboto, -apple-system, BlinkMacSystemFont, system-ui, sans-serif !important;
    font-size: 16px !important;
    line-height: 1.6 !important;
    color: var(--text-color) !important;
    background-color: var(--background-color) !important;
}
/* Headings */
h1 {
    font-size: 2.5rem !important;
    font-weight: 700 !important;
    margin-bottom: 1.5rem !important;
    color: var(--primary-color) !important;
    text-align: center !important;
    letter-spacing: -0.02em !important;
    line-height: 1.2 !important;
}
h2 {
    font-size: 1.8rem !important;
    font-weight: 600 !important;
    margin-top: 1.5rem !important;
    margin-bottom: 1rem !important;
    color: var(--primary-color) !important;
    letter-spacing: -0.01em !important;
}
h3 {
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin-top: 1.2rem !important;
    margin-bottom: 0.8rem !important;
    color: var(--text-color) !important;
}
/* Tabs styling */
.tabs {
    margin-top: 1rem !important;
    border-radius: 12px !important;
    overflow: hidden !important;
    box-shadow: 0 4px 12px var(--shadow-color) !important;
}
.tab-nav button {
    font-size: 1.1rem !important;
    font-weight: 500 !important;
    padding: 0.8rem 1.5rem !important;
    border-radius: 0 !important;
    transition: all 0.2s ease !important;
}
.tab-nav button.selected {
    background-color: var(--primary-color) !important;
    color: white !important;
    font-weight: 600 !important;
}
/* Card styling */
.gradio-container .gr-box, .gradio-container .gr-panel {
    border-radius: 12px !important;
    border: 1px solid var(--border-color) !important;
    box-shadow: 0 4px 12px var(--shadow-color) !important;
    overflow: hidden !important;
}
/* Table styling */
table {
    width: 100% !important;
    border-collapse: separate !important;
    border-spacing: 0 !important;
    margin: 1.5rem 0 !important;
    border-radius: 8px !important;
    overflow: hidden !important;
    box-shadow: 0 4px 12px var(--shadow-color) !important;
}
th {
    background-color: #f0f5ff !important;
    color: var(--primary-color) !important;
    font-weight: 600 !important;
    padding: 1rem !important;
    font-size: 1.1rem !important;
    text-align: left !important;
    border-bottom: 2px solid var(--primary-color) !important;
}
td {
    padding: 1rem !important;
    border-bottom: 1px solid var(--border-color) !important;
    font-size: 1rem !important;
    vertical-align: middle !important;
}
tr:nth-child(even) {
    background-color: #f8fafd !important;
}
tr:hover {
    background-color: #edf2fb !important;
}
tr:first-child td {
    border-top: none !important;
}
/* Button styling */
button.primary, .gr-button.primary {
    background-color: var(--primary-color) !important;
    color: white !important;
    font-weight: 500 !important;
    padding: 0.8rem 1.5rem !important;
    border-radius: 8px !important;
    border: none !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1) !important;
}
button.primary:hover, .gr-button.primary:hover {
    background-color: #0b4a9e !important;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important;
    transform: translateY(-1px) !important;
}
/* Radio buttons */
.gr-radio {
    display: flex !important;
    flex-wrap: wrap !important;
    gap: 10px !important;
    margin: 1rem 0 !important;
}
.gr-radio label {
    background-color: #f5f7fa !important;
    border: 1px solid var(--border-color) !important;
    border-radius: 8px !important;
    padding: 0.7rem 1.2rem !important;
    font-size: 1rem !important;
    font-weight: 500 !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
    display: flex !important;
    align-items: center !important;
    gap: 8px !important;
}
.gr-radio label:hover {
    background-color: #eaeef3 !important;
    border-color: #c0c9d6 !important;
}
.gr-radio label.selected {
    background-color: #e0e9f7 !important;
    border-color: var(--primary-color) !important;
    color: var(--primary-color) !important;
    font-weight: 600 !important;
}
/* Input fields */
input, textarea, select {
    font-size: 1rem !important;
    padding: 0.8rem !important;
    border-radius: 8px !important;
    border: 1px solid var(--border-color) !important;
    transition: all 0.2s ease !important;
}
input:focus, textarea:focus, select:focus {
    border-color: var(--primary-color) !important;
    box-shadow: 0 0 0 2px rgba(26, 95, 180, 0.2) !important;
    outline: none !important;
}
/* Accordion styling */
.gr-accordion {
    border-radius: 8px !important;
    overflow: hidden !important;
    margin: 1rem 0 !important;
    border: 1px solid var(--border-color) !important;
}
.gr-accordion-header {
    padding: 1rem !important;
    background-color: #f5f7fa !important;
    font-weight: 600 !important;
    font-size: 1.1rem !important;
    color: var(--text-color) !important;
}
.gr-accordion-content {
    padding: 1rem !important;
    background-color: white !important;
}
/* Markdown text improvements */
.markdown-text {
    font-size: 1.05rem !important;
    line-height: 1.7 !important;
}
.markdown-text p {
    margin-bottom: 1rem !important;
}
.markdown-text ul, .markdown-text ol {
    margin-left: 1.5rem !important;
    margin-bottom: 1rem !important;
}
.markdown-text li {
    margin-bottom: 0.5rem !important;
}
.markdown-text strong {
    font-weight: 600 !important;
    color: #333 !important;
}
/* Status indicators */
.status-badge {
    display: inline-block;
    padding: 0.3rem 0.7rem;
    border-radius: 99px;
    font-size: 0.85rem;
    font-weight: 500;
    text-align: center;
}
.status-pending {
    background-color: #fff8e0;
    color: #b58a00;
    border: 1px solid #ffd74d;
}
.status-running {
    background-color: #e0f2ff;
    color: #0066cc;
    border: 1px solid #66b3ff;
}
.status-completed {
    background-color: #e6f7ef;
    color: #00875a;
    border: 1px solid #57d9a3;
}
/* Footer */
.footer {
    margin-top: 2rem;
    padding: 1rem;
    text-align: center;
    font-size: 0.9rem;
    color: #666;
    border-top: 1px solid var(--border-color);
}
/* Enhanced leaderboard title */
.leaderboard-header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    margin-bottom: 1.5rem;
    padding-bottom: 1rem;
    border-bottom: 2px solid var(--border-color);
}
.leaderboard-title {
    font-size: 2.2rem;
    font-weight: 700;
    color: var(--primary-color);
    margin: 0;
    display: flex;
    align-items: center;
    gap: 0.5rem;
}
.leaderboard-subtitle {
    font-size: 1.1rem;
    color: #666;
    margin-top: 0.5rem;
}
.timestamp {
    font-size: 0.85rem;
    color: #666;
    font-style: italic;
}
/* Category selector buttons */
.category-buttons {
    display: flex;
    flex-wrap: wrap;
    gap: 10px;
    margin-bottom: 1.5rem;
}
.category-button {
    padding: 0.7rem 1.2rem;
    background-color: #f0f5ff;
    border: 1px solid #d0e0ff;
    border-radius: 8px;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.2s ease;
    display: flex;
    align-items: center;
    gap: 8px;
}
.category-button:hover {
    background-color: #e0ebff;
    border-color: #b0d0ff;
}
.category-button.active {
    background-color: var(--primary-color);
    color: white;
    border-color: var(--primary-color);
}
/* Logo and brand styling */
.logo {
    font-size: 2.5em;
    margin-right: 0.5rem;
}
/* Medal styling for top ranks */
.rank-1 {
    color: #ffd700;
    font-weight: bold;
}
.rank-2 {
    color: #c0c0c0;
    font-weight: bold;
}
.rank-3 {
    color: #cd7f32;
    font-weight: bold;
}
"""
# Combine with any existing CSS
custom_css = enhanced_css + custom_css

# --- Gradio App Definition ---
demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft())
with demo:
    # Enhanced header with timestamp
    gr.HTML(f"""
        <div class="leaderboard-header">
            <div>
                <div class="leaderboard-title">
                    <span class="logo">🏆</span> MLE-Dojo Benchmark Leaderboard
                </div>
                <div class="leaderboard-subtitle">
                    Comprehensive evaluation of AI models across multiple domains
                </div>
            </div>
            <div class="timestamp">
                Last updated: {last_updated}
            </div>
        </div>
    """)

    # Introduction with enhanced styling
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Column():
                gr.HTML("""
                    <h2 style="display: flex; align-items: center; gap: 10px;">
                        <span style="font-size: 1.3em;">📊</span> Model Performance Rankings
                    </h2>
                    <p class="leaderboard-subtitle">Select a category to view specialized performance metrics</p>
                """)

                # Enhanced category selector
                category_selector = gr.Radio(
                    choices=[x[0] for x in CATEGORIES],
                    label="Select Performance Domain:",
                    value="🏆 Overall",
                    interactive=True,
                    elem_classes="fancy-radio"
                )

                # Visual separator
                gr.HTML('<div style="height: 1px; background-color: #e0e0e0; margin: 20px 0;"></div>')

                # Enhanced leaderboard table
                leaderboard_df_component = gr.Dataframe(
                    value=update_leaderboard(DEFAULT_CATEGORY),
                    headers=["Rank", "Model", "Organization", "License", f"Elo Score ({DEFAULT_CATEGORY})"],
                    datatype=["html", "html", "str", "str", "html"],
                    interactive=False,
                    row_count=(len(master_df), "fixed"),
                    col_count=(5, "fixed"),
                    wrap=True,
                    elem_id="leaderboard-table",
                )
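                # Note: the "html" entries in `datatype` are what let the Model and
                # Elo Score cells render the generated markup instead of raw strings.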
                # Stats cards (visual enhancement)
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.HTML(f"""
                            <div style="background-color: #f0f5ff; padding: 20px; border-radius: 12px; text-align: center;">
                                <div style="font-size: 2em;">📊</div>
                                <div style="font-size: 2em; font-weight: bold; color: #1a5fb4;">{len(master_df)}</div>
                                <div style="font-size: 1.1em; color: #666;">Models Evaluated</div>
                            </div>
                        """)
                    with gr.Column(scale=1):
                        gr.HTML(f"""
                            <div style="background-color: #e6f7ef; padding: 20px; border-radius: 12px; text-align: center;">
                                <div style="font-size: 2em;">🌐</div>
                                <div style="font-size: 2em; font-weight: bold; color: #00875a;">{master_df['organizer'].nunique()}</div>
                                <div style="font-size: 1.1em; color: #666;">Organizations</div>
                            </div>
                        """)
                    with gr.Column(scale=1):
                        gr.HTML(f"""
                            <div style="background-color: #fff8e0; padding: 20px; border-radius: 12px; text-align: center;">
                                <div style="font-size: 2em;">💠</div>
                                <div style="font-size: 2em; font-weight: bold; color: #b58a00;">{len(CATEGORIES)}</div>
                                <div style="font-size: 1.1em; color: #666;">Performance Domains</div>
                            </div>
                        """)

            # Link the radio button change to the update function
            category_selector.change(
                fn=update_leaderboard,
                inputs=category_selector,
                outputs=leaderboard_df_component
            )
        with gr.TabItem("📚 About", elem_id="llm-benchmark-tab-about", id=1):
            # Enhanced about section
            gr.HTML("""
                <div class="about-header" style="display: flex; align-items: center; gap: 20px; margin-bottom: 20px;">
                    <div style="font-size: 4em;">🧪</div>
                    <div>
                        <h2 style="margin: 0;">About the MLE-Dojo Benchmark</h2>
                        <p style="margin: 5px 0 0 0; color: #666;">A comprehensive evaluation framework for AI models</p>
                    </div>
                </div>
            """)

            # Use the LLM_BENCHMARKS_TEXT variable
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

            # Add methodology cards for visual enhancement
            with gr.Row():
                with gr.Column():
                    gr.HTML("""
                        <div style="background-color: #f5f7fa; padding: 20px; border-radius: 12px; height: 100%;">
                            <div style="font-size: 2em; text-align: center; margin-bottom: 15px;">💡</div>
                            <h3 style="text-align: center; margin-top: 0;">MLE-Lite</h3>
                            <p>Evaluates a model's ability to handle basic machine learning engineering tasks including
                            data preprocessing, feature engineering, model selection, and basic deployment.</p>
                        </div>
                    """)
                with gr.Column():
                    gr.HTML("""
                        <div style="background-color: #f5f7fa; padding: 20px; border-radius: 12px; height: 100%;">
                            <div style="font-size: 2em; text-align: center; margin-bottom: 15px;">📊</div>
                            <h3 style="text-align: center; margin-top: 0;">Tabular</h3>
                            <p>Tests a model's ability to process, analyze, and model structured data, including
                            statistical analysis, predictive modeling, and data visualization with tabular datasets.</p>
                        </div>
                    """)
            with gr.Row():
                with gr.Column():
                    gr.HTML("""
                        <div style="background-color: #f5f7fa; padding: 20px; border-radius: 12px; height: 100%;">
                            <div style="font-size: 2em; text-align: center; margin-bottom: 15px;">📝</div>
                            <h3 style="text-align: center; margin-top: 0;">NLP</h3>
                            <p>Evaluates natural language processing capabilities including text classification,
                            sentiment analysis, entity recognition, text generation, and language understanding.</p>
                        </div>
                    """)
                with gr.Column():
                    gr.HTML("""
                        <div style="background-color: #f5f7fa; padding: 20px; border-radius: 12px; height: 100%;">
                            <div style="font-size: 2em; text-align: center; margin-bottom: 15px;">👁️</div>
                            <h3 style="text-align: center; margin-top: 0;">CV</h3>
                            <p>Tests computer vision capabilities including image classification, object detection,
                            image generation, and visual understanding tasks across various domains.</p>
                        </div>
                    """)
        # Optional: Uncomment if you want to re-enable the Submit tab
        # with gr.TabItem("🚀 Submit Model", elem_id="llm-benchmark-tab-submit", id=2):
        #     with gr.Column():
        #         gr.HTML("""
        #             <div class="about-header" style="display: flex; align-items: center; gap: 20px; margin-bottom: 20px;">
        #                 <div style="font-size: 4em;">🚀</div>
        #                 <div>
        #                     <h2 style="margin: 0;">Submit Your Model for Evaluation</h2>
        #                     <p style="margin: 5px 0 0 0; color: #666;">Add your model to the MLE-Dojo leaderboard</p>
        #                 </div>
        #             </div>
        #         """)
        #
        #         with gr.Row():
        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
        #
        #         with gr.Column():
        #             with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
        #                 finished_eval_table = gr.components.Dataframe(
        #                     value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                 )
        #             with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
        #                 running_eval_table = gr.components.Dataframe(
        #                     value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                 )
        #             with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
        #                 pending_eval_table = gr.components.Dataframe(
        #                     value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                 )
        #
        #         gr.HTML('<div style="height: 1px; background-color: #e0e0e0; margin: 20px 0;"></div>')
        #
        #         gr.HTML("""
        #             <h2 style="display: flex; align-items: center; gap: 10px;">
        #                 <span style="font-size: 1.3em;">📝</span> Model Submission Form
        #             </h2>
        #         """)
        #
        #         with gr.Row():
        #             with gr.Column():
        #                 model_name_textbox = gr.Textbox(
        #                     label="Model Name (on Hugging Face Hub)",
        #                     placeholder="Enter your model name...",
        #                     elem_classes="enhanced-input"
        #                 )
        #                 revision_name_textbox = gr.Textbox(
        #                     label="Revision / Commit Hash",
        #                     placeholder="main",
        #                     elem_classes="enhanced-input"
        #                 )
        #                 model_type = gr.Dropdown(
        #                     choices=["Type A", "Type B", "Type C"],
        #                     label="Model Type",
        #                     multiselect=False,
        #                     value=None,
        #                     interactive=True,
        #                     elem_classes="enhanced-dropdown"
        #                 )
        #             with gr.Column():
        #                 precision = gr.Dropdown(
        #                     choices=["float16", "bfloat16", "float32", "int8", "auto"],
        #                     label="Precision",
        #                     multiselect=False,
        #                     value="auto",
        #                     interactive=True,
        #                     elem_classes="enhanced-dropdown"
        #                 )
        #                 weight_type = gr.Dropdown(
        #                     choices=["Original", "Adapter", "Delta"],
        #                     label="Weights Type",
        #                     multiselect=False,
        #                     value="Original",
        #                     interactive=True,
        #                     elem_classes="enhanced-dropdown"
        #                 )
        #                 base_model_name_textbox = gr.Textbox(
        #                     label="Base Model (for delta or adapter weights)",
        #                     placeholder="Only needed for adapter/delta weights",
        #                     elem_classes="enhanced-input"
        #                 )
        #
        #         submit_button = gr.Button(
        #             "Submit for Evaluation",
        #             elem_classes="primary-button"
        #         )
        #         submission_result = gr.Markdown()
        #         submit_button.click(
        #             add_new_eval,
        #             [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type],
        #             submission_result,
        #         )
    # Enhanced citation section
    with gr.Accordion("📄 Citation", open=False, elem_classes="citation-accordion"):
        gr.HTML("""
            <div style="display: flex; align-items: center; gap: 20px; margin-bottom: 15px;">
                <div style="font-size: 2.5em;">📄</div>
                <div>
                    <h3 style="margin: 0;">How to Cite This Benchmark</h3>
                    <p style="margin: 5px 0 0 0; color: #666;">Please use the following citation if you use this benchmark in your research</p>
                </div>
            </div>
        """)
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=10,
            elem_id="citation-button",
            show_copy_button=True,
        )

    # Footer
    gr.HTML("""
        <div class="footer">
            <p>© 2025 MLE-Dojo Benchmark. All rights reserved.</p>
            <p style="margin-top: 5px; display: flex; justify-content: center; gap: 20px;">
                <a href="#" style="color: #1a5fb4; text-decoration: none;">Privacy Policy</a>
                <a href="#" style="color: #1a5fb4; text-decoration: none;">Terms of Service</a>
                <a href="#" style="color: #1a5fb4; text-decoration: none;">Contact Us</a>
            </p>
        </div>
    """)
# --- Keep scheduler if relevant ---
if __name__ == "__main__":
    try:
        scheduler = BackgroundScheduler()
        if callable(restart_space):
            if REPO_ID and REPO_ID != "your/space-id":
                scheduler.add_job(restart_space, "interval", seconds=1800)  # Restart every 30 mins
                scheduler.start()
            else:
                print("Warning: REPO_ID not set or is placeholder; space restart job not scheduled.")
        else:
            print("Warning: restart_space function not available; space restart job not scheduled.")
    except Exception as e:
        print(f"Failed to initialize or start scheduler: {e}")

    # --- Launch the app ---
    print("Launching Enhanced Gradio App...")
    demo.launch()