import gradio as gr import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler from datetime import datetime # --- Make sure these imports work relative to your file structure --- # Option 1: If src is a directory in the same folder as your script: try: from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.envs import REPO_ID from src.submission.submit import add_new_eval print("Successfully imported from src module.") # Option 2: If you don't have these files, define placeholders except ImportError: print("Warning: Using placeholder values because src module imports failed.") CITATION_BUTTON_LABEL = "Citation" CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark..." EVALUATION_QUEUE_TEXT = "Current evaluation queue:" INTRODUCTION_TEXT = """ # Welcome to the MLE-Dojo Benchmark Leaderboard This leaderboard tracks the performance of various AI models across multiple machine learning engineering domains. Our comprehensive evaluation system uses ELO ratings to provide a fair comparison between different models. ## How to read this leaderboard - Select a domain category to view specialized rankings - Higher ELO scores indicate better performance - Click on any model name to learn more about it """ LLM_BENCHMARKS_TEXT = """ # About the MLE-Dojo Benchmark ## Evaluation Methodology The MLE-Dojo benchmark evaluates models across various domains including: - **MLE-Lite**: Basic machine learning engineering tasks - **Tabular**: Data manipulation, analysis, and modeling with structured data - **NLP**: Natural language processing tasks including classification, generation, and understanding - **CV**: Computer vision tasks including image classification, object detection, and generation Our evaluation uses a sophisticated ELO rating system that considers the relative performance of models against each other. ## Contact For more information or to submit your model, please contact us at contact@mle-dojo.example """ TITLE = "

๐Ÿ† MLE-Dojo Benchmark Leaderboard

" custom_css = "" REPO_ID = "your/space-id" def add_new_eval(*args): return "Submission placeholder." # --- Elo Leaderboard Configuration --- # Enhanced data with Rank (placeholder), Organizer, License, and URL data = [ {'model_name': 'gpt-4o-mini', 'url': 'https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778}, {'model_name': 'gpt-4o', 'url': 'https://openai.com/index/hello-gpt-4o/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841}, {'model_name': 'o3-mini', 'url': 'https://openai.com/index/openai-o3-mini/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096}, {'model_name': 'deepseek-v3', 'url': 'https://api-docs.deepseek.com/news/news1226', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023}, {'model_name': 'deepseek-r1', 'url': 'https://api-docs.deepseek.com/news/news250120', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100}, {'model_name': 'gemini-2.0-flash', 'url': 'https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895}, {'model_name': 'gemini-2.0-pro', 'url': 'https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054}, {'model_name': 'gemini-2.5-pro', 'url': 'https://deepmind.google/technologies/gemini/pro/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214}, ] # Add organization logos (for visual enhancement) org_logos = { 'OpenAI': '๐Ÿ“ฑ', # You can replace these with actual icon URLs in production 'DeepSeek': '๐Ÿ”', 'Google': '๐ŸŒ', 'Default': '๐Ÿค–' } # Create a master DataFrame master_df = pd.DataFrame(data) # Add last updated timestamp last_updated = datetime.now().strftime("%B %d, %Y at %H:%M:%S") # Define categories with fancy icons CATEGORIES = [ ("๐Ÿ† Overall", "Overall"), ("๐Ÿ’ก MLE-Lite", "MLE-Lite"), ("๐Ÿ“Š Tabular", "Tabular"), ("๐Ÿ“ NLP", "NLP"), ("๐Ÿ‘๏ธ CV", "CV") ] DEFAULT_CATEGORY = "Overall" # Map user-facing categories to DataFrame column names category_to_column = { "MLE-Lite": "MLE-Lite_Elo", "Tabular": "Tabular_Elo", "NLP": "NLP_Elo", "CV": "CV_Elo", "Overall": "Overall" } # --- Helper function to update leaderboard --- def update_leaderboard(category_label): """ Enhanced function to update the leaderboard with visual improvements """ # Extract the category value from the label if it's a tuple (icon, value) if isinstance(category_label, tuple): category = category_label[1] else: # For backward compatibility or direct values category = category_label.split(" ")[-1] if " " in category_label else category_label score_column = category_to_column.get(category) if score_column is None or score_column not in master_df.columns: print(f"Warning: Invalid category '{category}' or column '{score_column}'. 
Falling back to default.") score_column = category_to_column[DEFAULT_CATEGORY] if score_column not in master_df.columns: print(f"Error: Default column '{score_column}' also not found.") return pd.DataFrame({ "Rank": [], "Model": [], "Organizer": [], "License": [], "Elo Score": [] }) # Select base columns + the score column for sorting cols_to_select = ['model_name', 'url', 'organizer', 'license', score_column] df = master_df[cols_to_select].copy() # Sort by the selected 'Elo Score' descending df.sort_values(by=score_column, ascending=False, inplace=True) # Add Rank with medal emojis for top 3 df.reset_index(drop=True, inplace=True) # Create fancy rank with medals for top positions def get_rank_display(idx): if idx == 0: return "๐Ÿฅ‡ 1" elif idx == 1: return "๐Ÿฅˆ 2" elif idx == 2: return "๐Ÿฅ‰ 3" else: return f"{idx + 1}" df.insert(0, 'Rank', df.index.map(get_rank_display)) # Add organization icons to model names df['Model'] = df.apply( lambda row: f"""
<a href="{row['url']}" target="_blank" style="text-decoration: none; color: inherit;">{org_logos.get(row['organizer'], org_logos['Default'])} {row['model_name']}</a>
""", axis=1 ) # Format Elo scores with visual indicators df['Elo Display'] = df[score_column].apply( lambda score: f"""
{score}
""" ) # Rename columns for display df.rename(columns={score_column: 'Elo Score'}, inplace=True) df.rename(columns={'organizer': 'Organizer', 'license': 'License'}, inplace=True) # Select and reorder columns for final display final_columns = ["Rank", "Model", "Organizer", "License", "Elo Display"] df = df[final_columns] # Rename for display df.columns = ["Rank", "Model", "Organization", "License", f"Elo Score ({category})"] return df # --- Mock/Placeholder functions/data for other tabs --- print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.") finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"]) running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"]) pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"]) EVAL_COLS = ["Model", "Status", "Requested", "Started"] EVAL_TYPES = ["str", "str", "str", "str"] # --- Keep restart function if relevant --- def restart_space(): print(f"Attempting to restart space: {REPO_ID}") # Replace with your actual space restart mechanism if needed # --- Enhanced CSS for beauty and readability --- enhanced_css = """ /* Base styling */ :root { --primary-color: #1a5fb4; --secondary-color: #2ec27e; --accent-color: #e5a50a; --warning-color: #ff7800; --text-color: #333333; --background-color: #ffffff; --card-background: #f9f9f9; --border-color: #e0e0e0; --shadow-color: rgba(0, 0, 0, 0.1); } /* Typography */ body, .gradio-container { font-family: 'Inter', 'Segoe UI', Roboto, -apple-system, BlinkMacSystemFont, system-ui, sans-serif !important; font-size: 16px !important; line-height: 1.6 !important; color: var(--text-color) !important; background-color: var(--background-color) !important; } /* Headings */ h1 { font-size: 2.5rem !important; font-weight: 700 !important; margin-bottom: 1.5rem !important; color: var(--primary-color) !important; text-align: center !important; letter-spacing: -0.02em !important; line-height: 1.2 !important; } h2 { font-size: 1.8rem !important; font-weight: 600 !important; margin-top: 1.5rem !important; margin-bottom: 1rem !important; color: var(--primary-color) !important; letter-spacing: -0.01em !important; } h3 { font-size: 1.4rem !important; font-weight: 600 !important; margin-top: 1.2rem !important; margin-bottom: 0.8rem !important; color: var(--text-color) !important; } /* Tabs styling */ .tabs { margin-top: 1rem !important; border-radius: 12px !important; overflow: hidden !important; box-shadow: 0 4px 12px var(--shadow-color) !important; } .tab-nav button { font-size: 1.1rem !important; font-weight: 500 !important; padding: 0.8rem 1.5rem !important; border-radius: 0 !important; transition: all 0.2s ease !important; } .tab-nav button.selected { background-color: var(--primary-color) !important; color: white !important; font-weight: 600 !important; } /* Card styling */ .gradio-container .gr-box, .gradio-container .gr-panel { border-radius: 12px !important; border: 1px solid var(--border-color) !important; box-shadow: 0 4px 12px var(--shadow-color) !important; overflow: hidden !important; } /* Table styling */ table { width: 100% !important; border-collapse: separate !important; border-spacing: 0 !important; margin: 1.5rem 0 !important; border-radius: 8px !important; overflow: hidden !important; box-shadow: 0 4px 12px var(--shadow-color) !important; } th { background-color: #f0f5ff !important; color: var(--primary-color) !important; font-weight: 600 !important; padding: 1rem !important; font-size: 
1.1rem !important; text-align: left !important; border-bottom: 2px solid var(--primary-color) !important; } td { padding: 1rem !important; border-bottom: 1px solid var(--border-color) !important; font-size: 1rem !important; vertical-align: middle !important; } tr:nth-child(even) { background-color: #f8fafd !important; } tr:hover { background-color: #edf2fb !important; } tr:first-child td { border-top: none !important; } /* Button styling */ button.primary, .gr-button.primary { background-color: var(--primary-color) !important; color: white !important; font-weight: 500 !important; padding: 0.8rem 1.5rem !important; border-radius: 8px !important; border: none !important; cursor: pointer !important; transition: all 0.2s ease !important; box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1) !important; } button.primary:hover, .gr-button.primary:hover { background-color: #0b4a9e !important; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important; transform: translateY(-1px) !important; } /* Radio buttons */ .gr-radio { display: flex !important; flex-wrap: wrap !important; gap: 10px !important; margin: 1rem 0 !important; } .gr-radio label { background-color: #f5f7fa !important; border: 1px solid var(--border-color) !important; border-radius: 8px !important; padding: 0.7rem 1.2rem !important; font-size: 1rem !important; font-weight: 500 !important; cursor: pointer !important; transition: all 0.2s ease !important; display: flex !important; align-items: center !important; gap: 8px !important; } .gr-radio label:hover { background-color: #eaeef3 !important; border-color: #c0c9d6 !important; } .gr-radio label.selected { background-color: #e0e9f7 !important; border-color: var(--primary-color) !important; color: var(--primary-color) !important; font-weight: 600 !important; } /* Input fields */ input, textarea, select { font-size: 1rem !important; padding: 0.8rem !important; border-radius: 8px !important; border: 1px solid var(--border-color) !important; transition: all 0.2s ease !important; } input:focus, textarea:focus, select:focus { border-color: var(--primary-color) !important; box-shadow: 0 0 0 2px rgba(26, 95, 180, 0.2) !important; outline: none !important; } /* Accordion styling */ .gr-accordion { border-radius: 8px !important; overflow: hidden !important; margin: 1rem 0 !important; border: 1px solid var(--border-color) !important; } .gr-accordion-header { padding: 1rem !important; background-color: #f5f7fa !important; font-weight: 600 !important; font-size: 1.1rem !important; color: var(--text-color) !important; } .gr-accordion-content { padding: 1rem !important; background-color: white !important; } /* Markdown text improvements */ .markdown-text { font-size: 1.05rem !important; line-height: 1.7 !important; } .markdown-text p { margin-bottom: 1rem !important; } .markdown-text ul, .markdown-text ol { margin-left: 1.5rem !important; margin-bottom: 1rem !important; } .markdown-text li { margin-bottom: 0.5rem !important; } .markdown-text strong { font-weight: 600 !important; color: #333 !important; } /* Status indicators */ .status-badge { display: inline-block; padding: 0.3rem 0.7rem; border-radius: 99px; font-size: 0.85rem; font-weight: 500; text-align: center; } .status-pending { background-color: #fff8e0; color: #b58a00; border: 1px solid #ffd74d; } .status-running { background-color: #e0f2ff; color: #0066cc; border: 1px solid #66b3ff; } .status-completed { background-color: #e6f7ef; color: #00875a; border: 1px solid #57d9a3; } /* Footer */ .footer { margin-top: 2rem; padding: 1rem; text-align: center; 
font-size: 0.9rem; color: #666; border-top: 1px solid var(--border-color); } /* Enhanced leaderboard title */ .leaderboard-header { display: flex; align-items: center; justify-content: space-between; margin-bottom: 1.5rem; padding-bottom: 1rem; border-bottom: 2px solid var(--border-color); } .leaderboard-title { font-size: 2.2rem; font-weight: 700; color: var(--primary-color); margin: 0; display: flex; align-items: center; gap: 0.5rem; } .leaderboard-subtitle { font-size: 1.1rem; color: #666; margin-top: 0.5rem; } .timestamp { font-size: 0.85rem; color: #666; font-style: italic; } /* Category selector buttons */ .category-buttons { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 1.5rem; } .category-button { padding: 0.7rem 1.2rem; background-color: #f0f5ff; border: 1px solid #d0e0ff; border-radius: 8px; font-weight: 500; cursor: pointer; transition: all 0.2s ease; display: flex; align-items: center; gap: 8px; } .category-button:hover { background-color: #e0ebff; border-color: #b0d0ff; } .category-button.active { background-color: var(--primary-color); color: white; border-color: var(--primary-color); } /* Logo and brand styling */ .logo { font-size: 2.5em; margin-right: 0.5rem; } /* Medal styling for top ranks */ .rank-1 { color: #ffd700; font-weight: bold; } .rank-2 { color: #c0c0c0; font-weight: bold; } .rank-3 { color: #cd7f32; font-weight: bold; } """ # Combine with any existing CSS custom_css = enhanced_css + custom_css # --- Gradio App Definition --- demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft()) with demo: # Enhanced header with timestamp gr.HTML(f"""
        <div class="leaderboard-header">
            <div>
                <h1 class="leaderboard-title"><span class="logo">🏆</span> MLE-Dojo Benchmark Leaderboard</h1>
                <p class="leaderboard-subtitle">Comprehensive evaluation of AI models across multiple domains</p>
            </div>
            <p class="timestamp">Last updated: {last_updated}</p>
        </div>
""") # Introduction with enhanced styling gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("๐Ÿ“Š Leaderboard", elem_id="llm-benchmark-tab-table", id=0): with gr.Column(): gr.HTML("""

                    <h2>📈 Model Performance Rankings</h2>
                    <p>Select a category to view specialized performance metrics</p>

""") # Enhanced category selector category_selector = gr.Radio( choices=[x[0] for x in CATEGORIES], label="Select Performance Domain:", value="๐Ÿ† Overall", interactive=True, elem_classes="fancy-radio" ) # Visual separator gr.HTML('
') # Enhanced leaderboard table leaderboard_df_component = gr.Dataframe( value=update_leaderboard(DEFAULT_CATEGORY), headers=["Rank", "Model", "Organization", "License", f"Elo Score ({DEFAULT_CATEGORY})"], datatype=["html", "html", "str", "str", "html"], interactive=False, row_count=(len(master_df), "fixed"), col_count=(5, "fixed"), wrap=True, elem_id="leaderboard-table", ) # Stats cards (visual enhancement) with gr.Row(): with gr.Column(scale=1): gr.HTML(f"""
                        <div style="text-align: center; padding: 1rem;">
                            <div style="font-size: 2rem;">🔍</div>
                            <div style="font-size: 1.6rem; font-weight: 700;">{len(master_df)}</div>
                            <div>Models Evaluated</div>
                        </div>
""") with gr.Column(scale=1): gr.HTML(f"""
                        <div style="text-align: center; padding: 1rem;">
                            <div style="font-size: 2rem;">🌐</div>
                            <div style="font-size: 1.6rem; font-weight: 700;">{master_df['organizer'].nunique()}</div>
                            <div>Organizations</div>
                        </div>
""") with gr.Column(scale=1): gr.HTML(f"""
                        <div style="text-align: center; padding: 1rem;">
                            <div style="font-size: 2rem;">🏅</div>
                            <div style="font-size: 1.6rem; font-weight: 700;">{len(CATEGORIES)}</div>
                            <div>Performance Domains</div>
                        </div>
""") # Link the radio button change to the update function category_selector.change( fn=update_leaderboard, inputs=category_selector, outputs=leaderboard_df_component ) with gr.TabItem("๐Ÿ“š About", elem_id="llm-benchmark-tab-about", id=1): # Enhanced about section gr.HTML("""
                <div style="text-align: center; margin-bottom: 1.5rem;">
                    <div class="logo">🧪</div>
                    <h2>About the MLE-Dojo Benchmark</h2>
                    <p>A comprehensive evaluation framework for AI models</p>
                </div>
""") # Use the LLM_BENCHMARKS_TEXT variable gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") # Add methodology cards for visual enhancement with gr.Row(): with gr.Column(): gr.HTML("""
                        <div style="padding: 1rem;">
                            <div style="font-size: 2rem;">💡</div>
                            <h3>MLE-Lite</h3>
                            <p>Evaluates a model's ability to handle basic machine learning engineering tasks, including data preprocessing, feature engineering, model selection, and basic deployment.</p>
                        </div>
""") with gr.Column(): gr.HTML("""
                        <div style="padding: 1rem;">
                            <div style="font-size: 2rem;">📊</div>
                            <h3>Tabular</h3>
                            <p>Tests a model's ability to process, analyze, and model structured data, including statistical analysis, predictive modeling, and data visualization with tabular datasets.</p>
                        </div>
""") with gr.Row(): with gr.Column(): gr.HTML("""
                        <div style="padding: 1rem;">
                            <div style="font-size: 2rem;">📝</div>
                            <h3>NLP</h3>
                            <p>Evaluates natural language processing capabilities, including text classification, sentiment analysis, entity recognition, text generation, and language understanding.</p>
                        </div>
""") with gr.Column(): gr.HTML("""
                        <div style="padding: 1rem;">
                            <div style="font-size: 2rem;">👁️</div>
                            <h3>CV</h3>
                            <p>Tests computer vision capabilities, including image classification, object detection, image generation, and visual understanding tasks across various domains.</p>
                        </div>
""") # Optional: Uncomment if you want to re-enable the Submit tab # with gr.TabItem("๐Ÿš€ Submit Model", elem_id="llm-benchmark-tab-submit", id=2): # with gr.Column(): # gr.HTML(""" #
#                 <div style="text-align: center; margin-bottom: 1.5rem;">
#                     <div class="logo">🚀</div>
#                     <h2>Submit Your Model for Evaluation</h2>
#                     <p>Add your model to the MLE-Dojo leaderboard</p>
#                 </div>
# """) # # with gr.Row(): # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") # # with gr.Column(): # with gr.Accordion(f"โœ… Finished Evaluations ({len(finished_eval_queue_df)})", open=False): # finished_eval_table = gr.components.Dataframe( # value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, # ) # with gr.Accordion(f"๐Ÿ”„ Running Evaluation Queue ({len(running_eval_queue_df)})", open=False): # running_eval_table = gr.components.Dataframe( # value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, # ) # with gr.Accordion(f"โณ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False): # pending_eval_table = gr.components.Dataframe( # value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, # ) # # gr.HTML('
') # # gr.HTML(""" #

#                 <h3>📝 Model Submission Form</h3>

# """) # # with gr.Row(): # with gr.Column(): # model_name_textbox = gr.Textbox( # label="Model Name (on Hugging Face Hub)", # placeholder="Enter your model name...", # elem_classes="enhanced-input" # ) # revision_name_textbox = gr.Textbox( # label="Revision / Commit Hash", # placeholder="main", # elem_classes="enhanced-input" # ) # model_type = gr.Dropdown( # choices=["Type A", "Type B", "Type C"], # label="Model Type", # multiselect=False, # value=None, # interactive=True, # elem_classes="enhanced-dropdown" # ) # with gr.Column(): # precision = gr.Dropdown( # choices=["float16", "bfloat16", "float32", "int8", "auto"], # label="Precision", # multiselect=False, # value="auto", # interactive=True, # elem_classes="enhanced-dropdown" # ) # weight_type = gr.Dropdown( # choices=["Original", "Adapter", "Delta"], # label="Weights Type", # multiselect=False, # value="Original", # interactive=True, # elem_classes="enhanced-dropdown" # ) # base_model_name_textbox = gr.Textbox( # label="Base Model (for delta or adapter weights)", # placeholder="Only needed for adapter/delta weights", # elem_classes="enhanced-input" # ) # # submit_button = gr.Button( # "Submit for Evaluation", # elem_classes="primary-button" # ) # submission_result = gr.Markdown() # submit_button.click( # add_new_eval, # [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type], # submission_result, # ) # Enhanced citation section with gr.Accordion("๐Ÿ“„ Citation", open=False, elem_classes="citation-accordion"): gr.HTML("""
                <div style="text-align: center; margin-bottom: 1rem;">
                    <div style="font-size: 2rem;">📄</div>
                    <h3>How to Cite This Benchmark</h3>
                    <p>Please use the following citation if you use this benchmark in your research.</p>
                </div>
""") citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=10, elem_id="citation-button", show_copy_button=True, ) # Footer gr.HTML(""" """) # --- Keep scheduler if relevant --- if __name__ == "__main__": try: scheduler = BackgroundScheduler() if callable(restart_space): if REPO_ID and REPO_ID != "your/space-id": scheduler.add_job(restart_space, "interval", seconds=1800) # Restart every 30 mins scheduler.start() else: print("Warning: REPO_ID not set or is placeholder; space restart job not scheduled.") else: print("Warning: restart_space function not available; space restart job not scheduled.") except Exception as e: print(f"Failed to initialize or start scheduler: {e}") # --- Launch the app --- if __name__ == "__main__": print("Launching Enhanced Gradio App...") demo.launch()