Spaces:

MLE-Dojo
/

Leaderboard

Running

App Files Files Community

Leaderboard / app.py

Jerrycool

Update app.py

39c3577 verified 4 months ago

raw

history blame

23.9 kB

	import gradio as gr
	import pandas as pd
	from apscheduler.schedulers.background import BackgroundScheduler
	# Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard

	# --- Load shared text/config from the src package, else use placeholders ---
	try:
	    from src.about import (
	        CITATION_BUTTON_LABEL,
	        CITATION_BUTTON_TEXT,
	        EVALUATION_QUEUE_TEXT,  # only needed by the commented-out submit tab
	        INTRODUCTION_TEXT,
	        LLM_BENCHMARKS_TEXT,
	        TITLE,
	    )
	    from src.display.css_html_js import custom_css  # base stylesheet; extra rules are appended further down
	    from src.envs import REPO_ID  # read by restart_space and the scheduler guard
	    from src.submission.submit import add_new_eval  # handler for the commented-out submit tab
	except ImportError:
	    # The src package is unavailable: define stand-ins for every imported name.
	    print("Warning: Using placeholder values because src module imports failed.")
	    CITATION_BUTTON_LABEL = "Citation"
	    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark...\n[Your BibTeX entry here]"
	    EVALUATION_QUEUE_TEXT = "Current evaluation queue:"
	    INTRODUCTION_TEXT = """
	Welcome to the MLE-Dojo Benchmark Leaderboard. Select a category below to see the rankings.
	Models are ranked based on their Elo scores across various machine learning tasks.
	"""
	    LLM_BENCHMARKS_TEXT = """
	## About the Benchmarks

	This leaderboard tracks the performance of various models on the MLE-Dojo benchmark suite.
	The suite includes tasks covering:

	* MLE-Lite: Lightweight ML tasks.
	* Tabular: Tasks involving structured data.
	* NLP: Natural Language Processing tasks.
	* CV: Computer Vision tasks.

	Scores are calculated using an Elo rating system. Higher scores indicate better performance relative to other models in the benchmark.
	"""
	    TITLE = "<h1>🏆 MLE-Dojo Benchmark Leaderboard</h1>"
	    custom_css = ""  # nothing to build on; the enhanced rules are appended later
	    REPO_ID = "your/space-id"  # sentinel value; the scheduler guard treats it as "not configured"

	    def add_new_eval(*args):
	        """Stand-in submit handler: ignore all arguments and return a fixed message."""
	        return "Submission placeholder."

	    print("Placeholder function 'add_new_eval' defined.")
	else:
	    print("Successfully imported from src module.")
	    # The imported stylesheet may be None or another non-string; normalise it
	    # so that later string concatenation is safe.
	    if not isinstance(custom_css, str):
	        custom_css = ""
	# --- End of import / placeholder section ---


	# --- Elo leaderboard configuration ---
	# Field names shared by every leaderboard entry, in DataFrame column order.
	_FIELDS = (
	    'model_name', 'url', 'organizer', 'license',
	    'MLE-Lite_Elo', 'Tabular_Elo', 'NLP_Elo', 'CV_Elo', 'Overall',
	)

	# One tuple per model, positionally aligned with _FIELDS.
	_ROWS = [
	    ('gpt-4o-mini', 'https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/', 'OpenAI', 'Proprietary', 753, 839, 758, 754, 778),
	    ('gpt-4o', 'https://openai.com/index/hello-gpt-4o/', 'OpenAI', 'Proprietary', 830, 861, 903, 761, 841),
	    ('o3-mini', 'https://openai.com/index/openai-o3-mini/', 'OpenAI', 'Proprietary', 1108, 1019, 1056, 1207, 1096),
	    ('deepseek-v3', 'https://api-docs.deepseek.com/news/news1226', 'DeepSeek', 'DeepSeek', 1004, 1015, 1028, 1067, 1023),
	    ('deepseek-r1', 'https://api-docs.deepseek.com/news/news250120', 'DeepSeek', 'DeepSeek', 1137, 1053, 1103, 1083, 1100),
	    ('gemini-2.0-flash', 'https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash', 'Google', 'Proprietary', 847, 923, 860, 978, 895),
	    ('gemini-2.0-pro', 'https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/', 'Google', 'Proprietary', 1064, 1139, 1028, 973, 1054),
	    ('gemini-2.5-pro', 'https://deepmind.google/technologies/gemini/pro/', 'Google', 'Proprietary', 1257, 1150, 1266, 1177, 1214),
	]

	# Expand the compact rows into the list-of-dicts form used to build the frame.
	data = [dict(zip(_FIELDS, row)) for row in _ROWS]

	# Master table holding every model and every score column.
	master_df = pd.DataFrame(data)

	# User-facing category names, in the order they are offered ("Overall" first).
	CATEGORIES = ["Overall", "MLE-Lite", "Tabular", "NLP", "CV"]
	# Category selected when the page first loads.
	DEFAULT_CATEGORY = "Overall"

	# User-facing category name -> score column in master_df.
	category_to_column = {
	    "MLE-Lite": "MLE-Lite_Elo",
	    "Tabular": "Tabular_Elo",
	    "NLP": "NLP_Elo",
	    "CV": "CV_Elo",
	    "Overall": "Overall",
	}

	# --- Helper function to update leaderboard ---
	def update_leaderboard(category):
	    """Build the ranked leaderboard table for one benchmark category.

	    Args:
	        category: User-facing category name (a key of ``category_to_column``).
	            An unknown name, or one whose score column is missing from
	            ``master_df``, falls back to ``DEFAULT_CATEGORY`` after printing
	            a warning.

	    Returns:
	        A ``pandas.DataFrame`` with the columns ``Rank``, ``Model``,
	        ``Organizer``, ``License`` and ``Elo Score`` (in that order), sorted
	        by score descending. ``Model`` holds an HTML ``<a>`` link to the
	        model's URL. If even the default score column is missing, an empty
	        frame with the same columns in the same order is returned.
	    """
	    # Local import keeps this block self-contained without touching the
	    # file-level import section.
	    from html import escape

	    # Single source of truth for the output layout, shared by the normal
	    # path and the empty fallback so both always agree on column order.
	    display_columns = ["Rank", "Model", "Organizer", "License", "Elo Score"]
	    link_style = "color: #007bff; text-decoration: none; font-weight: 600;"

	    score_column = category_to_column.get(category)
	    if score_column is None or score_column not in master_df.columns:
	        print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
	        score_column = category_to_column[DEFAULT_CATEGORY]
	        if score_column not in master_df.columns:
	            print(f"Error: Default column '{score_column}' also not found.")
	            return pd.DataFrame({name: [] for name in display_columns})

	    # Work on a copy so master_df is never modified.
	    df = master_df[['model_name', 'url', 'organizer', 'license', score_column]].copy()

	    # A stable sort keeps tied scores in master_df order, so the ranking of
	    # ties does not depend on the sort algorithm's internals.
	    df = df.sort_values(by=score_column, ascending=False, kind="stable")
	    df = df.reset_index(drop=True)
	    df.insert(0, 'Rank', df.index + 1)  # 1-based rank follows the sorted order

	    def _model_link(name, url):
	        """Return an HTML anchor for one model, escaping name and URL."""
	        # quote=True (the default) also escapes "'" which delimits the
	        # attribute values below; a missing URL degrades to a '#' link.
	        href = escape(str(url)) if pd.notna(url) else '#'
	        return (
	            f"<a href='{href}' target='_blank' style='{link_style}'>"
	            f"{escape(str(name))}</a>"
	        )

	    # A list comprehension (rather than a row-wise apply) also behaves
	    # correctly when the frame has no rows.
	    df['Model'] = [
	        _model_link(name, url)
	        for name, url in zip(df['model_name'], df['url'])
	    ]

	    # Map internal column names to the capitalised display headers.
	    df = df.rename(columns={
	        score_column: 'Elo Score',
	        'organizer': 'Organizer',
	        'license': 'License',
	    })

	    # Drop the helper columns and fix the final column order.
	    return df[display_columns]

	# --- Stand-ins for the evaluation-queue tables of the commented-out submit tab ---
	print("Warning: Evaluation queue data fetching is disabled/mocked.")

	# Column names and per-column display types of the queue tables.
	EVAL_COLS = ["Model", "Status", "Requested", "Started"]
	EVAL_TYPES = ["str"] * len(EVAL_COLS)

	# Three independent, empty frames sharing the same column layout.
	finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = (
	    pd.DataFrame(columns=EVAL_COLS) for _ in range(3)
	)

	# --- Space restart hook (stub) ---
	def restart_space():
	    """Announce a restart attempt for the Space identified by REPO_ID.

	    This is a stub: it only prints a message. Plug in a real mechanism
	    (e.g. HfApi().restart_space(REPO_ID)) if restarts are required.
	    """
	    target = REPO_ID
	    print(f"Attempting to restart space: {target}")


	# --- Enhanced CSS ---
	# Append the leaderboard styling to whatever stylesheet is already held in
	# custom_css. The guard keeps the "+=" below safe if custom_css is None or
	# any other non-string value.
	# Fix: the radio "chip" selectors for MuiFormGroup-root and
	# MuiFormControlLabel-root were missing the leading "." and therefore parsed
	# as element-type selectors instead of class selectors.
	if not isinstance(custom_css, str):
	    custom_css = ""

	custom_css += """
	/* --- Import Font --- */
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

	/* --- Global Styles & Font --- */
	body {
	font-family: 'Inter', sans-serif;
	background: linear-gradient(to bottom right, #fdfbfb, #ebedee); /* Subtle gradient */
	color: #333;
	}

	:root {
	--primary-color: #007bff; /* Example primary color */
	--text-color: #333;
	--border-radius: 8px;
	--card-background: rgba(255, 255, 255, 0.8); /* Slightly transparent */
	--shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
	}

	/* Set base font size on html for rem units */
	html {
	font-size: 16px; /* Base font size */
	}

	/* Increase overall text size slightly using rem */
	.gradio-container {
	font-size: 1rem; /* Approx 16px */
	line-height: 1.6;
	}

	/* --- Headings --- */
	h1, .markdown-text h1 {
	font-size: 2.5rem; /* Larger title */
	font-weight: 700;
	color: #2c3e50; /* Darker heading color */
	margin-bottom: 1rem;
	text-align: center;
	padding-top: 1rem;
	}
	h2, .markdown-text h2 {
	font-size: 1.75rem; /* Larger section titles */
	font-weight: 600;
	color: #2c3e50;
	margin-top: 1.5rem;
	margin-bottom: 0.75rem;
	border-bottom: 2px solid var(--primary-color);
	padding-bottom: 0.3rem;
	}

	/* --- Markdown Text Styling --- */
	.markdown-text p, .markdown-text li {
	font-size: 1.05rem; /* Slightly larger paragraph text */
	color: var(--text-color);
	}
	.markdown-text strong {
	font-weight: 600;
	color: #0056b3;
	}

	/* --- Tab Styling --- */
	.tab-buttons > .tabs > button {
	font-size: 1.1rem !important;
	font-weight: 600;
	padding: 12px 20px !important;
	border-radius: var(--border-radius) var(--border-radius) 0 0 !important;
	background-color: #e9ecef !important;
	border-bottom: 2px solid transparent !important;
	transition: all 0.3s ease;
	}
	.tab-buttons > .tabs > button.selected {
	background-color: var(--card-background) !important;
	border-bottom: 2px solid var(--primary-color) !important;
	color: var(--primary-color) !important;
	box-shadow: 0 -2px 5px rgba(0, 0, 0, 0.05);
	}

	/* --- Radio Button "Chips" Styling --- */
	/* Targeting the container for the radio items */
	.gradio-container .styler_radio_ .MuiFormGroup-root {
	display: flex;
	flex-direction: row; /* Arrange horizontally */
	flex-wrap: wrap;
	gap: 10px; /* Space between chips */
	margin-bottom: 1.5rem; /* Space below the chips */
	}

	/* Styling individual radio items as chips */
	.gradio-container .styler_radio_ .MuiFormControlLabel-root {
	background-color: #f8f9fa;
	border: 1px solid #dee2e6;
	padding: 8px 16px; /* Chip padding */
	border-radius: 20px; /* Pill shape */
	cursor: pointer;
	transition: all 0.2s ease-in-out;
	margin: 0 !important; /* Override default margins */
	}

	/* Hide the actual radio button circle */
	.gradio-container .styler_radio_ .MuiRadio-root {
	display: none;
	}

	/* Style for the label text inside the chip */
	.gradio-container .styler_radio_ .MuiFormControlLabel-label {
	font-size: 1rem; /* Chip text size */
	font-weight: 600;
	color: #495057;
	}

	/* Style for the selected chip */
	.gradio-container .styler_radio_ .Mui-checked + .MuiFormControlLabel-label {
	color: white !important; /* Ensure text is readable on selected background */
	}

	.gradio-container .styler_radio_ .Mui-checked .MuiFormControlLabel-label {
	color: white !important; /* Backup selector */
	}

	.gradio-container .styler_radio_ .MuiFormControlLabel-root.Mui-checked, /* This might target the container*/
	.gradio-container .styler_radio_ span.Mui-checked + span { /* Or target based on the checked span */
	/* This seems more complex now, let's try styling the parent container */
	}
	.gradio-container .styler_radio_ label:has(input:checked) {
	background-color: var(--primary-color) !important;
	border-color: var(--primary-color) !important;
	color: white !important; /* Text color for selected */
	box-shadow: 0 2px 4px rgba(0, 123, 255, 0.3);
	}
	/* Apply white text color specifically to the label text when checked */
	.gradio-container .styler_radio_ label:has(input:checked) span {
	color: white !important;
	}


	/* Hover effect for non-selected chips */
	.gradio-container .styler_radio_ label:not(:has(input:checked)):hover {
	background-color: #e9ecef;
	border-color: #adb5bd;
	}


	/* --- Leaderboard Table Styling --- */
	#leaderboard-table {
	background-color: var(--card-background);
	border-radius: var(--border-radius);
	box-shadow: var(--shadow);
	overflow: hidden; /* Ensures rounded corners clip content */
	border-collapse: separate; /* Needed for border-radius on table */
	border-spacing: 0;
	margin-top: 1rem;
	}

	#leaderboard-table th,
	#leaderboard-table td {
	padding: 12px 16px; /* More padding */
	text-align: left;
	font-size: 1rem; /* Table font size */
	border-bottom: 1px solid #eee; /* Lighter border */
	vertical-align: middle; /* Center content vertically */
	white-space: normal; /* Allow wrapping */
	}

	#leaderboard-table th {
	background-color: #f8f9fa; /* Light grey header */
	font-weight: 600;
	color: #495057;
	font-size: 1.05rem;
	border-top: 1px solid #eee; /* Add top border for consistency */
	}

	#leaderboard-table tr:last-child td {
	border-bottom: none; /* Remove bottom border for last row */
	}

	#leaderboard-table tr:nth-child(even) td {
	background-color: rgba(249, 249, 249, 0.7); /* Slightly transparent even rows */
	}

	#leaderboard-table tr:hover td {
	background-color: rgba(233, 233, 233, 0.8); /* Hover effect */
	}

	/* Style for the model link */
	#leaderboard-table td a {
	color: var(--primary-color);
	text-decoration: none;
	font-weight: 600; /* Make model name stand out */
	transition: color 0.2s ease;
	}

	#leaderboard-table td a:hover {
	color: #0056b3; /* Darker blue on hover */
	text-decoration: underline;
	}

	/* Rank column styling */
	#leaderboard-table td:first-child,
	#leaderboard-table th:first-child {
	text-align: center;
	font-weight: 700;
	width: 60px; /* Fixed width for Rank */
	}

	/* Elo Score column styling */
	#leaderboard-table td:last-child,
	#leaderboard-table th:last-child {
	text-align: right;
	font-weight: 600;
	width: 100px; /* Fixed width for Elo Score */
	}


	/* --- Accordion Styling --- */
	.gradio-accordion, .accordion { /* Targeting gradio 4+ */
	border: 1px solid #ddd;
	border-radius: var(--border-radius);
	margin-bottom: 1rem;
	box-shadow: var(--shadow);
	background-color: var(--card-background);
	}
	.gradio-accordion > button, .accordion > button { /* Targeting header button */
	font-size: 1.1rem !important;
	font-weight: 600;
	padding: 12px 15px !important;
	background-color: #f8f9fa !important;
	border-bottom: 1px solid #eee !important;
	}
	.gradio-accordion > button[aria-expanded="true"],
	.accordion > button[aria-expanded="true"] {
	background-color: #f1f3f5 !important;
	}


	/* --- Textbox/Citation Styling --- */
	#citation-button textarea {
	font-family: 'Courier New', Courier, monospace; /* Monospace for code/citation */
	font-size: 0.95rem;
	background-color: #fdfdfd;
	border-radius: var(--border-radius);
	padding: 15px;
	line-height: 1.5;
	border: 1px solid #ccc;
	box-shadow: inset 0 1px 3px rgba(0,0,0,0.06);
	}
	#citation-button button { /* Style copy button */
	font-size: 0.9rem !important;
	padding: 5px 10px !important;
	}

	/* --- General Button Styling (if needed for submit tab) --- */
	.gradio-button, button.gr-button {
	font-size: 1.05rem !important;
	font-weight: 600;
	padding: 10px 20px !important;
	border-radius: var(--border-radius) !important;
	transition: all 0.3s ease !important;
	}

	/* Adjustments for smaller screens if necessary */
	@media (max-width: 768px) {
	html { font-size: 15px; } /* Slightly smaller base font on mobile */
	h1, .markdown-text h1 { font-size: 2rem; }
	h2, .markdown-text h2 { font-size: 1.5rem; }
	#leaderboard-table th, #leaderboard-table td { padding: 8px 10px; font-size: 0.95rem;}
	.tab-buttons > .tabs > button { font-size: 1rem !important; padding: 10px 15px !important;}
	.gradio-container .styler_radio_ .MuiFormControlLabel-root { padding: 6px 12px; }
	.gradio-container .styler_radio_ .MuiFormControlLabel-label { font-size: 0.95rem; }
	}
	"""

	# --- Gradio App Definition ---
	# Build the Blocks container with the combined stylesheet and the Glass theme
	# (blue primary hue, sky secondary hue).
	demo = gr.Blocks(css=custom_css, theme=gr.themes.Glass(primary_hue="blue", secondary_hue="sky"))

	with demo:
	# Page title: TITLE is passed to gr.HTML as-is
	gr.HTML(TITLE)

	# Introductory text; the "markdown-text" class is the hook for the markdown CSS rules
	gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

	# NOTE(review): `tabs` is not referenced again in the visible code.
	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	# Tab 0: the leaderboard itself
	with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
	with gr.Column():
	gr.Markdown("## Select Category to Rank By", elem_classes="markdown-text") # Section heading above the selector
	category_selector = gr.Radio(
	choices=CATEGORIES,
	label="Category:", # Short label shown next to the choices
	value=DEFAULT_CATEGORY,
	interactive=True,
	# "styler_radio_" is a custom hook class; the radio "chip" rules in
	# custom_css are written against it.
	elem_classes="styler_radio_" # CSS hook class
	)
	leaderboard_df_component = gr.Dataframe(
	value=update_leaderboard(DEFAULT_CATEGORY),
	headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
	datatype=["number", "html", "str", "str", "number"],
	interactive=False,
	row_count=(len(master_df), "fixed"), # One row per model in master_df
	col_count=(5, "fixed"),
	wrap=True, # Allow text wrapping in cells
	elem_id="leaderboard-table" # ID targeted by the #leaderboard-table CSS rules
	)
	# Re-run update_leaderboard with the newly selected category and
	# send the resulting DataFrame to the table component.
	category_selector.change(
	fn=update_leaderboard,
	inputs=category_selector,
	outputs=leaderboard_df_component
	)

	with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=1):
	# Benchmark description text
	gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

	# --- Submit Tab (Commented out as in original request) ---
	# Uncomment and ensure necessary variables/functions are available if needed
	# with gr.TabItem("🚀 Submit", elem_id="llm-benchmark-tab-submit", id=2):
	# with gr.Column():
	# with gr.Row():
	# gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
	# with gr.Column():
	# with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
	# finished_eval_table = gr.Dataframe( # Use gr.Dataframe
	# value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
	# )
	# with gr.Accordion(f"🔄 Running Evaluations ({len(running_eval_queue_df)})", open=False):
	# running_eval_table = gr.Dataframe( # Use gr.Dataframe
	# value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
	# )
	# with gr.Accordion(f"⏳ Pending Evaluations ({len(pending_eval_queue_df)})", open=False):
	# pending_eval_table = gr.Dataframe( # Use gr.Dataframe
	# value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
	# )
	# with gr.Row():
	# gr.Markdown("## ✉️ Submit Your Model", elem_classes="markdown-text") # Changed heading
	# with gr.Row():
	# with gr.Column(scale=1):
	# model_name_textbox = gr.Textbox(label="Model Name (Hugging Face Hub ID)")
	# revision_name_textbox = gr.Textbox(label="Revision / Commit Hash", placeholder="main")
	# model_type = gr.Dropdown(choices=["CausalLM", "Seq2SeqLM", "Other"], label="Model Type", multiselect=False, value="CausalLM", interactive=True) # Example choices
	# with gr.Column(scale=1):
	# precision = gr.Dropdown(choices=["float16", "bfloat16", "float32", "int8", "auto"], label="Precision", multiselect=False, value="auto", interactive=True)
	# weight_type = gr.Dropdown(choices=["Original", "Adapter", "Delta"], label="Weights Type", multiselect=False, value="Original", interactive=True)
	# base_model_name_textbox = gr.Textbox(label="Base Model (for Adapter/Delta)", placeholder="Leave empty if Original weights")
	# submit_button = gr.Button("Submit for Evaluation", variant="primary") # Added variant
	# submission_result = gr.Markdown()
	# # Ensure add_new_eval is correctly imported/defined and handles these inputs
	# # Make sure add_new_eval is defined if you uncomment this
	# if callable(add_new_eval):
	# submit_button.click(
	# add_new_eval,
	# [ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, ],
	# submission_result,
	# )
	# else:
	# print("Warning: 'add_new_eval' function not callable. Submit button disabled.")
	# submit_button.interactive = False # Disable button if function missing


	# --- Citation section (intended to sit below the tabs, collapsed by default) ---
	with gr.Accordion("📙 Citation", open=False):
	# Read-only textbox holding the citation text, with a copy button
	citation_button = gr.Textbox(
	value=CITATION_BUTTON_TEXT,
	label=CITATION_BUTTON_LABEL,
	lines=10, # Visible height of the textbox, in lines
	elem_id="citation-button", # ID targeted by the #citation-button CSS rules
	show_copy_button=True,
	interactive=False # Make it non-editable
	)

	# --- Entry point: optional restart scheduler, then app launch ---
	# Nothing below runs on import; both steps happen only when the script is
	# executed directly.
	if __name__ == "__main__":
	    try:
	        scheduler = BackgroundScheduler()
	        if not callable(restart_space):
	            print("Warning: restart_space function not available; space restart job not scheduled.")
	        elif REPO_ID and REPO_ID != "your/space-id":
	            # A real repo id is configured: call restart_space every 30 minutes.
	            scheduler.add_job(restart_space, "interval", seconds=1800)
	            scheduler.start()
	            print("Scheduler started for space restart.")
	        else:
	            print("Warning: REPO_ID not set or is placeholder; space restart job not scheduled.")
	    except Exception as exc:
	        # A scheduler failure is reported but must not prevent the app from launching.
	        print(f"Failed to initialize or start scheduler: {exc}")

	    # Requirements: pip install gradio pandas apscheduler
	    # The src module files (about.py etc.) must be importable, otherwise the
	    # placeholder definitions are used.
	    print("Launching Gradio App with enhanced styling...")
	    demo.launch()