Update app.py
app.py CHANGED
@@ -21,41 +21,70 @@ from src.envs import REPO_ID # Keep if needed for restart_space or other functio
 # from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval # Keep submission logic
 
-# ---
-
-
-
+# --- Elo Leaderboard Configuration ---
+# Data from the table provided by the user
+data = [
+    {'model': 'gpt-4o-mini', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
+    {'model': 'gpt-4o', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
+    {'model': 'o3-mini', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096},
+    # Renamed 'DeepSeek-v3' to match previous list - adjust if needed
+    {'model': 'deepseek-v3', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
+    # Renamed 'DeepSeek-r1' to match previous list - adjust if needed
+    {'model': 'deepseek-r1', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
+    # Renamed 'Gemini-2.0-Flash' to match previous list - adjust if needed
+    {'model': 'gemini-2.0-flash', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
+    # Renamed 'Gemini-2.0-Pro' to match previous list - adjust if needed
+    {'model': 'gemini-2.0-pro', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
+    # Renamed 'Gemini-2.5-Pro' to match previous list - adjust if needed
+    {'model': 'gemini-2.5-pro', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
 ]
-
-
-
-
-#
-
-
-
-
-
+
+# Create a master DataFrame
+master_df = pd.DataFrame(data)
+
+# Define categories for selection (user-facing)
+CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV", "Overall"]
+DEFAULT_CATEGORY = "Overall" # Set a default category
+
+# Map user-facing categories to DataFrame column names
+category_to_column = {
+    "MLE-Lite": "MLE-Lite_Elo",
+    "Tabular": "Tabular_Elo",
+    "NLP": "NLP_Elo",
+    "CV": "CV_Elo",
+    "Overall": "Overall"
 }
-# Example: How to set specific scores for a category
-# elo_data["NLP"] = pd.DataFrame({
-#     "Model": INITIAL_MODELS,
-#     "Elo Score": [1300, 1450, 1250, 1350, 1400, 1150, 1320, 1500] # Example scores
-# })
 
 # --- Helper function to update leaderboard ---
 def update_leaderboard(category):
-    """
-
-
-
-
+    """
+    Selects the relevant columns for the category, renames the score column
+    to 'Elo Score', sorts by score descending, and returns the DataFrame.
+    """
+    score_column = category_to_column.get(category)
+    if score_column is None or score_column not in master_df.columns:
+        # Fallback if category or column is invalid
+        print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
+        score_column = category_to_column[DEFAULT_CATEGORY]
+        if score_column not in master_df.columns: # Check fallback column too
+            return pd.DataFrame({"Model": [], "Elo Score": []}) # Return empty if still invalid
+
+    # Select model and the specific score column
+    df = master_df[['model', score_column]].copy()
+
+    # Rename the score column to 'Elo Score' for consistent display
+    df.rename(columns={score_column: 'Elo Score'}, inplace=True)
+
+    # Sort by 'Elo Score' descending
+    df.sort_values(by='Elo Score', ascending=False, inplace=True)
+
+    # Reset index for cleaner display (optional)
+    df.reset_index(drop=True, inplace=True)
+
     return df
 
 # --- Mock/Placeholder functions/data for other tabs ---
-#
-# Provide empty DataFrames or mock data if you want the queue display to work without the original data source.
-# This is a placeholder - replace with actual data loading if needed for the submission tab.
+# (Same as previous version - providing empty data)
 print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
 finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
 running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
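As a quick sanity check of the new helper, the sorting and fallback paths can be exercised outside the Space. A minimal sketch, assuming the definitions above are pasted into a REPL alongside `import pandas as pd` (output formatting approximate):

```python
# Sanity check for update_leaderboard: two columns, sorted descending.
cv_df = update_leaderboard("CV")
print(cv_df.head(3))
#             model  Elo Score
# 0         o3-mini       1207
# 1  gemini-2.5-pro       1177
# 2     deepseek-r1       1083

# An unknown category prints the warning and falls back to "Overall".
fallback_df = update_leaderboard("Audio")
print(fallback_df.iloc[0]["model"], fallback_df.iloc[0]["Elo Score"])
# gemini-2.5-pro 1214
```

One nit this exposes: the helper renames the score column but leaves `model` lowercase, while the component below declares `headers=["Model", "Elo Score"]`; renaming both in the helper would keep the display consistent.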
@@ -63,17 +92,12 @@ pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "S
 EVAL_COLS = ["Model", "Status", "Requested", "Started"] # Define for the dataframe headers
 EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types
 
+
 # --- Keep restart function if relevant ---
-#
-# api = HfApi() # Example initialization, adjust as needed
+# (Same as previous version)
 def restart_space():
     print(f"Attempting to restart space: {REPO_ID}")
     # Replace with your actual space restart mechanism if needed
-    # try:
-    #     api.restart_space(repo_id=REPO_ID)
-    #     print("Space restart request sent.")
-    # except Exception as e:
-    #     print(f"Failed to restart space: {e}")
 
 # --- Gradio App Definition ---
 demo = gr.Blocks(css=custom_css)
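The comments deleted here sketched the original restart mechanism. If real restarts are wanted again, a minimal sketch based on those removed lines, using `huggingface_hub` and the `REPO_ID` imported from `src.envs` (the `HF_TOKEN` env-var name is an assumption):

```python
import os

from huggingface_hub import HfApi

def restart_space():
    """Ask the Hub to restart this Space; log instead of raising on failure."""
    print(f"Attempting to restart space: {REPO_ID}")
    api = HfApi(token=os.environ.get("HF_TOKEN"))  # token name is an assumption
    try:
        api.restart_space(repo_id=REPO_ID)
        print("Space restart request sent.")
    except Exception as e:
        print(f"Failed to restart space: {e}")
```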
@@ -88,18 +112,20 @@ with demo:
             gr.Markdown("## Model Elo Rankings") # New title for the section
             category_selector = gr.Radio(
                 choices=CATEGORIES,
-                label="Select Category",
-                value=
+                label="Select Category to Sort By", # Updated label
+                value=DEFAULT_CATEGORY, # Default selection
                 interactive=True,
-                container=False,
+                container=False,
             )
             leaderboard_df_component = gr.Dataframe(
-
+                # Initialize with sorted data for the default category
+                value=update_leaderboard(DEFAULT_CATEGORY),
                 headers=["Model", "Elo Score"],
                 datatype=["str", "number"],
                 interactive=False,
-
-
+                # Adjust row count based on the number of models
+                row_count=(len(master_df), "fixed"),
+                col_count=(2, "fixed"),
             )
             # Link the radio button change to the update function
             category_selector.change(
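The hunk ends mid-call, so the arguments of `category_selector.change(...)` (old lines 106-108 / new 132-134) are not shown. Presumably they are the standard Gradio event hookup; a sketch, with all three keyword values inferred rather than read from the diff:

```python
# Hypothetical wiring for the elided call: re-render the leaderboard
# whenever the radio selection changes.
category_selector.change(
    fn=update_leaderboard,             # category name -> sorted DataFrame
    inputs=category_selector,          # the selected category string
    outputs=leaderboard_df_component,  # the gr.Dataframe defined above
)
```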
@@ -109,17 +135,17 @@ with demo:
             )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            # (Content unchanged)
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            #
+            # (Content unchanged, still uses potentially empty/mock queue data)
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Column():
-                    # Displaying queue tables with potentially empty/mock data
                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
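The rows inside the "Finished Evaluations" accordion are also outside the hunk. In the stock leaderboard template they render the queue DataFrame with the `EVAL_COLS`/`EVAL_TYPES` defined earlier; a sketch under that assumption:

```python
# Hypothetical accordion body: show the (possibly empty) finished queue.
finished_eval_table = gr.components.Dataframe(
    value=finished_eval_queue_df,
    headers=EVAL_COLS,
    datatype=EVAL_TYPES,
    row_count=5,
)
```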
@@ -159,10 +185,8 @@ with demo:
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    # Using simple strings for dropdowns now, adjust if ModelType/Precision/WeightType classes are still needed
                    model_type = gr.Dropdown(
-
-                        choices=["Type A", "Type B", "Type C"], # Example choices, replace if needed
+                        choices=["Type A", "Type B", "Type C"], # Example choices
                        label="Model type",
                        multiselect=False,
                        value=None,
@@ -170,7 +194,6 @@ with demo:
                    )
                with gr.Column():
                    precision = gr.Dropdown(
-                        # choices=[i.value.name for i in Precision if i != Precision.Unknown], # Original
                        choices=["float16", "bfloat16", "float32", "int8"], # Example choices
                        label="Precision",
                        multiselect=False,
@@ -178,7 +201,6 @@ with demo:
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
-                        # choices=[i.value.name for i in WeightType], # Original
                        choices=["Original", "Adapter", "Delta"], # Example choices
                        label="Weights type",
                        multiselect=False,
@@ -190,7 +212,6 @@ with demo:
            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
 
-            # Keep submission logic attached
            submit_button.click(
                add_new_eval,
                [
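The input list for `add_new_eval` is cut off at the opening bracket. A guess assembled from the components defined above; the real order must be checked against the signature in `src/submission/submit.py` before relying on it:

```python
# Hypothetical completion of the elided call; argument order must match
# add_new_eval's signature in src/submission/submit.py.
submit_button.click(
    add_new_eval,
    [
        model_name_textbox,
        revision_name_textbox,
        precision,
        weight_type,
        model_type,
    ],
    submission_result,
)
```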
@@ -206,6 +227,7 @@ with demo:
 
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
+            # (Content unchanged)
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
@@ -220,5 +242,4 @@ with demo:
 # scheduler.start()
 
 # --- Launch the app ---
-
-demo.launch() # Simpler launch for testing
+demo.launch()
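The commented-out `# scheduler.start()` above is a leftover from the template's periodic-restart pattern. If it is ever revived, the usual shape looks like this (APScheduler would need to be in `requirements.txt`; the 30-minute interval is an assumption):

```python
# Optional periodic restart, as in the stock leaderboard template.
from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()
```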