File size: 16,935 Bytes
22cc60c
 
 
b41aa3c
1007e19
 
 
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
1007e19
7b8bbf4
 
 
 
 
 
 
 
 
 
 
1007e19
 
b41aa3c
f364096
7a7ae1e
1007e19
 
f364096
7b8bbf4
 
 
 
 
 
 
 
b41aa3c
f364096
 
172585c
f364096
 
 
7a7ae1e
f364096
 
 
 
 
 
 
 
 
b41aa3c
 
 
 
f364096
7a7ae1e
 
f364096
 
 
 
 
7b8bbf4
 
 
 
 
7a7ae1e
 
 
7b8bbf4
 
 
7a7ae1e
 
 
172585c
7a7ae1e
 
 
 
 
 
 
 
 
 
172585c
 
7a7ae1e
 
 
 
f364096
 
 
 
7b8bbf4
 
 
172585c
7b8bbf4
 
7a7ae1e
172585c
 
7b8bbf4
b41aa3c
 
 
172585c
b41aa3c
 
 
 
 
 
 
 
22cc60c
1007e19
b41aa3c
1007e19
b41aa3c
7b8bbf4
b41aa3c
7b8bbf4
 
 
 
 
 
 
 
e842409
7b8bbf4
 
 
 
 
e842409
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a7ae1e
172585c
 
b41aa3c
22cc60c
1007e19
172585c
7b8bbf4
1007e19
22cc60c
 
 
1f8bbc4
b41aa3c
172585c
b41aa3c
 
172585c
 
b41aa3c
 
 
f364096
 
7b8bbf4
7a7ae1e
7b8bbf4
7a7ae1e
b41aa3c
1007e19
 
7b8bbf4
7a7ae1e
7b8bbf4
1007e19
b41aa3c
 
 
 
 
 
 
22cc60c
172585c
7b8bbf4
22cc60c
 
7a7ae1e
1007e19
7a7ae1e
172585c
 
1007e19
172585c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007e19
172585c
1007e19
172585c
 
 
 
 
1007e19
 
172585c
1007e19
172585c
 
 
 
7a7ae1e
 
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
22cc60c
b41aa3c
7b8bbf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b41aa3c
 
172585c
7a7ae1e
172585c
1007e19
7b8bbf4
7a7ae1e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
# Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard

# --- Make sure these imports work relative to your file structure ---
# Option 1: If src is a directory in the same folder as your script:
try:
    # Preferred path: pull display text, CSS, and config from the project's
    # `src` package when it is importable.
    from src.about import (
        CITATION_BUTTON_LABEL,
        CITATION_BUTTON_TEXT,
        EVALUATION_QUEUE_TEXT, # Keep if used by commented-out submit tab
        INTRODUCTION_TEXT,
        LLM_BENCHMARKS_TEXT,
        TITLE,
    )
    from src.display.css_html_js import custom_css # Assuming this exists but might be empty
    from src.envs import REPO_ID # Keep if needed for restart_space or other functions
    from src.submission.submit import add_new_eval # Keep if using the submit tab
    print("Successfully imported from src module.")
# Option 2: If you don't have these files, define placeholders (REMOVE THIS if using Option 1)
except ImportError:
    # Fallback path: `src` is absent (e.g. this file is run standalone), so
    # define stand-in values that keep the app runnable end-to-end.
    print("Warning: Using placeholder values because src module imports failed.")
    CITATION_BUTTON_LABEL="Citation"
    CITATION_BUTTON_TEXT="Please cite us if you use this benchmark..."
    EVALUATION_QUEUE_TEXT="Current evaluation queue:"
    INTRODUCTION_TEXT="Welcome to the MLE-Dojo Benchmark Leaderboard."
    LLM_BENCHMARKS_TEXT="Information about the benchmarks..."
    TITLE="<h1>πŸ† MLE-Dojo Benchmark Leaderboard</h1>"
    custom_css="" # Start with empty CSS if not imported
    REPO_ID="your/space-id" # Replace with actual ID if needed
    # Placeholder submit handler so the (optional) Submit tab can still wire up.
    def add_new_eval(*args): return "Submission placeholder."
# --- End Placeholder Definitions ---


# --- Elo Leaderboard Configuration ---
# Enhanced data with Rank (placeholder), Organizer, License, and URL
# !!! IMPORTANT: Replace placeholder URLs with actual model/project pages. !!!
# Verify organizer and license information for accuracy.
data = [
{'model_name': 'gpt-4o-mini', 'url': 'https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
{'model_name': 'gpt-4o', 'url': 'https://openai.com/index/hello-gpt-4o/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
{'model_name': 'o3-mini', 'url': 'https://openai.com/index/openai-o3-mini/', 'organizer': 'OpenAI', 'license': 'Proprietary', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096}, # Fill details later
{'model_name': 'deepseek-v3', 'url': 'https://api-docs.deepseek.com/news/news1226', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
{'model_name': 'deepseek-r1', 'url': 'https://api-docs.deepseek.com/news/news250120', 'organizer': 'DeepSeek', 'license': 'DeepSeek', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
{'model_name': 'gemini-2.0-flash', 'url': 'https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
{'model_name': 'gemini-2.0-pro', 'url': 'https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
{'model_name': 'gemini-2.5-pro', 'url': 'https://deepmind.google/technologies/gemini/pro/', 'organizer': 'Google', 'license': 'Proprietary', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
]

# Create a master DataFrame holding every model's scores; `update_leaderboard`
# slices/sorts views of this frame per selected category.
# Note: Columns 'organizer' and 'license' are created in lowercase here.
master_df = pd.DataFrame(data)

# Define categories for selection (user-facing radio-button choices)
CATEGORIES = ["Overall", "MLE-Lite", "Tabular", "NLP", "CV"] # Overall first
DEFAULT_CATEGORY = "Overall" # Set a default category

# Map user-facing categories to DataFrame column names; every value here
# must be an actual column of `master_df`.
category_to_column = {
    "MLE-Lite": "MLE-Lite_Elo",
    "Tabular": "Tabular_Elo",
    "NLP": "NLP_Elo",
    "CV": "CV_Elo",
    "Overall": "Overall"
}

# --- Helper function to update leaderboard ---
def update_leaderboard(category):
    """Return the leaderboard DataFrame for one category.

    Resolves *category* to an Elo column of ``master_df``, sorts descending,
    assigns 1-based ranks, renders the model name as an HTML link, and
    returns a frame with columns: Rank, Model, Organizer, License, Elo Score.
    Unknown categories fall back to ``DEFAULT_CATEGORY``.
    """
    elo_col = category_to_column.get(category)

    # Guard: unknown category (or stale mapping) -> fall back to the default.
    if elo_col is None or elo_col not in master_df.columns:
        print(f"Warning: Invalid category '{category}' or column '{elo_col}'. Falling back to default.")
        elo_col = category_to_column[DEFAULT_CATEGORY]
        if elo_col not in master_df.columns:
            # Even the default is missing: return an empty frame with the
            # display schema so the UI component still renders.
            print(f"Error: Default column '{elo_col}' also not found.")
            return pd.DataFrame({
                "Rank": [],
                "Model": [],
                "Elo Score": [],
                "Organizer": [],
                "License": []
            })

    # Slice out the display columns (lowercase in master_df) plus the score
    # column, then sort best-first and re-index so ranks line up.
    ranked = (
        master_df[['model_name', 'url', 'organizer', 'license', elo_col]]
        .copy()
        .sort_values(by=elo_col, ascending=False)
        .reset_index(drop=True)
    )
    ranked.insert(0, 'Rank', ranked.index + 1)

    def _as_link(row):
        # Missing URLs degrade to a harmless '#' anchor.
        href = row['url'] if pd.notna(row['url']) else '#'
        return f"<a href='{href}' target='_blank' style='color: #007bff; text-decoration: none;'>{row['model_name']}</a>"

    ranked['Model'] = ranked.apply(_as_link, axis=1)

    # Normalize headers for display: score column -> 'Elo Score', and the
    # lowercase organizer/license columns -> capitalized labels.
    ranked = ranked.rename(columns={elo_col: 'Elo Score',
                                    'organizer': 'Organizer',
                                    'license': 'License'})

    # Final column order matches the gr.Dataframe headers.
    return ranked[["Rank", "Model", "Organizer", "License", "Elo Score"]]

# --- Mock/Placeholder functions/data for other tabs ---
# (If the Submit tab is used, ensure these variables are appropriately populated or handled)
print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")

# Single source of truth for the queue tables' schema. The three empty queue
# DataFrames below are built from EVAL_COLS so the gr.Dataframe headers can
# never drift out of sync with the DataFrame columns (previously the same
# column list was hand-duplicated four times).
EVAL_COLS = ["Model", "Status", "Requested", "Started"]  # Define for the dataframe headers
EVAL_TYPES = ["str", "str", "str", "str"]  # Define for the dataframe types

# Empty placeholder queues: finished / running / pending evaluations.
finished_eval_queue_df = pd.DataFrame(columns=EVAL_COLS)
running_eval_queue_df = pd.DataFrame(columns=EVAL_COLS)
pending_eval_queue_df = pd.DataFrame(columns=EVAL_COLS)

# --- Keep restart function if relevant ---
def restart_space():
    # Make sure REPO_ID is correctly defined/imported if this function is used
    print(f"Attempting to restart space: {REPO_ID}")
    # Replace with your actual space restart mechanism if needed (e.g., HfApi().restart_space(REPO_ID))


# --- Gradio App Definition ---

# ***** FONT SIZE INCREASED HERE *****
# Add CSS rules to make the base font size larger.
# Adjust the '1.2em' value (e.g., to '1.4em', '16px') to change the size.
# The !important flag helps override theme defaults.
# If the imported custom_css already has content, append to it.
font_size_css = """
body {
    font-size: 1.5em !important; /* Increase base font size */
}
/* Optional: Target specific elements if needed */
/*
#leaderboard-table th, #leaderboard-table td {
    font-size: 1em !important; /* Adjust table font size relative to new body size */
    padding: 5px 7px !important; /* Increase padding for better spacing */
}
h1, .markdown-text h1 { font-size: 2.2em !important; } /* Make main title larger */
h2, .markdown-text h2 { font-size: 1.8em !important; } /* Make section titles larger */
button { font-size: 1.1em !important; padding: 8px 16px !important; } /* Slightly larger buttons */
.gr-input, .gr-dropdown, .gr-textbox textarea { font-size: 1em !important; } /* Ensure inputs scale too */
*/
"""
# Append the new CSS to any existing custom_css (imported from src, or the
# empty placeholder string when the src import failed).
custom_css += font_size_css

# Add basic table styling only if the imported CSS doesn't already define a
# `table {` rule, to avoid clobbering project-provided table styles.
if "table {" not in custom_css:
    custom_css += """
table { width: 100%; border-collapse: collapse; margin-top: 10px; margin-bottom: 10px; }
th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; white-space: normal; vertical-align: top; } /* Allow wrapping, top align */
th { background-color: #f2f2f2; font-weight: bold; }
tr:nth-child(even) { background-color: #f9f9f9; }
tr:hover { background-color: #e9e9e9; }
td a { color: #007bff; text-decoration: none; }
td a:hover { text-decoration: underline; }
"""

# Use a theme for better default styling
demo = gr.Blocks(css=custom_css, theme=gr.themes.Soft())

# Build the UI inside the Blocks context: title, intro, a tab with the
# category selector + leaderboard table, an About tab, and a citation
# accordion. The (optional) Submit tab is kept commented out below.
with demo:
    # Use the TITLE variable imported or defined above
    gr.HTML(TITLE)

    # Use the INTRODUCTION_TEXT variable imported or defined above
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ… MLE-Dojo Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Column():
                gr.Markdown("## Model Elo Rankings by Category")
                # Radio selector drives which Elo column the table is sorted by.
                category_selector = gr.Radio(
                    choices=CATEGORIES,
                    label="Select Category:",
                    value=DEFAULT_CATEGORY,
                    interactive=True,
                )
                leaderboard_df_component = gr.Dataframe(
                    # Initialize with sorted data for the default category
                    value=update_leaderboard(DEFAULT_CATEGORY),
                    # Headers for DISPLAY should match the *renamed* columns from update_leaderboard
                    headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
                    # Datatype maps to the final df columns: Rank, Model, Organizer, License, Elo Score
                    # ('html' lets the Model column render its <a> link).
                    datatype=["number", "html", "str", "str", "number"],
                    interactive=False,
                    # --- FIX APPLIED: Removed unsupported 'height' argument ---
                    # row_count determines the number of rows to display
                    row_count=(len(master_df), "fixed"), # Display all rows
                    col_count=(5, "fixed"),
                    wrap=True, # Allow text wrapping in cells
                    elem_id="leaderboard-table" # CSS hook for custom styling
                )
                # Link the radio button change to the update function
                category_selector.change(
                    fn=update_leaderboard,
                    inputs=category_selector,
                    outputs=leaderboard_df_component
                )

        with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-about", id=1):
            # Use the LLM_BENCHMARKS_TEXT variable imported or defined above
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # --- Submit Tab (Commented out as in original request) ---
        # Make sure EVALUATION_QUEUE_TEXT and add_new_eval are imported/defined if uncommented
        # with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-submit", id=2):
        #     with gr.Column():
        #          with gr.Row():
        #              gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") # Requires import/definition
        #          with gr.Column():
        #              with gr.Accordion(f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
        #                   finished_eval_table = gr.components.Dataframe(
        #                       value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                  )
        #              with gr.Accordion(f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
        #                   running_eval_table = gr.components.Dataframe(
        #                       value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                  )
        #              with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
        #                  pending_eval_table = gr.components.Dataframe(
        #                      value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5,
        #                  )
        #     with gr.Row():
        #          gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
        #     with gr.Row():
        #          with gr.Column():
        #              model_name_textbox = gr.Textbox(label="Model name (on Hugging Face Hub)")
        #              revision_name_textbox = gr.Textbox(label="Revision / Commit Hash", placeholder="main")
        #              model_type = gr.Dropdown(choices=["Type A", "Type B", "Type C"], label="Model type", multiselect=False, value=None, interactive=True) # Example choices
        #          with gr.Column():
        #              precision = gr.Dropdown(choices=["float16", "bfloat16", "float32", "int8", "auto"], label="Precision", multiselect=False, value="auto", interactive=True)
        #              weight_type = gr.Dropdown(choices=["Original", "Adapter", "Delta"], label="Weights type", multiselect=False, value="Original", interactive=True)
        #              base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
        #     submit_button = gr.Button("Submit Eval")
        #     submission_result = gr.Markdown()
        #     # Ensure add_new_eval is correctly imported/defined and handles these inputs
        #     submit_button.click(
        #          add_new_eval, # Requires import/definition
        #          [ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, ],
        #          submission_result,
        #      )

    # --- Citation Row (at the bottom, outside Tabs) ---
    with gr.Accordion("πŸ“™ Citation", open=False):
        # Use the CITATION_BUTTON_TEXT and CITATION_BUTTON_LABEL variables imported or defined above
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=10, # Adjust lines if needed for new font size
            elem_id="citation-button",
            show_copy_button=True,
        )


# --- Keep scheduler if relevant ---
# Only start scheduler if the script is run directly.
# Schedules a periodic self-restart of the Space (every 30 minutes) when a
# real REPO_ID is configured; otherwise logs a warning and skips scheduling.
if __name__ == "__main__":
    try:
        scheduler = BackgroundScheduler()
        # Add job only if restart_space is callable (i.e., not a placeholder or failed import)
        if callable(restart_space):
             # Check if REPO_ID seems valid before scheduling
             # ("your/space-id" is the placeholder from the import fallback).
             if REPO_ID and REPO_ID != "your/space-id":
                 scheduler.add_job(restart_space, "interval", seconds=1800) # Restart every 30 mins
                 scheduler.start()
             else:
                 print("Warning: REPO_ID not set or is placeholder; space restart job not scheduled.")
        else:
             print("Warning: restart_space function not available; space restart job not scheduled.")
    except Exception as e:
        # Scheduler failure is non-fatal: the app still launches below.
        print(f"Failed to initialize or start scheduler: {e}")


# --- Launch the app ---
# Ensures the app launches only when the script is run directly
if __name__ == "__main__":
    # Ensure you have installed necessary libraries: pip install gradio pandas apscheduler
    # Make sure your src module files (about.py etc.) are accessible OR use the placeholder definitions above.
    print("Launching Gradio App...")
    # demo.launch() blocks here and serves the UI until interrupted.
    demo.launch()