Update app.py
app.py CHANGED
@@ -21,41 +21,70 @@ from src.envs import REPO_ID # Keep if needed for restart_space or other functio
 # from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval # Keep submission logic
 
-# ---
-
-
-
+# --- Elo Leaderboard Configuration ---
+# Data from the table provided by the user
+data = [
+    {'model': 'gpt-4o-mini', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
+    {'model': 'gpt-4o', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
+    {'model': 'o3-mini', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096},
+    # Renamed 'DeepSeek-v3' to match previous list - adjust if needed
+    {'model': 'deepseek-v3', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
+    # Renamed 'DeepSeek-r1' to match previous list - adjust if needed
+    {'model': 'deepseek-r1', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
+    # Renamed 'Gemini-2.0-Flash' to match previous list - adjust if needed
+    {'model': 'gemini-2.0-flash', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
+    # Renamed 'Gemini-2.0-Pro' to match previous list - adjust if needed
+    {'model': 'gemini-2.0-pro', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
+    # Renamed 'Gemini-2.5-Pro' to match previous list - adjust if needed
+    {'model': 'gemini-2.5-pro', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
 ]
-
-
-
-
-#
-
-
-
-
-
+
+# Create a master DataFrame
+master_df = pd.DataFrame(data)
+
+# Define categories for selection (user-facing)
+CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV", "Overall"]
+DEFAULT_CATEGORY = "Overall" # Set a default category
+
+# Map user-facing categories to DataFrame column names
+category_to_column = {
+    "MLE-Lite": "MLE-Lite_Elo",
+    "Tabular": "Tabular_Elo",
+    "NLP": "NLP_Elo",
+    "CV": "CV_Elo",
+    "Overall": "Overall"
 }
-# Example: How to set specific scores for a category
-# elo_data["NLP"] = pd.DataFrame({
-#     "Model": INITIAL_MODELS,
-#     "Elo Score": [1300, 1450, 1250, 1350, 1400, 1150, 1320, 1500] # Example scores
-# })
 
 # --- Helper function to update leaderboard ---
 def update_leaderboard(category):
-    """
-
-
-
-
+    """
+    Selects the relevant columns for the category, renames the score column
+    to 'Elo Score', sorts by score descending, and returns the DataFrame.
+    """
+    score_column = category_to_column.get(category)
+    if score_column is None or score_column not in master_df.columns:
+        # Fallback if category or column is invalid
+        print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
+        score_column = category_to_column[DEFAULT_CATEGORY]
+        if score_column not in master_df.columns: # Check fallback column too
+            return pd.DataFrame({"Model": [], "Elo Score": []}) # Return empty if still invalid
+
+    # Select model and the specific score column
+    df = master_df[['model', score_column]].copy()
+
+    # Rename the score column to 'Elo Score' for consistent display
+    df.rename(columns={score_column: 'Elo Score'}, inplace=True)
+
+    # Sort by 'Elo Score' descending
+    df.sort_values(by='Elo Score', ascending=False, inplace=True)
+
+    # Reset index for cleaner display (optional)
+    df.reset_index(drop=True, inplace=True)
+
     return df
 
 # --- Mock/Placeholder functions/data for other tabs ---
-#
-# Provide empty DataFrames or mock data if you want the queue display to work without the original data source.
-# This is a placeholder - replace with actual data loading if needed for the submission tab.
+# (Same as previous version - providing empty data)
 print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
 finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
 running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
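As a quick sanity check of the new helper, the sorting and fallback paths can be exercised outside the Space. A minimal sketch, assuming the definitions above are pasted into a REPL alongside `import pandas as pd` (output formatting approximate):

```python
# Sanity check for update_leaderboard: two columns, sorted descending.
cv_df = update_leaderboard("CV")
print(cv_df.head(3))
#             model  Elo Score
# 0         o3-mini       1207
# 1  gemini-2.5-pro       1177
# 2     deepseek-r1       1083

# An unknown category prints the warning and falls back to "Overall".
fallback_df = update_leaderboard("Audio")
print(fallback_df.iloc[0]["model"], fallback_df.iloc[0]["Elo Score"])
# gemini-2.5-pro 1214
```

One nit this exposes: the helper renames the score column but leaves `model` lowercase, while the component below declares `headers=["Model", "Elo Score"]`; renaming both in the helper would keep the display consistent.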
@@ -63,17 +92,12 @@ pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "S
 EVAL_COLS = ["Model", "Status", "Requested", "Started"] # Define for the dataframe headers
 EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types
 
+
 # --- Keep restart function if relevant ---
-#
-# api = HfApi() # Example initialization, adjust as needed
+# (Same as previous version)
 def restart_space():
     print(f"Attempting to restart space: {REPO_ID}")
     # Replace with your actual space restart mechanism if needed
-    # try:
-    #     api.restart_space(repo_id=REPO_ID)
-    #     print("Space restart request sent.")
-    # except Exception as e:
-    #     print(f"Failed to restart space: {e}")
 
 # --- Gradio App Definition ---
 demo = gr.Blocks(css=custom_css)
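The comments deleted here sketched the original restart mechanism. If real restarts are wanted again, a minimal sketch based on those removed lines, using `huggingface_hub` and the `REPO_ID` imported from `src.envs` (the `HF_TOKEN` env-var name is an assumption):

```python
import os

from huggingface_hub import HfApi

def restart_space():
    """Ask the Hub to restart this Space; log instead of raising on failure."""
    print(f"Attempting to restart space: {REPO_ID}")
    api = HfApi(token=os.environ.get("HF_TOKEN"))  # token name is an assumption
    try:
        api.restart_space(repo_id=REPO_ID)
        print("Space restart request sent.")
    except Exception as e:
        print(f"Failed to restart space: {e}")
```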
@@ -88,18 +112,20 @@ with demo:
             gr.Markdown("## Model Elo Rankings") # New title for the section
             category_selector = gr.Radio(
                 choices=CATEGORIES,
-                label="Select Category",
-                value=
+                label="Select Category to Sort By", # Updated label
+                value=DEFAULT_CATEGORY, # Default selection
                 interactive=True,
-                container=False,
+                container=False,
             )
             leaderboard_df_component = gr.Dataframe(
-
+                # Initialize with sorted data for the default category
+                value=update_leaderboard(DEFAULT_CATEGORY),
                 headers=["Model", "Elo Score"],
                 datatype=["str", "number"],
                 interactive=False,
-
-
+                # Adjust row count based on the number of models
+                row_count=(len(master_df), "fixed"),
+                col_count=(2, "fixed"),
             )
             # Link the radio button change to the update function
             category_selector.change(
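The hunk ends mid-call, so the arguments of `category_selector.change(...)` (old lines 106-108 / new 132-134) are not shown. Presumably they are the standard Gradio event hookup; a sketch, with all three keyword values inferred rather than read from the diff:

```python
# Hypothetical wiring for the elided call: re-render the leaderboard
# whenever the radio selection changes.
category_selector.change(
    fn=update_leaderboard,             # category name -> sorted DataFrame
    inputs=category_selector,          # the selected category string
    outputs=leaderboard_df_component,  # the gr.Dataframe defined above
)
```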
@@ -109,17 +135,17 @@ with demo:
             )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            # (Content unchanged)
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            #
+            # (Content unchanged, still uses potentially empty/mock queue data)
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Column():
-                    # Displaying queue tables with potentially empty/mock data
                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
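The rows inside the "Finished Evaluations" accordion are also outside the hunk. In the stock leaderboard template they render the queue DataFrame with the `EVAL_COLS`/`EVAL_TYPES` defined earlier; a sketch under that assumption:

```python
# Hypothetical accordion body: show the (possibly empty) finished queue.
finished_eval_table = gr.components.Dataframe(
    value=finished_eval_queue_df,
    headers=EVAL_COLS,
    datatype=EVAL_TYPES,
    row_count=5,
)
```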
@@ -159,10 +185,8 @@ with demo:
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    # Using simple strings for dropdowns now, adjust if ModelType/Precision/WeightType classes are still needed
                    model_type = gr.Dropdown(
-
-                        choices=["Type A", "Type B", "Type C"], # Example choices, replace if needed
+                        choices=["Type A", "Type B", "Type C"], # Example choices
                        label="Model type",
                        multiselect=False,
                        value=None,
@@ -170,7 +194,6 @@ with demo:
                    )
                with gr.Column():
                    precision = gr.Dropdown(
-                        # choices=[i.value.name for i in Precision if i != Precision.Unknown], # Original
                        choices=["float16", "bfloat16", "float32", "int8"], # Example choices
                        label="Precision",
                        multiselect=False,
@@ -178,7 +201,6 @@ with demo:
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
-                        # choices=[i.value.name for i in WeightType], # Original
                        choices=["Original", "Adapter", "Delta"], # Example choices
                        label="Weights type",
                        multiselect=False,
@@ -190,7 +212,6 @@ with demo:
            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
 
-            # Keep submission logic attached
            submit_button.click(
                add_new_eval,
                [
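The input list for `add_new_eval` is cut off at the opening bracket. A guess assembled from the components defined above; the real order must be checked against the signature in `src/submission/submit.py` before relying on it:

```python
# Hypothetical completion of the elided call; argument order must match
# add_new_eval's signature in src/submission/submit.py.
submit_button.click(
    add_new_eval,
    [
        model_name_textbox,
        revision_name_textbox,
        precision,
        weight_type,
        model_type,
    ],
    submission_result,
)
```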
@@ -206,6 +227,7 @@ with demo:
 
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
+            # (Content unchanged)
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
@@ -220,5 +242,4 @@ with demo:
 # scheduler.start()
 
 # --- Launch the app ---
-
-demo.launch() # Simpler launch for testing
+demo.launch()
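The commented-out `# scheduler.start()` above is a leftover from the template's periodic-restart pattern. If it is ever revived, the usual shape looks like this (APScheduler would need to be in `requirements.txt`; the 30-minute interval is an assumption):

```python
# Optional periodic restart, as in the stock leaderboard template.
from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()
```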