Jerrycool committed
Commit f364096 · verified · 1 Parent(s): b41aa3c

Update app.py

Files changed (1):
  app.py +72 -51
app.py CHANGED
@@ -21,41 +21,70 @@ from src.envs import REPO_ID # Keep if needed for restart_space or other functio
  # from src.populate import get_evaluation_queue_df, get_leaderboard_df
  from src.submission.submit import add_new_eval # Keep submission logic

- # --- New Elo Leaderboard Configuration ---
- INITIAL_MODELS = [
- "gpt-4o-mini", "gpt-4o", "gemini-2.0-flash", "deepseek-v3",
- "gemini-2.0-pro", "o3-mini", "deepseek-r1", "gemini-2.5-pro"
+ # --- Elo Leaderboard Configuration ---
+ # Data from the table provided by the user
+ data = [
+ {'model': 'gpt-4o-mini', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
+ {'model': 'gpt-4o', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
+ {'model': 'o3-mini', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096},
+ # Renamed 'DeepSeek-v3' to match previous list - adjust if needed
+ {'model': 'deepseek-v3', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
+ # Renamed 'DeepSeek-r1' to match previous list - adjust if needed
+ {'model': 'deepseek-r1', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
+ # Renamed 'Gemini-2.0-Flash' to match previous list - adjust if needed
+ {'model': 'gemini-2.0-flash', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
+ # Renamed 'Gemini-2.0-Pro' to match previous list - adjust if needed
+ {'model': 'gemini-2.0-pro', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
+ # Renamed 'Gemini-2.5-Pro' to match previous list - adjust if needed
+ {'model': 'gemini-2.5-pro', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
  ]
- CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV"]
- DEFAULT_ELO = 1200
-
- # Placeholder data structure for Elo scores per category
- # *** MODIFY THE SCORES HERE AS NEEDED ***
- elo_data = {
- category: pd.DataFrame({
- "Model": INITIAL_MODELS,
- "Elo Score": [DEFAULT_ELO] * len(INITIAL_MODELS)
- }) for category in CATEGORIES
+
+ # Create a master DataFrame
+ master_df = pd.DataFrame(data)
+
+ # Define categories for selection (user-facing)
+ CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV", "Overall"]
+ DEFAULT_CATEGORY = "Overall" # Set a default category
+
+ # Map user-facing categories to DataFrame column names
+ category_to_column = {
+ "MLE-Lite": "MLE-Lite_Elo",
+ "Tabular": "Tabular_Elo",
+ "NLP": "NLP_Elo",
+ "CV": "CV_Elo",
+ "Overall": "Overall"
  }
- # Example: How to set specific scores for a category
- # elo_data["NLP"] = pd.DataFrame({
- # "Model": INITIAL_MODELS,
- # "Elo Score": [1300, 1450, 1250, 1350, 1400, 1150, 1320, 1500] # Example scores
- # })

  # --- Helper function to update leaderboard ---
  def update_leaderboard(category):
- """Returns the DataFrame for the selected category."""
- df = elo_data.get(category)
- if df is None:
- # Return default if category not found (shouldn't happen with radio)
- return elo_data[CATEGORIES[0]]
+ """
+ Selects the relevant columns for the category, renames the score column
+ to 'Elo Score', sorts by score descending, and returns the DataFrame.
+ """
+ score_column = category_to_column.get(category)
+ if score_column is None or score_column not in master_df.columns:
+ # Fallback if category or column is invalid
+ print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
+ score_column = category_to_column[DEFAULT_CATEGORY]
+ if score_column not in master_df.columns: # Check fallback column too
+ return pd.DataFrame({"Model": [], "Elo Score": []}) # Return empty if still invalid
+
+ # Select model and the specific score column
+ df = master_df[['model', score_column]].copy()
+
+ # Rename the score column to 'Elo Score' for consistent display
+ df.rename(columns={score_column: 'Elo Score'}, inplace=True)
+
+ # Sort by 'Elo Score' descending
+ df.sort_values(by='Elo Score', ascending=False, inplace=True)
+
+ # Reset index for cleaner display (optional)
+ df.reset_index(drop=True, inplace=True)
+
  return df

  # --- Mock/Placeholder functions/data for other tabs ---
- # Since we removed the snapshot download, the original queue fetching will fail.
- # Provide empty DataFrames or mock data if you want the queue display to work without the original data source.
- # This is a placeholder - replace with actual data loading if needed for the submission tab.
+ # (Same as previous version - providing empty data)
  print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
  finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
  running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
@@ -63,17 +92,12 @@ pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "S
  EVAL_COLS = ["Model", "Status", "Requested", "Started"] # Define for the dataframe headers
  EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types

+
  # --- Keep restart function if relevant ---
- # Assuming HfApi is initialized elsewhere or REPO_ID is sufficient
- # api = HfApi() # Example initialization, adjust as needed
+ # (Same as previous version)
  def restart_space():
  print(f"Attempting to restart space: {REPO_ID}")
  # Replace with your actual space restart mechanism if needed
- # try:
- # api.restart_space(repo_id=REPO_ID)
- # print("Space restart request sent.")
- # except Exception as e:
- # print(f"Failed to restart space: {e}")

  # --- Gradio App Definition ---
  demo = gr.Blocks(css=custom_css)
@@ -88,18 +112,20 @@ with demo:
  gr.Markdown("## Model Elo Rankings") # New title for the section
  category_selector = gr.Radio(
  choices=CATEGORIES,
- label="Select Category",
- value=CATEGORIES[0], # Default selection
+ label="Select Category to Sort By", # Updated label
+ value=DEFAULT_CATEGORY, # Default selection
  interactive=True,
- container=False, # Make radio buttons horizontal if possible with CSS
+ container=False,
  )
  leaderboard_df_component = gr.Dataframe(
- value=update_leaderboard(CATEGORIES[0]), # Initial value
+ # Initialize with sorted data for the default category
+ value=update_leaderboard(DEFAULT_CATEGORY),
  headers=["Model", "Elo Score"],
  datatype=["str", "number"],
  interactive=False,
- row_count=(len(INITIAL_MODELS), "fixed"), # Fixed row count
- col_count=(2, "fixed"), # Fixed column count
+ # Adjust row count based on the number of models
+ row_count=(len(master_df), "fixed"),
+ col_count=(2, "fixed"),
  )
  # Link the radio button change to the update function
  category_selector.change(
@@ -109,17 +135,17 @@ with demo:
  )

  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+ # (Content unchanged)
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
- # --- This section remains largely unchanged, but relies on potentially missing data ---
+ # (Content unchanged, still uses potentially empty/mock queue data)
  with gr.Column():
  with gr.Row():
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
  with gr.Column():
- # Displaying queue tables with potentially empty/mock data
  with gr.Accordion(
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", # Length might be 0
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
  open=False,
  ):
  with gr.Row():
@@ -159,10 +185,8 @@ with demo:
  with gr.Column():
  model_name_textbox = gr.Textbox(label="Model name")
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
- # Using simple strings for dropdowns now, adjust if ModelType/Precision/WeightType classes are still needed
  model_type = gr.Dropdown(
- # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], # Original
- choices=["Type A", "Type B", "Type C"], # Example choices, replace if needed
+ choices=["Type A", "Type B", "Type C"], # Example choices
  label="Model type",
  multiselect=False,
  value=None,
@@ -170,7 +194,6 @@
  )
  with gr.Column():
  precision = gr.Dropdown(
- # choices=[i.value.name for i in Precision if i != Precision.Unknown], # Original
  choices=["float16", "bfloat16", "float32", "int8"], # Example choices
  label="Precision",
  multiselect=False,
@@ -178,7 +201,6 @@
  interactive=True,
  )
  weight_type = gr.Dropdown(
- # choices=[i.value.name for i in WeightType], # Original
  choices=["Original", "Adapter", "Delta"], # Example choices
  label="Weights type",
  multiselect=False,
@@ -190,7 +212,6 @@
  submit_button = gr.Button("Submit Eval")
  submission_result = gr.Markdown()

- # Keep submission logic attached
  submit_button.click(
  add_new_eval,
  [
@@ -206,6 +227,7 @@

  with gr.Row():
  with gr.Accordion("📙 Citation", open=False):
+ # (Content unchanged)
  citation_button = gr.Textbox(
  value=CITATION_BUTTON_TEXT,
  label=CITATION_BUTTON_LABEL,
@@ -220,5 +242,4 @@
  # scheduler.start()

  # --- Launch the app ---
- # demo.queue(default_concurrency_limit=40).launch() # Original launch
- demo.launch() # Simpler launch for testing
+ demo.launch()
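
Note on the new sorting logic: the short standalone sketch below is not part of the commit. It assumes only that pandas is installed, copies the NLP Elo values from the data table added above, and reproduces the select / rename / sort steps that update_leaderboard performs, confirming that the NLP view lists gemini-2.5-pro first.

import pandas as pd

# Elo values copied from the table added in this commit (NLP column only).
data = [
    {"model": "gpt-4o-mini", "NLP_Elo": 758},
    {"model": "gpt-4o", "NLP_Elo": 903},
    {"model": "o3-mini", "NLP_Elo": 1056},
    {"model": "deepseek-v3", "NLP_Elo": 1028},
    {"model": "deepseek-r1", "NLP_Elo": 1103},
    {"model": "gemini-2.0-flash", "NLP_Elo": 860},
    {"model": "gemini-2.0-pro", "NLP_Elo": 1028},
    {"model": "gemini-2.5-pro", "NLP_Elo": 1266},
]
master_df = pd.DataFrame(data)

# Same select -> rename -> sort sequence as update_leaderboard("NLP") in the new app.py.
df = master_df[["model", "NLP_Elo"]].copy()
df = df.rename(columns={"NLP_Elo": "Elo Score"})
df = df.sort_values(by="Elo Score", ascending=False).reset_index(drop=True)

print(df.head(3))  # expected order: gemini-2.5-pro, deepseek-r1, o3-mini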
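
Note on restart_space: the commit keeps the function as a stub and drops the commented-out HfApi calls. If the restart path is ever needed again, one possible shape following those deleted comments is the sketch below. It is an assumption-laden example, not the app's code: it assumes huggingface_hub is installed, that an HF_TOKEN environment variable grants write access to the Space, and uses a placeholder REPO_ID (the real app imports it from src.envs).

import os
from huggingface_hub import HfApi

REPO_ID = "owner/space-name"  # placeholder; the real app imports this from src.envs

def restart_space():
    print(f"Attempting to restart space: {REPO_ID}")
    # Token read from the environment; adjust to however the Space stores its credentials.
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    try:
        api.restart_space(repo_id=REPO_ID)
        print("Space restart request sent.")
    except Exception as e:
        print(f"Failed to restart space: {e}")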