Spaces:
Running
Running
Update leaderboard display
Browse files
app.py
CHANGED
@@ -153,143 +153,90 @@ def filter_and_process_results(results, n_versions, only_actual_versions):
|
|
153 |
"""Filter results by version and process them for display."""
|
154 |
if not results or "items" not in results:
|
155 |
return pd.DataFrame(), [], [], []
|
156 |
-
|
157 |
-
all_items = results["items"]
|
158 |
-
last_version_str = results.get("last_version", "1.0")
|
159 |
-
last_version = version.parse(last_version_str)
|
160 |
|
161 |
-
|
162 |
|
163 |
-
#
|
164 |
-
|
|
|
|
|
|
|
|
|
165 |
|
|
|
166 |
for version_str, version_items in all_items.items():
|
167 |
-
|
168 |
-
|
169 |
-
model_name = item.get("model_name", "Unknown")
|
170 |
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
# Filter versions based on selection
|
185 |
-
filtered_items = []
|
186 |
-
|
187 |
-
if only_actual_versions:
|
188 |
-
# Get the n most recent actual dataset versions
|
189 |
-
all_versions = sorted([version.parse(v_str) for v_str in all_items.keys()], reverse=True)
|
190 |
-
# Take at most n_versions
|
191 |
-
versions_to_consider = all_versions[:n_versions] if all_versions else []
|
192 |
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
# Create row with basic info
|
205 |
-
row = {
|
206 |
-
'Model': model_name,
|
207 |
-
'Embeddings': config.get('embedding_model', 'N/A'),
|
208 |
-
'Retriever': config.get('retriever_type', 'N/A'),
|
209 |
-
'Top-K': config.get('retrieval_config', {}).get('top_k', 'N/A'),
|
210 |
-
'Versions': ", ".join([item["version_str"] for item in filtered_items]),
|
211 |
-
'Last Updated': filtered_items[0].get("timestamp", "")
|
212 |
-
}
|
213 |
-
|
214 |
-
# Format timestamp if available
|
215 |
-
if row['Last Updated']:
|
216 |
-
try:
|
217 |
-
dt = datetime.fromisoformat(row['Last Updated'].replace('Z', '+00:00'))
|
218 |
-
row['Last Updated'] = dt.strftime("%Y-%m-%d")
|
219 |
-
except:
|
220 |
-
pass
|
221 |
-
|
222 |
-
# Process metrics based on categories
|
223 |
-
category_metrics = {
|
224 |
-
category: {
|
225 |
-
metric_type: {
|
226 |
-
"avg": 0.0,
|
227 |
-
"count": 0
|
228 |
-
} for metric_type in METRIC_TYPES
|
229 |
-
} for category in QUESTION_CATEGORIES
|
230 |
-
}
|
231 |
-
|
232 |
-
# Collect metrics by category
|
233 |
-
for item in filtered_items:
|
234 |
-
metrics = item.get("metrics", {})
|
235 |
for category in QUESTION_CATEGORIES:
|
236 |
if category in metrics:
|
237 |
for metric_type in METRIC_TYPES:
|
238 |
if metric_type in metrics[category]:
|
239 |
metric_values = metrics[category][metric_type]
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
# Calculate averages and add to row
|
247 |
-
for category in QUESTION_CATEGORIES:
|
248 |
-
for metric_type in METRIC_TYPES:
|
249 |
-
metric_data = category_metrics[category][metric_type]
|
250 |
-
if metric_data["count"] > 0:
|
251 |
-
avg_value = metric_data["avg"] / metric_data["count"]
|
252 |
-
# Add to row with appropriate column name
|
253 |
-
col_name = f"{category}_{metric_type}"
|
254 |
-
row[col_name] = round(avg_value, 4)
|
255 |
-
|
256 |
-
# Calculate overall averages for each metric type
|
257 |
-
for metric_type in METRIC_TYPES:
|
258 |
-
total_sum = 0
|
259 |
-
total_count = 0
|
260 |
|
261 |
-
for
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
total_count += metric_data["count"]
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
rows.append(row)
|
271 |
-
|
272 |
-
# Create DataFrame
|
273 |
df = pd.DataFrame(rows)
|
274 |
|
275 |
# Get lists of metrics for each category
|
276 |
category_metrics = []
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
|
|
286 |
# Define retrieval and generation columns for radar charts
|
287 |
-
retrieval_metrics = [
|
288 |
-
generation_metrics = [
|
|
|
|
|
|
|
289 |
|
290 |
return df, retrieval_metrics, generation_metrics, category_metrics
|
291 |
|
292 |
-
def create_radar_chart(df, selected_models, metrics, title):
|
293 |
"""Create a radar chart for the selected models and metrics."""
|
294 |
if not metrics or len(selected_models) == 0:
|
295 |
# Return empty figure if no metrics or models selected
|
@@ -335,7 +282,7 @@ def create_radar_chart(df, selected_models, metrics, title):
|
|
335 |
categories_loop = categories + [categories[0]]
|
336 |
|
337 |
fig.add_trace(go.Scatterpolar(
|
338 |
-
name=row[
|
339 |
r=values,
|
340 |
theta=categories_loop,
|
341 |
showlegend=True,
|
@@ -398,21 +345,23 @@ def create_summary_df(df, retrieval_metrics, generation_metrics):
|
|
398 |
generation_avg = summary_df[generation_metrics].mean(axis=1).round(4)
|
399 |
summary_df['Generation (avg)'] = generation_avg
|
400 |
|
401 |
-
# Add total score if
|
402 |
-
if 'Retrieval (avg)' in summary_df.columns and 'Generation (avg)' in summary_df.columns:
|
403 |
-
summary_df['Total Score'] = summary_df['Retrieval (avg)'
|
404 |
summary_df = summary_df.sort_values('Total Score', ascending=False)
|
405 |
|
406 |
# Select columns for display
|
407 |
-
summary_cols = ['Model', 'Embeddings', '
|
|
|
|
|
408 |
if 'Retrieval (avg)' in summary_df.columns:
|
409 |
summary_cols.append('Retrieval (avg)')
|
410 |
if 'Generation (avg)' in summary_df.columns:
|
411 |
summary_cols.append('Generation (avg)')
|
412 |
if 'Total Score' in summary_df.columns:
|
413 |
summary_cols.append('Total Score')
|
414 |
-
if '
|
415 |
-
summary_cols.append('
|
416 |
if 'Last Updated' in summary_df.columns:
|
417 |
summary_cols.append('Last Updated')
|
418 |
|
@@ -432,7 +381,7 @@ def create_category_df(df, category, retrieval_col, generation_col):
|
|
432 |
category_df = category_df.sort_values(f'Score', ascending=False)
|
433 |
|
434 |
# Select columns for display
|
435 |
-
category_cols = ['Model', 'Embeddings',
|
436 |
|
437 |
# Rename columns for display
|
438 |
category_df = category_df[category_cols].rename(columns={
|
@@ -458,7 +407,7 @@ default_models = df['Model'].head(5).tolist() if not df.empty else []
|
|
458 |
initial_gen_chart_title = LANGUAGES[DEFAULT_LANG]["radar_gen_title"]
|
459 |
initial_ret_chart_title = LANGUAGES[DEFAULT_LANG]["radar_ret_title"]
|
460 |
initial_gen_chart = create_radar_chart(df, default_models, generation_metrics, initial_gen_chart_title)
|
461 |
-
initial_ret_chart = create_radar_chart(df, default_models, retrieval_metrics, initial_ret_chart_title)
|
462 |
|
463 |
# Create summary dataframe
|
464 |
summary_df = create_summary_df(df, retrieval_metrics, generation_metrics)
|
@@ -526,7 +475,7 @@ with gr.Blocks(css="""
|
|
526 |
}
|
527 |
.info-text {
|
528 |
font-size: 0.9em;
|
529 |
-
font-style: italic;
|
530 |
color: #666;
|
531 |
margin-top: 5px;
|
532 |
}
|
@@ -771,7 +720,7 @@ with gr.Blocks(css="""
|
|
771 |
print(f"IndexError: row_idx {row_idx} out of bounds for the component's data.")
|
772 |
# Potentially return current state without changes
|
773 |
gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
774 |
-
ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"])
|
775 |
return selected_models, gen_chart, ret_chart
|
776 |
except Exception as e:
|
777 |
print(f"Error extracting model name: {e}")
|
@@ -802,7 +751,7 @@ with gr.Blocks(css="""
|
|
802 |
|
803 |
# Create radar charts using the current dataframe and metrics
|
804 |
gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
805 |
-
ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"])
|
806 |
|
807 |
return selected_models, gen_chart, ret_chart
|
808 |
except Exception as e:
|
@@ -810,7 +759,7 @@ with gr.Blocks(css="""
|
|
810 |
print(traceback.format_exc())
|
811 |
# Return potentially existing chart values if error occurs
|
812 |
current_gen_chart = create_radar_chart(df, selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
813 |
-
current_ret_chart = create_radar_chart(df, selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"])
|
814 |
return selected_models, current_gen_chart, current_ret_chart
|
815 |
|
816 |
|
@@ -852,7 +801,7 @@ with gr.Blocks(css="""
|
|
852 |
|
853 |
# Create charts with localized titles
|
854 |
gen_chart_val = create_radar_chart(df, filtered_selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
855 |
-
ret_chart_val = create_radar_chart(df, filtered_selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"])
|
856 |
|
857 |
# Create summary dataframe
|
858 |
summary_df_val = create_summary_df(df, retrieval_metrics, generation_metrics)
|
@@ -916,7 +865,7 @@ with gr.Blocks(css="""
|
|
916 |
empty_models = []
|
917 |
# Create empty charts with localized titles
|
918 |
empty_gen_chart = create_radar_chart(df, empty_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
919 |
-
empty_ret_chart = create_radar_chart(df, empty_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"])
|
920 |
return empty_models, empty_gen_chart, empty_ret_chart
|
921 |
|
922 |
# Connect clear charts button
|
@@ -960,7 +909,7 @@ with gr.Blocks(css="""
|
|
960 |
for category, title_md in category_title_mds.items()},
|
961 |
# Update chart titles dynamically by re-plotting (needed if chart titles change)
|
962 |
generation_chart: create_radar_chart(df, selected_models.value, generation_metrics, lang_dict["radar_gen_title"]),
|
963 |
-
retrieval_chart: create_radar_chart(df, selected_models.value, retrieval_metrics, lang_dict["radar_ret_title"])
|
964 |
}
|
965 |
|
966 |
# Return updates in the correct order based on outputs list below
|
|
|
153 |
"""Filter results by version and process them for display."""
|
154 |
if not results or "items" not in results:
|
155 |
return pd.DataFrame(), [], [], []
|
|
|
|
|
|
|
|
|
156 |
|
157 |
+
all_items = results["items"]
|
158 |
|
159 |
+
# Get all versions and sort them
|
160 |
+
all_versions_sorted = sorted([version.parse(v_str) for v_str in all_items.keys()], reverse=True)
|
161 |
+
|
162 |
+
# Filter versions to consider based on n_versions slider
|
163 |
+
versions_to_consider = all_versions_sorted[:n_versions]
|
164 |
+
versions_to_consider_str = {str(v) for v in versions_to_consider}
|
165 |
|
166 |
+
rows = []
|
167 |
for version_str, version_items in all_items.items():
|
168 |
+
if version_str not in versions_to_consider_str:
|
169 |
+
continue
|
|
|
170 |
|
171 |
+
for guid, item in version_items.items():
|
172 |
+
config = item.get("config", {})
|
173 |
+
model_name = item.get("model_name", "N/A")
|
174 |
+
metrics = item.get("metrics", {})
|
175 |
+
judge_metrics = metrics.get("judge", {})
|
176 |
|
177 |
+
row = {
|
178 |
+
'Model': f"{model_name} ({guid[:6]})",
|
179 |
+
'Embeddings': config.get('embedding_model', 'N/A'),
|
180 |
+
'Top-K': config.get('retrieval_config', {}).get('top_k', 'N/A'),
|
181 |
+
'Judge': round(judge_metrics.get("judge_total_score", 0.0) / 2, 4),
|
182 |
+
'Version': version_str,
|
183 |
+
'Last Updated': item.get("timestamp", ""),
|
184 |
+
'guid': guid
|
185 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
+
if row['Last Updated']:
|
188 |
+
try:
|
189 |
+
dt = datetime.fromisoformat(row['Last Updated'].replace('Z', '+00:00'))
|
190 |
+
row['Last Updated'] = dt.strftime("%Y-%m-%d")
|
191 |
+
except (ValueError, TypeError):
|
192 |
+
pass
|
193 |
+
|
194 |
+
category_sums = {mtype: 0.0 for mtype in METRIC_TYPES}
|
195 |
+
category_counts = {mtype: 0 for mtype in METRIC_TYPES}
|
196 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
for category in QUESTION_CATEGORIES:
|
198 |
if category in metrics:
|
199 |
for metric_type in METRIC_TYPES:
|
200 |
if metric_type in metrics[category]:
|
201 |
metric_values = metrics[category][metric_type]
|
202 |
+
if metric_values and len(metric_values) > 0:
|
203 |
+
avg_value = sum(metric_values.values()) / len(metric_values)
|
204 |
+
col_name = f"{category}_{metric_type}"
|
205 |
+
row[col_name] = round(avg_value, 4)
|
206 |
+
category_sums[metric_type] += avg_value
|
207 |
+
category_counts[metric_type] += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
|
209 |
+
for metric_type in METRIC_TYPES:
|
210 |
+
if category_counts[metric_type] > 0:
|
211 |
+
avg = category_sums[metric_type] / category_counts[metric_type]
|
212 |
+
row[f"{metric_type}_avg"] = round(avg, 4)
|
|
|
213 |
|
214 |
+
rows.append(row)
|
215 |
+
|
|
|
|
|
|
|
|
|
216 |
df = pd.DataFrame(rows)
|
217 |
|
218 |
# Get lists of metrics for each category
|
219 |
category_metrics = []
|
220 |
+
if not df.empty:
|
221 |
+
for category in QUESTION_CATEGORIES:
|
222 |
+
metrics_list = []
|
223 |
+
for metric_type in METRIC_TYPES:
|
224 |
+
col_name = f"{category}_{metric_type}"
|
225 |
+
if col_name in df.columns:
|
226 |
+
metrics_list.append(col_name)
|
227 |
+
if metrics_list:
|
228 |
+
category_metrics.append((category, metrics_list))
|
229 |
+
|
230 |
# Define retrieval and generation columns for radar charts
|
231 |
+
retrieval_metrics = []
|
232 |
+
generation_metrics = []
|
233 |
+
if not df.empty:
|
234 |
+
retrieval_metrics = [f"{category}_retrieval" for category, _ in category_metrics if f"{category}_retrieval" in df.columns]
|
235 |
+
generation_metrics = [f"{category}_generation" for category, _ in category_metrics if f"{category}_generation" in df.columns]
|
236 |
|
237 |
return df, retrieval_metrics, generation_metrics, category_metrics
|
238 |
|
239 |
+
def create_radar_chart(df, selected_models, metrics, title, name_col="Model"):
|
240 |
"""Create a radar chart for the selected models and metrics."""
|
241 |
if not metrics or len(selected_models) == 0:
|
242 |
# Return empty figure if no metrics or models selected
|
|
|
282 |
categories_loop = categories + [categories[0]]
|
283 |
|
284 |
fig.add_trace(go.Scatterpolar(
|
285 |
+
name=row[name_col],
|
286 |
r=values,
|
287 |
theta=categories_loop,
|
288 |
showlegend=True,
|
|
|
345 |
generation_avg = summary_df[generation_metrics].mean(axis=1).round(4)
|
346 |
summary_df['Generation (avg)'] = generation_avg
|
347 |
|
348 |
+
# Add total score if all three columns exist
|
349 |
+
if 'Retrieval (avg)' in summary_df.columns and 'Generation (avg)' in summary_df.columns and 'Judge' in summary_df.columns:
|
350 |
+
summary_df['Total Score'] = summary_df[['Retrieval (avg)', 'Generation (avg)', 'Judge']].mean(axis=1).round(4)
|
351 |
summary_df = summary_df.sort_values('Total Score', ascending=False)
|
352 |
|
353 |
# Select columns for display
|
354 |
+
summary_cols = ['Model', 'Embeddings', 'Top-K']
|
355 |
+
if 'Judge' in summary_df.columns:
|
356 |
+
summary_cols.append('Judge')
|
357 |
if 'Retrieval (avg)' in summary_df.columns:
|
358 |
summary_cols.append('Retrieval (avg)')
|
359 |
if 'Generation (avg)' in summary_df.columns:
|
360 |
summary_cols.append('Generation (avg)')
|
361 |
if 'Total Score' in summary_df.columns:
|
362 |
summary_cols.append('Total Score')
|
363 |
+
if 'Version' in summary_df.columns:
|
364 |
+
summary_cols.append('Version')
|
365 |
if 'Last Updated' in summary_df.columns:
|
366 |
summary_cols.append('Last Updated')
|
367 |
|
|
|
381 |
category_df = category_df.sort_values(f'Score', ascending=False)
|
382 |
|
383 |
# Select columns for display
|
384 |
+
category_cols = ['Model', 'Embeddings', retrieval_col, generation_col, f'Score']
|
385 |
|
386 |
# Rename columns for display
|
387 |
category_df = category_df[category_cols].rename(columns={
|
|
|
407 |
initial_gen_chart_title = LANGUAGES[DEFAULT_LANG]["radar_gen_title"]
|
408 |
initial_ret_chart_title = LANGUAGES[DEFAULT_LANG]["radar_ret_title"]
|
409 |
initial_gen_chart = create_radar_chart(df, default_models, generation_metrics, initial_gen_chart_title)
|
410 |
+
initial_ret_chart = create_radar_chart(df, default_models, retrieval_metrics, initial_ret_chart_title, name_col='Embeddings')
|
411 |
|
412 |
# Create summary dataframe
|
413 |
summary_df = create_summary_df(df, retrieval_metrics, generation_metrics)
|
|
|
475 |
}
|
476 |
.info-text {
|
477 |
font-size: 0.9em;
|
478 |
+
font-style: italic;
|
479 |
color: #666;
|
480 |
margin-top: 5px;
|
481 |
}
|
|
|
720 |
print(f"IndexError: row_idx {row_idx} out of bounds for the component's data.")
|
721 |
# Potentially return current state without changes
|
722 |
gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
723 |
+
ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
|
724 |
return selected_models, gen_chart, ret_chart
|
725 |
except Exception as e:
|
726 |
print(f"Error extracting model name: {e}")
|
|
|
751 |
|
752 |
# Create radar charts using the current dataframe and metrics
|
753 |
gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
754 |
+
ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
|
755 |
|
756 |
return selected_models, gen_chart, ret_chart
|
757 |
except Exception as e:
|
|
|
759 |
print(traceback.format_exc())
|
760 |
# Return potentially existing chart values if error occurs
|
761 |
current_gen_chart = create_radar_chart(df, selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
762 |
+
current_ret_chart = create_radar_chart(df, selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
|
763 |
return selected_models, current_gen_chart, current_ret_chart
|
764 |
|
765 |
|
|
|
801 |
|
802 |
# Create charts with localized titles
|
803 |
gen_chart_val = create_radar_chart(df, filtered_selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
804 |
+
ret_chart_val = create_radar_chart(df, filtered_selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
|
805 |
|
806 |
# Create summary dataframe
|
807 |
summary_df_val = create_summary_df(df, retrieval_metrics, generation_metrics)
|
|
|
865 |
empty_models = []
|
866 |
# Create empty charts with localized titles
|
867 |
empty_gen_chart = create_radar_chart(df, empty_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
|
868 |
+
empty_ret_chart = create_radar_chart(df, empty_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
|
869 |
return empty_models, empty_gen_chart, empty_ret_chart
|
870 |
|
871 |
# Connect clear charts button
|
|
|
909 |
for category, title_md in category_title_mds.items()},
|
910 |
# Update chart titles dynamically by re-plotting (needed if chart titles change)
|
911 |
generation_chart: create_radar_chart(df, selected_models.value, generation_metrics, lang_dict["radar_gen_title"]),
|
912 |
+
retrieval_chart: create_radar_chart(df, selected_models.value, retrieval_metrics, lang_dict["radar_ret_title"], name_col='Embeddings')
|
913 |
}
|
914 |
|
915 |
# Return updates in the correct order based on outputs list below
|