ai-forever committed on
Commit
1e69798
·
verified ·
1 Parent(s): 23f3db7

Update leaderboard display

Browse files
Files changed (1) hide show
  1. app.py +80 -131
app.py CHANGED
@@ -153,143 +153,90 @@ def filter_and_process_results(results, n_versions, only_actual_versions):
153
  """Filter results by version and process them for display."""
154
  if not results or "items" not in results:
155
  return pd.DataFrame(), [], [], []
156
-
157
- all_items = results["items"]
158
- last_version_str = results.get("last_version", "1.0")
159
- last_version = version.parse(last_version_str)
160
 
161
- print(f"Last version: {last_version_str}")
162
 
163
- # Group items by model_name
164
- model_groups = {}
 
 
 
 
165
 
 
166
  for version_str, version_items in all_items.items():
167
- version_obj = version.parse(version_str)
168
- for item_id, item in version_items.items():
169
- model_name = item.get("model_name", "Unknown")
170
 
171
- if model_name not in model_groups:
172
- model_groups[model_name] = []
 
 
 
173
 
174
- # Add version info to the item (both as string and as parsed version object for comparison)
175
- item["version_str"] = version_str
176
- item["version_obj"] = version_obj
177
- model_groups[model_name].append(item)
178
-
179
- rows = []
180
- for model_name, items in model_groups.items():
181
- # Sort items by version (newest first)
182
- items.sort(key=lambda x: x["version_obj"], reverse=True)
183
-
184
- # Filter versions based on selection
185
- filtered_items = []
186
-
187
- if only_actual_versions:
188
- # Get the n most recent actual dataset versions
189
- all_versions = sorted([version.parse(v_str) for v_str in all_items.keys()], reverse=True)
190
- # Take at most n_versions
191
- versions_to_consider = all_versions[:n_versions] if all_versions else []
192
 
193
- # Filter items that match those versions
194
- filtered_items = [item for item in items if any(item["version_obj"] == v for v in versions_to_consider)]
195
- else:
196
- # Consider n_versions most recent items for this model
197
- filtered_items = items[:n_versions]
198
-
199
- if not filtered_items:
200
- continue
201
-
202
- config = filtered_items[0]["config"] # Use config from most recent version
203
-
204
- # Create row with basic info
205
- row = {
206
- 'Model': model_name,
207
- 'Embeddings': config.get('embedding_model', 'N/A'),
208
- 'Retriever': config.get('retriever_type', 'N/A'),
209
- 'Top-K': config.get('retrieval_config', {}).get('top_k', 'N/A'),
210
- 'Versions': ", ".join([item["version_str"] for item in filtered_items]),
211
- 'Last Updated': filtered_items[0].get("timestamp", "")
212
- }
213
-
214
- # Format timestamp if available
215
- if row['Last Updated']:
216
- try:
217
- dt = datetime.fromisoformat(row['Last Updated'].replace('Z', '+00:00'))
218
- row['Last Updated'] = dt.strftime("%Y-%m-%d")
219
- except:
220
- pass
221
-
222
- # Process metrics based on categories
223
- category_metrics = {
224
- category: {
225
- metric_type: {
226
- "avg": 0.0,
227
- "count": 0
228
- } for metric_type in METRIC_TYPES
229
- } for category in QUESTION_CATEGORIES
230
- }
231
-
232
- # Collect metrics by category
233
- for item in filtered_items:
234
- metrics = item.get("metrics", {})
235
  for category in QUESTION_CATEGORIES:
236
  if category in metrics:
237
  for metric_type in METRIC_TYPES:
238
  if metric_type in metrics[category]:
239
  metric_values = metrics[category][metric_type]
240
- avg_value = sum(metric_values.values()) / len(metric_values)
241
-
242
- # Add to the running sum for this category and metric type
243
- category_metrics[category][metric_type]["avg"] += avg_value
244
- category_metrics[category][metric_type]["count"] += 1
245
-
246
- # Calculate averages and add to row
247
- for category in QUESTION_CATEGORIES:
248
- for metric_type in METRIC_TYPES:
249
- metric_data = category_metrics[category][metric_type]
250
- if metric_data["count"] > 0:
251
- avg_value = metric_data["avg"] / metric_data["count"]
252
- # Add to row with appropriate column name
253
- col_name = f"{category}_{metric_type}"
254
- row[col_name] = round(avg_value, 4)
255
-
256
- # Calculate overall averages for each metric type
257
- for metric_type in METRIC_TYPES:
258
- total_sum = 0
259
- total_count = 0
260
 
261
- for category in QUESTION_CATEGORIES:
262
- metric_data = category_metrics[category][metric_type]
263
- if metric_data["count"] > 0:
264
- total_sum += metric_data["avg"]
265
- total_count += metric_data["count"]
266
 
267
- if total_count > 0:
268
- row[f"{metric_type}_avg"] = round(total_sum / total_count, 4)
269
-
270
- rows.append(row)
271
-
272
- # Create DataFrame
273
  df = pd.DataFrame(rows)
274
 
275
  # Get lists of metrics for each category
276
  category_metrics = []
277
- for category in QUESTION_CATEGORIES:
278
- metrics = []
279
- for metric_type in METRIC_TYPES:
280
- col_name = f"{category}_{metric_type}"
281
- if col_name in df.columns:
282
- metrics.append(col_name)
283
- if metrics:
284
- category_metrics.append((category, metrics))
285
-
 
286
  # Define retrieval and generation columns for radar charts
287
- retrieval_metrics = [f"{category}_retrieval" for category in QUESTION_CATEGORIES if f"{category}_retrieval" in df.columns]
288
- generation_metrics = [f"{category}_generation" for category in QUESTION_CATEGORIES if f"{category}_generation" in df.columns]
 
 
 
289
 
290
  return df, retrieval_metrics, generation_metrics, category_metrics
291
 
292
- def create_radar_chart(df, selected_models, metrics, title):
293
  """Create a radar chart for the selected models and metrics."""
294
  if not metrics or len(selected_models) == 0:
295
  # Return empty figure if no metrics or models selected
@@ -335,7 +282,7 @@ def create_radar_chart(df, selected_models, metrics, title):
335
  categories_loop = categories + [categories[0]]
336
 
337
  fig.add_trace(go.Scatterpolar(
338
- name=row['Model'],
339
  r=values,
340
  theta=categories_loop,
341
  showlegend=True,
@@ -398,21 +345,23 @@ def create_summary_df(df, retrieval_metrics, generation_metrics):
398
  generation_avg = summary_df[generation_metrics].mean(axis=1).round(4)
399
  summary_df['Generation (avg)'] = generation_avg
400
 
401
- # Add total score if both averages exist
402
- if 'Retrieval (avg)' in summary_df.columns and 'Generation (avg)' in summary_df.columns:
403
- summary_df['Total Score'] = summary_df['Retrieval (avg)'] + summary_df['Generation (avg)']
404
  summary_df = summary_df.sort_values('Total Score', ascending=False)
405
 
406
  # Select columns for display
407
- summary_cols = ['Model', 'Embeddings', 'Retriever', 'Top-K']
 
 
408
  if 'Retrieval (avg)' in summary_df.columns:
409
  summary_cols.append('Retrieval (avg)')
410
  if 'Generation (avg)' in summary_df.columns:
411
  summary_cols.append('Generation (avg)')
412
  if 'Total Score' in summary_df.columns:
413
  summary_cols.append('Total Score')
414
- if 'Versions' in summary_df.columns:
415
- summary_cols.append('Versions')
416
  if 'Last Updated' in summary_df.columns:
417
  summary_cols.append('Last Updated')
418
 
@@ -432,7 +381,7 @@ def create_category_df(df, category, retrieval_col, generation_col):
432
  category_df = category_df.sort_values(f'Score', ascending=False)
433
 
434
  # Select columns for display
435
- category_cols = ['Model', 'Embeddings', 'Retriever', retrieval_col, generation_col, f'Score']
436
 
437
  # Rename columns for display
438
  category_df = category_df[category_cols].rename(columns={
@@ -458,7 +407,7 @@ default_models = df['Model'].head(5).tolist() if not df.empty else []
458
  initial_gen_chart_title = LANGUAGES[DEFAULT_LANG]["radar_gen_title"]
459
  initial_ret_chart_title = LANGUAGES[DEFAULT_LANG]["radar_ret_title"]
460
  initial_gen_chart = create_radar_chart(df, default_models, generation_metrics, initial_gen_chart_title)
461
- initial_ret_chart = create_radar_chart(df, default_models, retrieval_metrics, initial_ret_chart_title)
462
 
463
  # Create summary dataframe
464
  summary_df = create_summary_df(df, retrieval_metrics, generation_metrics)
@@ -526,7 +475,7 @@ with gr.Blocks(css="""
526
  }
527
  .info-text {
528
  font-size: 0.9em;
529
- font-style: italic;
530
  color: #666;
531
  margin-top: 5px;
532
  }
@@ -771,7 +720,7 @@ with gr.Blocks(css="""
771
  print(f"IndexError: row_idx {row_idx} out of bounds for the component's data.")
772
  # Potentially return current state without changes
773
  gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
774
- ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"])
775
  return selected_models, gen_chart, ret_chart
776
  except Exception as e:
777
  print(f"Error extracting model name: {e}")
@@ -802,7 +751,7 @@ with gr.Blocks(css="""
802
 
803
  # Create radar charts using the current dataframe and metrics
804
  gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
805
- ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"])
806
 
807
  return selected_models, gen_chart, ret_chart
808
  except Exception as e:
@@ -810,7 +759,7 @@ with gr.Blocks(css="""
810
  print(traceback.format_exc())
811
  # Return potentially existing chart values if error occurs
812
  current_gen_chart = create_radar_chart(df, selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
813
- current_ret_chart = create_radar_chart(df, selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"])
814
  return selected_models, current_gen_chart, current_ret_chart
815
 
816
 
@@ -852,7 +801,7 @@ with gr.Blocks(css="""
852
 
853
  # Create charts with localized titles
854
  gen_chart_val = create_radar_chart(df, filtered_selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
855
- ret_chart_val = create_radar_chart(df, filtered_selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"])
856
 
857
  # Create summary dataframe
858
  summary_df_val = create_summary_df(df, retrieval_metrics, generation_metrics)
@@ -916,7 +865,7 @@ with gr.Blocks(css="""
916
  empty_models = []
917
  # Create empty charts with localized titles
918
  empty_gen_chart = create_radar_chart(df, empty_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
919
- empty_ret_chart = create_radar_chart(df, empty_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"])
920
  return empty_models, empty_gen_chart, empty_ret_chart
921
 
922
  # Connect clear charts button
@@ -960,7 +909,7 @@ with gr.Blocks(css="""
960
  for category, title_md in category_title_mds.items()},
961
  # Update chart titles dynamically by re-plotting (needed if chart titles change)
962
  generation_chart: create_radar_chart(df, selected_models.value, generation_metrics, lang_dict["radar_gen_title"]),
963
- retrieval_chart: create_radar_chart(df, selected_models.value, retrieval_metrics, lang_dict["radar_ret_title"])
964
  }
965
 
966
  # Return updates in the correct order based on outputs list below
 
153
  """Filter results by version and process them for display."""
154
  if not results or "items" not in results:
155
  return pd.DataFrame(), [], [], []
 
 
 
 
156
 
157
+ all_items = results["items"]
158
 
159
+ # Get all versions and sort them
160
+ all_versions_sorted = sorted([version.parse(v_str) for v_str in all_items.keys()], reverse=True)
161
+
162
+ # Filter versions to consider based on n_versions slider
163
+ versions_to_consider = all_versions_sorted[:n_versions]
164
+ versions_to_consider_str = {str(v) for v in versions_to_consider}
165
 
166
+ rows = []
167
  for version_str, version_items in all_items.items():
168
+ if version_str not in versions_to_consider_str:
169
+ continue
 
170
 
171
+ for guid, item in version_items.items():
172
+ config = item.get("config", {})
173
+ model_name = item.get("model_name", "N/A")
174
+ metrics = item.get("metrics", {})
175
+ judge_metrics = metrics.get("judge", {})
176
 
177
+ row = {
178
+ 'Model': f"{model_name} ({guid[:6]})",
179
+ 'Embeddings': config.get('embedding_model', 'N/A'),
180
+ 'Top-K': config.get('retrieval_config', {}).get('top_k', 'N/A'),
181
+ 'Judge': round(judge_metrics.get("judge_total_score", 0.0) / 2, 4),
182
+ 'Version': version_str,
183
+ 'Last Updated': item.get("timestamp", ""),
184
+ 'guid': guid
185
+ }
 
 
 
 
 
 
 
 
 
186
 
187
+ if row['Last Updated']:
188
+ try:
189
+ dt = datetime.fromisoformat(row['Last Updated'].replace('Z', '+00:00'))
190
+ row['Last Updated'] = dt.strftime("%Y-%m-%d")
191
+ except (ValueError, TypeError):
192
+ pass
193
+
194
+ category_sums = {mtype: 0.0 for mtype in METRIC_TYPES}
195
+ category_counts = {mtype: 0 for mtype in METRIC_TYPES}
196
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  for category in QUESTION_CATEGORIES:
198
  if category in metrics:
199
  for metric_type in METRIC_TYPES:
200
  if metric_type in metrics[category]:
201
  metric_values = metrics[category][metric_type]
202
+ if metric_values and len(metric_values) > 0:
203
+ avg_value = sum(metric_values.values()) / len(metric_values)
204
+ col_name = f"{category}_{metric_type}"
205
+ row[col_name] = round(avg_value, 4)
206
+ category_sums[metric_type] += avg_value
207
+ category_counts[metric_type] += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
+ for metric_type in METRIC_TYPES:
210
+ if category_counts[metric_type] > 0:
211
+ avg = category_sums[metric_type] / category_counts[metric_type]
212
+ row[f"{metric_type}_avg"] = round(avg, 4)
 
213
 
214
+ rows.append(row)
215
+
 
 
 
 
216
  df = pd.DataFrame(rows)
217
 
218
  # Get lists of metrics for each category
219
  category_metrics = []
220
+ if not df.empty:
221
+ for category in QUESTION_CATEGORIES:
222
+ metrics_list = []
223
+ for metric_type in METRIC_TYPES:
224
+ col_name = f"{category}_{metric_type}"
225
+ if col_name in df.columns:
226
+ metrics_list.append(col_name)
227
+ if metrics_list:
228
+ category_metrics.append((category, metrics_list))
229
+
230
  # Define retrieval and generation columns for radar charts
231
+ retrieval_metrics = []
232
+ generation_metrics = []
233
+ if not df.empty:
234
+ retrieval_metrics = [f"{category}_retrieval" for category, _ in category_metrics if f"{category}_retrieval" in df.columns]
235
+ generation_metrics = [f"{category}_generation" for category, _ in category_metrics if f"{category}_generation" in df.columns]
236
 
237
  return df, retrieval_metrics, generation_metrics, category_metrics
238
 
239
+ def create_radar_chart(df, selected_models, metrics, title, name_col="Model"):
240
  """Create a radar chart for the selected models and metrics."""
241
  if not metrics or len(selected_models) == 0:
242
  # Return empty figure if no metrics or models selected
 
282
  categories_loop = categories + [categories[0]]
283
 
284
  fig.add_trace(go.Scatterpolar(
285
+ name=row[name_col],
286
  r=values,
287
  theta=categories_loop,
288
  showlegend=True,
 
345
  generation_avg = summary_df[generation_metrics].mean(axis=1).round(4)
346
  summary_df['Generation (avg)'] = generation_avg
347
 
348
+ # Add total score if all three columns exist
349
+ if 'Retrieval (avg)' in summary_df.columns and 'Generation (avg)' in summary_df.columns and 'Judge' in summary_df.columns:
350
+ summary_df['Total Score'] = summary_df[['Retrieval (avg)', 'Generation (avg)', 'Judge']].mean(axis=1).round(4)
351
  summary_df = summary_df.sort_values('Total Score', ascending=False)
352
 
353
  # Select columns for display
354
+ summary_cols = ['Model', 'Embeddings', 'Top-K']
355
+ if 'Judge' in summary_df.columns:
356
+ summary_cols.append('Judge')
357
  if 'Retrieval (avg)' in summary_df.columns:
358
  summary_cols.append('Retrieval (avg)')
359
  if 'Generation (avg)' in summary_df.columns:
360
  summary_cols.append('Generation (avg)')
361
  if 'Total Score' in summary_df.columns:
362
  summary_cols.append('Total Score')
363
+ if 'Version' in summary_df.columns:
364
+ summary_cols.append('Version')
365
  if 'Last Updated' in summary_df.columns:
366
  summary_cols.append('Last Updated')
367
 
 
381
  category_df = category_df.sort_values(f'Score', ascending=False)
382
 
383
  # Select columns for display
384
+ category_cols = ['Model', 'Embeddings', retrieval_col, generation_col, f'Score']
385
 
386
  # Rename columns for display
387
  category_df = category_df[category_cols].rename(columns={
 
407
  initial_gen_chart_title = LANGUAGES[DEFAULT_LANG]["radar_gen_title"]
408
  initial_ret_chart_title = LANGUAGES[DEFAULT_LANG]["radar_ret_title"]
409
  initial_gen_chart = create_radar_chart(df, default_models, generation_metrics, initial_gen_chart_title)
410
+ initial_ret_chart = create_radar_chart(df, default_models, retrieval_metrics, initial_ret_chart_title, name_col='Embeddings')
411
 
412
  # Create summary dataframe
413
  summary_df = create_summary_df(df, retrieval_metrics, generation_metrics)
 
475
  }
476
  .info-text {
477
  font-size: 0.9em;
478
+ font-style: italic;
479
  color: #666;
480
  margin-top: 5px;
481
  }
 
720
  print(f"IndexError: row_idx {row_idx} out of bounds for the component's data.")
721
  # Potentially return current state without changes
722
  gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
723
+ ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
724
  return selected_models, gen_chart, ret_chart
725
  except Exception as e:
726
  print(f"Error extracting model name: {e}")
 
751
 
752
  # Create radar charts using the current dataframe and metrics
753
  gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, LANGUAGES[current_lang]["radar_gen_title"])
754
+ ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
755
 
756
  return selected_models, gen_chart, ret_chart
757
  except Exception as e:
 
759
  print(traceback.format_exc())
760
  # Return potentially existing chart values if error occurs
761
  current_gen_chart = create_radar_chart(df, selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
762
+ current_ret_chart = create_radar_chart(df, selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
763
  return selected_models, current_gen_chart, current_ret_chart
764
 
765
 
 
801
 
802
  # Create charts with localized titles
803
  gen_chart_val = create_radar_chart(df, filtered_selected_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
804
+ ret_chart_val = create_radar_chart(df, filtered_selected_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
805
 
806
  # Create summary dataframe
807
  summary_df_val = create_summary_df(df, retrieval_metrics, generation_metrics)
 
865
  empty_models = []
866
  # Create empty charts with localized titles
867
  empty_gen_chart = create_radar_chart(df, empty_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"])
868
+ empty_ret_chart = create_radar_chart(df, empty_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"], name_col='Embeddings')
869
  return empty_models, empty_gen_chart, empty_ret_chart
870
 
871
  # Connect clear charts button
 
909
  for category, title_md in category_title_mds.items()},
910
  # Update chart titles dynamically by re-plotting (needed if chart titles change)
911
  generation_chart: create_radar_chart(df, selected_models.value, generation_metrics, lang_dict["radar_gen_title"]),
912
+ retrieval_chart: create_radar_chart(df, selected_models.value, retrieval_metrics, lang_dict["radar_ret_title"], name_col='Embeddings')
913
  }
914
 
915
  # Return updates in the correct order based on outputs list below