apsys committed on
Commit
76284d6
·
1 Parent(s): 0ba1062
Files changed (1) hide show
  1. app.py +467 -227
app.py CHANGED
@@ -47,13 +47,15 @@ from src.envs import (
47
  RESULTS_DATASET_ID,
48
  SUBMITTER_TOKEN,
49
  TOKEN,
50
- DATA_PATH
51
  )
52
  from src.populate import get_leaderboard_df, get_category_leaderboard_df
53
  from src.submission.submit import process_submission
54
 
55
  # Configure logging
56
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
57
  logger = logging.getLogger(__name__)
58
 
59
  # Ensure data directory exists
@@ -76,65 +78,65 @@ custom_theme = gr.themes.Default(
76
  primary_hue=colors.slate,
77
  secondary_hue=colors.slate,
78
  neutral_hue=colors.neutral,
79
- font=(fonts.GoogleFont("Inter"), "sans-serif")
80
  ).set(
81
  # font_size="16px",
82
  body_background_fill="#0f0f10",
83
  body_background_fill_dark="#0f0f10",
84
  body_text_color="#f4f4f5",
85
  body_text_color_subdued="#a1a1aa",
86
- block_background_fill="#1e1e1e", # Cooler Grey
87
- block_border_color="#333333", # Cooler Grey
88
  block_shadow="none",
89
  # Swapped primary and secondary button styles
90
- button_primary_background_fill="#121212", # Changed to specific color for Refresh button
91
  button_primary_text_color="#f4f4f5",
92
- button_primary_border_color="#333333", # Keep border grey or change to #121212?
93
  button_secondary_background_fill="#f4f4f5",
94
  button_secondary_text_color="#0f0f10",
95
  button_secondary_border_color="#f4f4f5",
96
- input_background_fill="#1e1e1e", # Cooler Grey
97
- input_border_color="#333333", # Cooler Grey
98
  input_placeholder_color="#71717a",
99
- table_border_color="#333333", # Cooler Grey
100
- table_even_background_fill="#2d2d2d", # Cooler Grey (Slightly lighter)
101
- table_odd_background_fill="#1e1e1e", # Cooler Grey
102
  table_text_color="#f4f4f5",
103
  link_text_color="#ffffff",
104
- border_color_primary="#333333", # Cooler Grey
105
- background_fill_secondary="#333333", # Cooler Grey
106
  color_accent="#f4f4f5",
107
- border_color_accent="#333333", # Cooler Grey
108
- button_primary_background_fill_hover="#424242", # Cooler Grey
109
  block_title_text_color="#f4f4f5",
110
  accordion_text_color="#f4f4f5",
111
- panel_background_fill="#1e1e1e", # Cooler Grey
112
- panel_border_color="#333333", # Cooler Grey
113
  # Explicitly setting primary/secondary/accent colors/borders
114
  background_fill_primary="#0f0f10",
115
  background_fill_primary_dark="#0f0f10",
116
- background_fill_secondary_dark="#333333", # Cooler Grey
117
- border_color_primary_dark="#333333", # Cooler Grey
118
- border_color_accent_dark="#333333", # Cooler Grey
119
- border_color_accent_subdued="#424242", # Cooler Grey
120
  border_color_accent_subdued_dark="#424242", # Cooler Grey
121
  color_accent_soft="#a1a1aa",
122
  color_accent_soft_dark="#a1a1aa",
123
  # Explicitly setting input hover/focus states
124
- input_background_fill_dark="#1e1e1e", # Cooler Grey
125
- input_background_fill_focus="#424242", # Cooler Grey
126
- input_background_fill_focus_dark="#424242",# Cooler Grey
127
- input_background_fill_hover="#2d2d2d", # Cooler Grey
128
- input_background_fill_hover_dark="#2d2d2d", # Cooler Grey
129
- input_border_color_dark="#333333", # Cooler Grey
130
  input_border_color_focus="#f4f4f5",
131
  input_border_color_focus_dark="#f4f4f5",
132
- input_border_color_hover="#424242", # Cooler Grey
133
- input_border_color_hover_dark="#424242", # Cooler Grey
134
  input_placeholder_color_dark="#71717a",
135
  # Explicitly set dark variants for table backgrounds
136
- table_even_background_fill_dark="#2d2d2d", # Cooler Grey
137
- table_odd_background_fill_dark="#1e1e1e", # Cooler Grey
138
  # Explicitly set dark text variants
139
  body_text_color_dark="#f4f4f5",
140
  body_text_color_subdued_dark="#a1a1aa",
@@ -142,15 +144,17 @@ custom_theme = gr.themes.Default(
142
  accordion_text_color_dark="#f4f4f5",
143
  table_text_color_dark="#f4f4f5",
144
  # Explicitly set dark panel/block variants
145
- panel_background_fill_dark="#1e1e1e", # Cooler Grey
146
- panel_border_color_dark="#333333", # Cooler Grey
147
- block_background_fill_dark="#1e1e1e", # Cooler Grey
148
- block_border_color_dark="#333333", # Cooler Grey
149
  )
150
 
 
151
  @dataclass
152
  class ColumnInfo:
153
  """Information about a column in the leaderboard."""
 
154
  name: str
155
  display_name: str
156
  type: str = "text"
@@ -158,6 +162,7 @@ class ColumnInfo:
158
  never_hidden: bool = False
159
  displayed_by_default: bool = True
160
 
 
161
  def update_column_choices(df):
162
  """Update column choices based on what's actually in the dataframe"""
163
  if df is None or df.empty:
@@ -170,8 +175,11 @@ def update_column_choices(df):
170
  all_columns = get_all_column_choices()
171
 
172
  # Filter to only include columns that exist in the dataframe
173
- valid_columns = [(col_name, display_name) for col_name, display_name in all_columns
174
- if col_name in existing_columns]
 
 
 
175
 
176
  # Return default if there are no valid columns
177
  if not valid_columns:
@@ -179,6 +187,7 @@ def update_column_choices(df):
179
 
180
  return valid_columns
181
 
 
182
  # Update the column_selector initialization
183
  def get_initial_columns():
184
  """Get initial columns to show in the dropdown"""
@@ -192,7 +201,9 @@ def get_initial_columns():
192
  return get_default_visible_columns()
193
 
194
  # Get default visible columns that actually exist in the dataframe
195
- valid_defaults = [col for col in get_default_visible_columns() if col in available_cols]
 
 
196
 
197
  # If none of the defaults exist, return all available columns
198
  if not valid_defaults:
@@ -203,6 +214,7 @@ def get_initial_columns():
203
  logger.error(f"Error getting initial columns: {e}")
204
  return get_default_visible_columns()
205
 
 
206
  def init_leaderboard(dataframe, visible_columns=None):
207
  """
208
  Initialize a standard Gradio Dataframe component for the leaderboard.
@@ -216,7 +228,9 @@ def init_leaderboard(dataframe, visible_columns=None):
216
  # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
217
 
218
  # Determine which columns to display
219
- display_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
 
 
220
  hidden_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS]
221
 
222
  # Columns that should always be shown
@@ -225,7 +239,9 @@ def init_leaderboard(dataframe, visible_columns=None):
225
  # Use provided visible columns if specified, otherwise use default
226
  if visible_columns is None:
227
  # Determine which columns to show initially
228
- visible_columns = [col for col in display_column_names if col not in hidden_column_names]
 
 
229
 
230
  # Always include the never-hidden columns
231
  for col in always_visible:
@@ -238,13 +254,13 @@ def init_leaderboard(dataframe, visible_columns=None):
238
  # Map GuardBench column types to Gradio's expected datatype strings
239
  # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
240
  type_mapping = {
241
- 'text': 'str',
242
- 'number': 'number',
243
- 'bool': 'bool',
244
- 'date': 'date',
245
- 'markdown': 'markdown',
246
- 'html': 'html',
247
- 'image': 'image'
248
  }
249
 
250
  # Create a list of datatypes in the format Gradio expects
@@ -256,26 +272,26 @@ def init_leaderboard(dataframe, visible_columns=None):
256
  if getattr(GUARDBENCH_COLUMN, display_col).name == col:
257
  orig_type = getattr(GUARDBENCH_COLUMN, display_col).type
258
  # Map to Gradio's expected types
259
- col_type = type_mapping.get(orig_type, 'str')
260
  break
261
 
262
  # Default to 'str' if type not found or not mappable
263
  if col_type is None:
264
- col_type = 'str'
265
 
266
  datatypes.append(col_type)
267
 
268
  # Create a dummy column for search functionality if it doesn't exist
269
- if 'search_dummy' not in dataframe.columns:
270
- dataframe['search_dummy'] = dataframe.apply(
271
- lambda row: ' '.join(str(val) for val in row.values if pd.notna(val)),
272
- axis=1
273
  )
274
 
275
  # Select only the visible columns for display
276
- visible_columns.remove('model_name')
277
 
278
- visible_columns = ['model_name'] + visible_columns
279
  display_df = dataframe[visible_columns].copy()
280
 
281
  # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
@@ -288,17 +304,25 @@ def init_leaderboard(dataframe, visible_columns=None):
288
  # Avoid rounding integer columns like counts
289
  if not pd.api.types.is_integer_dtype(display_df[col]):
290
  # Format floats to exactly 3 decimal places, preserving trailing zeros
291
- display_df[col] = display_df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else None)
292
-
 
293
 
294
- column_info_map = {f.name: getattr(GUARDBENCH_COLUMN, f.name) for f in fields(GUARDBENCH_COLUMN)}
295
- column_mapping = {col: column_info_map.get(col, ColumnInfo(col, col)).display_name for col in visible_columns}
 
 
 
 
 
296
 
297
  # Rename columns in the DataFrame
298
  display_df.rename(columns=column_mapping, inplace=True)
299
 
300
  # Apply styling - note: styling might need adjustment if it relies on column names
301
- styler = display_df.style.set_properties(**{'text-align': 'right'})
 
 
302
 
303
  return gr.Dataframe(
304
  value=styler,
@@ -307,11 +331,13 @@ def init_leaderboard(dataframe, visible_columns=None):
307
  wrap=True,
308
  height=2500,
309
  elem_id="leaderboard-table",
310
- row_count=len(display_df)
311
  )
312
 
313
 
314
- def search_filter_leaderboard(df, search_query="", model_types=None, version=CURRENT_VERSION):
 
 
315
  """
316
  Filter the leaderboard based on search query and model types.
317
  """
@@ -321,23 +347,29 @@ def search_filter_leaderboard(df, search_query="", model_types=None, version=CUR
321
  filtered_df = df.copy()
322
 
323
  # Add search dummy column if it doesn't exist
324
- if 'search_dummy' not in filtered_df.columns:
325
- filtered_df['search_dummy'] = filtered_df.apply(
326
- lambda row: ' '.join(str(val) for val in row.values if pd.notna(val)),
327
- axis=1
328
  )
329
 
330
  # Apply model type filter
331
  if model_types and len(model_types) > 0:
332
- filtered_df = filtered_df[filtered_df[GUARDBENCH_COLUMN.model_type.name].isin(model_types)]
 
 
333
 
334
  # Apply search query
335
  if search_query:
336
- search_terms = [term.strip() for term in search_query.split(";") if term.strip()]
 
 
337
  if search_terms:
338
  combined_mask = None
339
  for term in search_terms:
340
- mask = filtered_df['search_dummy'].str.contains(term, case=False, na=False)
 
 
341
  if combined_mask is None:
342
  combined_mask = mask
343
  else:
@@ -347,11 +379,13 @@ def search_filter_leaderboard(df, search_query="", model_types=None, version=CUR
347
  filtered_df = filtered_df[combined_mask]
348
 
349
  # Drop the search dummy column before returning
350
- visible_columns = [col for col in filtered_df.columns if col != 'search_dummy']
351
  return filtered_df[visible_columns]
352
 
353
 
354
- def refresh_data_with_filters(version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None):
 
 
355
  """
356
  Refresh the leaderboard data and update all components with filtering.
357
  Ensures we handle cases where dataframes might have limited columns.
@@ -362,14 +396,27 @@ def refresh_data_with_filters(version=CURRENT_VERSION, search_query="", model_ty
362
  # Get new data
363
  main_df = get_leaderboard_df(version=version)
364
  LEADERBOARD_DF = main_df
365
- category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
366
- selected_columns = [x.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("_recall", "_recall_binary").replace("_precision", "_precision_binary") for x in selected_columns]
 
 
 
 
 
 
 
 
 
 
 
367
 
368
  # Log the actual columns we have
369
  logger.info(f"Main dataframe columns: {list(main_df.columns)}")
370
 
371
  # Apply filters to each dataframe
372
- filtered_main_df = search_filter_leaderboard(main_df, search_query, model_types, version)
 
 
373
  filtered_category_dfs = [
374
  search_filter_leaderboard(df, search_query, model_types, version)
375
  for df in category_dfs
@@ -381,15 +428,30 @@ def refresh_data_with_filters(version=CURRENT_VERSION, search_query="", model_ty
381
  # Filter selected columns to only those available in the data
382
  if selected_columns:
383
  # Convert display names to internal names first
384
- internal_selected_columns = [x.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("_recall", "_recall_binary").replace("_precision", "_precision_binary") for x in selected_columns]
385
- valid_selected_columns = [col for col in internal_selected_columns if col in available_columns]
386
- if not valid_selected_columns and 'model_name' in available_columns:
 
 
 
 
 
 
 
 
 
 
387
  # Fallback if conversion/filtering leads to empty selection
388
- valid_selected_columns = ['model_name'] + [col for col in get_default_visible_columns() if col in available_columns]
 
 
 
 
389
  else:
390
  # If no columns were selected in the dropdown, use default visible columns that exist
391
- valid_selected_columns = [col for col in get_default_visible_columns() if col in available_columns]
392
-
 
393
 
394
  # Initialize dataframes for display with valid selected columns
395
  main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
@@ -398,9 +460,11 @@ def refresh_data_with_filters(version=CURRENT_VERSION, search_query="", model_ty
398
  category_dataframes = []
399
  for df in filtered_category_dfs:
400
  df_columns = list(df.columns)
401
- df_valid_columns = [col for col in valid_selected_columns if col in df_columns]
402
- if not df_valid_columns and 'model_name' in df_columns:
403
- df_valid_columns = ['model_name'] + get_default_visible_columns()
 
 
404
  category_dataframes.append(init_leaderboard(df, df_valid_columns))
405
 
406
  return main_dataframe, *category_dataframes
@@ -408,7 +472,9 @@ def refresh_data_with_filters(version=CURRENT_VERSION, search_query="", model_ty
408
  except Exception as e:
409
  logger.error(f"Error in refresh with filters: {e}")
410
  # Return the current leaderboards on error
411
- return leaderboard, *[tab.children[0] for tab in category_tabs.children[1:len(CATEGORIES)+1]]
 
 
412
 
413
 
414
  def submit_results(
@@ -421,7 +487,7 @@ def submit_results(
421
  mode: str,
422
  submission_file: tempfile._TemporaryFileWrapper,
423
  version: str,
424
- guard_model_type: GuardModelType
425
  ):
426
  """
427
  Handle submission of results with model metadata.
@@ -451,7 +517,7 @@ def submit_results(
451
  "model_type": model_type,
452
  "mode": mode,
453
  "version": version,
454
- "guard_model_type": guard_model_type
455
  }
456
 
457
  # Process the submission
@@ -460,7 +526,9 @@ def submit_results(
460
  # Refresh the leaderboard data
461
  global LEADERBOARD_DF
462
  try:
463
- logger.info(f"Refreshing leaderboard data after submission for version {version}...")
 
 
464
  LEADERBOARD_DF = get_leaderboard_df(version=version)
465
  logger.info("Refreshed leaderboard data after submission")
466
  except Exception as e:
@@ -477,7 +545,10 @@ def refresh_data(version=CURRENT_VERSION):
477
  logger.info(f"Performing scheduled refresh of leaderboard data...")
478
  # Get new data
479
  main_df = get_leaderboard_df(version=version)
480
- category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
 
 
 
481
 
482
  # For gr.Dataframe, we return the actual dataframes
483
  return main_df, *category_dfs
@@ -493,14 +564,19 @@ def update_leaderboards(version):
493
  """
494
  try:
495
  new_df = get_leaderboard_df(version=version)
496
- category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
 
 
 
497
  return new_df, *category_dfs
498
  except Exception as e:
499
  logger.error(f"Error updating leaderboards for version {version}: {e}")
500
  return None, *[None for _ in CATEGORIES]
501
 
502
 
503
- def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION):
 
 
504
  """
505
  Create a radar plot comparing model performance for selected models.
506
  """
@@ -513,7 +589,7 @@ def create_performance_plot(selected_models, category, metric="f1_binary", versi
513
  return go.Figure()
514
 
515
  # Filter for selected models
516
- df = df[df['model_name'].isin(selected_models)]
517
 
518
  # Get the relevant metric columns
519
  metric_cols = [col for col in df.columns if metric in col]
@@ -522,52 +598,59 @@ def create_performance_plot(selected_models, category, metric="f1_binary", versi
522
  fig = go.Figure()
523
 
524
  # Custom colors for different models
525
- colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C'] # Pale Cyan, Pale Pink, Pale Green, Pale Orange
 
 
 
 
 
526
 
527
  # Add traces for each model
528
  for idx, model in enumerate(selected_models):
529
- model_data = df[df['model_name'] == model]
530
  if not model_data.empty:
531
  values = model_data[metric_cols].values[0].tolist()
532
  # Add the first value again at the end to complete the polygon
533
  values = values + [values[0]]
534
 
535
  # Clean up test type names
536
- categories = [col.replace(f'_{metric}', '') for col in metric_cols]
537
  # Add the first category again at the end to complete the polygon
538
  categories = categories + [categories[0]]
539
 
540
- fig.add_trace(go.Scatterpolar(
541
- r=values,
542
- theta=categories,
543
- name=model,
544
- line_color=colors[idx % len(colors)],
545
- fill='toself'
546
- ))
 
 
547
 
548
  # Update layout with all settings at once
549
  fig.update_layout(
550
- paper_bgcolor='#000000',
551
- plot_bgcolor='#000000',
552
- font={'color': '#ffffff'},
553
  title={
554
- 'text': f'{category} - {metric.upper()} Score Comparison',
555
- 'font': {'color': '#ffffff', 'size': 24}
556
  },
557
  polar=dict(
558
- bgcolor='#000000',
559
  radialaxis=dict(
560
  visible=True,
561
  range=[0, 1],
562
- gridcolor='#333333',
563
- linecolor='#333333',
564
- tickfont={'color': '#ffffff'},
565
  ),
566
  angularaxis=dict(
567
- gridcolor='#333333',
568
- linecolor='#333333',
569
- tickfont={'color': '#ffffff'},
570
- )
571
  ),
572
  height=600,
573
  showlegend=True,
@@ -576,9 +659,9 @@ def create_performance_plot(selected_models, category, metric="f1_binary", versi
576
  y=0.99,
577
  xanchor="right",
578
  x=0.99,
579
- bgcolor='rgba(0,0,0,0.5)',
580
- font={'color': '#ffffff'}
581
- )
582
  )
583
 
584
  return fig
@@ -591,7 +674,7 @@ def update_model_choices(version):
591
  df = get_leaderboard_df(version=version)
592
  if df.empty:
593
  return []
594
- return sorted(df['model_name'].unique().tolist())
595
 
596
 
597
  def update_visualization(selected_models, selected_category, selected_metric, version):
@@ -600,31 +683,33 @@ def update_visualization(selected_models, selected_category, selected_metric, ve
600
  """
601
  if not selected_models:
602
  return go.Figure()
603
- return create_performance_plot(selected_models, selected_category, selected_metric, version)
 
 
604
 
605
 
606
  # Create Gradio app
607
  demo = gr.Blocks(css=custom_css, theme=custom_theme)
608
 
609
  CATEGORY_DISPLAY_MAP = {
610
- 'Political Corruption and Legal Evasion': 'Corruption & Legal Evasion',
611
- 'Financial Fraud and Unethical Business': 'Financial Fraud',
612
- 'AI Manipulation and Jailbreaking': 'AI Jailbreaking',
613
- 'Child Exploitation and Abuse': 'Child Exploitation',
614
- 'Hate Speech, Extremism, and Discrimination': 'Hate Speech',
615
- 'Labor Exploitation and Human Trafficking': 'Labor Exploitation',
616
- 'Manipulation, Deception, and Misinformation': 'Misinformation',
617
- 'Environmental and Industrial Harm': 'Environmental Harm',
618
- 'Academic Dishonesty and Cheating': 'Academic Dishonesty',
619
- 'Self–Harm and Suicidal Ideation': 'Self-Harm',
620
- 'Animal Cruelty and Exploitation': 'Animal Harm',
621
- 'Criminal, Violent, and Terrorist Activity': 'Crime & Violence',
622
- 'Drug– and Substance–Related Activities': 'Drug Use',
623
- 'Sexual Content and Violence': 'Sexual Content',
624
- 'Weapon, Explosives, and Hazardous Materials': 'Weapons & Harmful Materials',
625
- 'Cybercrime, Hacking, and Digital Exploits': 'Cybercrime',
626
- 'Creative Content Involving Illicit Themes': 'Illicit Creative',
627
- 'Safe Prompts': 'Safe Prompts'
628
  }
629
  # Create reverse mapping for lookups
630
  CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
@@ -637,7 +722,6 @@ with demo:
637
  with gr.Row():
638
  tabs = gr.Tabs(elem_classes="tab-buttons")
639
 
640
-
641
  with tabs:
642
  with gr.TabItem("Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
643
  with gr.Row():
@@ -648,7 +732,7 @@ with demo:
648
  interactive=True,
649
  elem_classes="version-selector",
650
  scale=1,
651
- visible=False
652
  )
653
 
654
  with gr.Row():
@@ -656,15 +740,17 @@ with demo:
656
  placeholder="Search by models (use ; to split)",
657
  label="Search",
658
  elem_id="search-bar",
659
- scale=2
660
  )
661
  model_type_filter = gr.Dropdown(
662
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
 
 
663
  label="Access Type",
664
  multiselect=True,
665
  value=[],
666
  interactive=True,
667
- scale=1
668
  )
669
  column_selector = gr.Dropdown(
670
  choices=get_all_column_choices(),
@@ -672,10 +758,12 @@ with demo:
672
  multiselect=True,
673
  value=get_initial_columns(),
674
  interactive=True,
675
- scale=1
676
  )
677
  with gr.Row():
678
- refresh_button = gr.Button("Refresh", scale=0, elem_id="refresh-button")
 
 
679
 
680
  # Create tabs for each category
681
  with gr.Tabs(elem_classes="category-tabs") as category_tabs:
@@ -688,49 +776,99 @@ with demo:
688
  display_name = CATEGORY_DISPLAY_MAP.get(category, category)
689
  elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
690
  with gr.TabItem(display_name, elem_id=elem_id):
691
- category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
 
 
692
  category_leaderboard = init_leaderboard(category_df)
693
 
694
  # Connect search and filter inputs to update function
695
- def update_with_search_filters(version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None):
 
 
 
 
 
696
  """
697
  Update the leaderboards with search and filter settings.
698
  """
699
- return refresh_data_with_filters(version, search_query, model_types, selected_columns)
 
 
700
 
701
  # Refresh button functionality
702
- def refresh_and_update(version, search_query, model_types, selected_columns):
 
 
703
  """
704
  Refresh data, update LEADERBOARD_DF, and return updated components.
705
  """
706
  global LEADERBOARD_DF
707
  main_df = get_leaderboard_df(version=version)
708
  LEADERBOARD_DF = main_df # Update the global DataFrame
709
- return refresh_data_with_filters(version, search_query, model_types, selected_columns)
 
 
710
 
711
  refresh_button.click(
712
  fn=refresh_and_update,
713
- inputs=[version_selector, search_input, model_type_filter, column_selector],
714
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)])
 
 
 
 
 
 
 
 
 
 
715
  # Search input functionality
716
  search_input.change(
717
  fn=refresh_data_with_filters,
718
- inputs=[version_selector, search_input, model_type_filter, column_selector],
719
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
 
 
 
 
 
 
 
 
 
720
  )
721
 
722
  # Model type filter functionality
723
  model_type_filter.change(
724
  fn=refresh_data_with_filters,
725
- inputs=[version_selector, search_input, model_type_filter, column_selector],
726
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
 
 
 
 
 
 
 
 
 
727
  )
728
 
729
  # Version selector functionality
730
  version_selector.change(
731
  fn=refresh_data_with_filters,
732
- inputs=[version_selector, search_input, model_type_filter, column_selector],
733
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
 
 
 
 
 
 
 
 
 
734
  )
735
 
736
  # Update the update_columns function to handle updating all tabs at once
@@ -747,30 +885,52 @@ with demo:
747
  # If no columns are selected, use default visible columns
748
  if not selected_columns or len(selected_columns) == 0:
749
  selected_columns = get_default_visible_columns()
750
- logger.info(f"No columns selected, using defaults: {selected_columns}")
 
 
751
 
752
  # Convert display names to internal names
753
- internal_selected_columns = [x.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("_recall", "_recall_binary").replace("_precision", "_precision_binary") for x in selected_columns]
754
-
 
 
 
 
 
 
 
755
 
756
  # Get the current data with ALL columns preserved
757
  main_df = get_leaderboard_df(version=version_selector.value)
758
 
759
  # Get category dataframes with ALL columns preserved
760
- category_dfs = [get_category_leaderboard_df(category, version=version_selector.value)
761
- for category in CATEGORIES]
 
 
 
 
762
 
763
  # Log columns for debugging
764
  logger.info(f"Main dataframe columns: {list(main_df.columns)}")
765
- logger.info(f"Selected columns (internal): {internal_selected_columns}")
 
 
766
 
767
  # IMPORTANT: Make sure model_name is always included
768
- if 'model_name' in main_df.columns and 'model_name' not in internal_selected_columns:
769
- internal_selected_columns = ['model_name'] + internal_selected_columns
 
 
 
 
 
770
 
771
  # Initialize the main leaderboard with the selected columns
772
  # We're passing the internal_selected_columns directly to preserve the selection
773
- main_leaderboard = init_leaderboard(main_df, internal_selected_columns)
 
 
774
 
775
  # Initialize category dataframes with the same selected columns
776
  # This ensures consistency across all tabs
@@ -778,24 +938,33 @@ with demo:
778
  for df in category_dfs:
779
  # Use the same selected columns for each category
780
  # init_leaderboard will automatically handle filtering to columns that exist
781
- category_leaderboards.append(init_leaderboard(df, internal_selected_columns))
 
 
782
 
783
  return main_leaderboard, *category_leaderboards
784
 
785
  except Exception as e:
786
  logger.error(f"Error updating columns: {e}")
787
  import traceback
 
788
  logger.error(traceback.format_exc())
789
- return leaderboard, *[tab.children[0] for tab in category_tabs.children[1:len(CATEGORIES)+1]]
 
 
 
790
 
791
  # Connect column selector to update function
792
  column_selector.change(
793
  fn=update_columns,
794
  inputs=[column_selector],
795
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
 
 
 
 
796
  )
797
 
798
-
799
  with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
800
  with gr.Row():
801
  with gr.Column():
@@ -804,91 +973,132 @@ with demo:
804
  label="Benchmark Version",
805
  value=CURRENT_VERSION,
806
  interactive=True,
807
- visible=False
808
  )
 
809
  # New: Mode selector
810
  def get_model_mode_choices(version):
811
  df = get_leaderboard_df(version=version)
812
  if df.empty:
813
  return []
814
  # Return a sorted list of "model_name [mode]" strings, one per unique (model_name, mode) pair
815
- return sorted([f"{row['model_name']} [{row['mode']}]" for _, row in df.drop_duplicates(subset=["model_name", "mode"]).iterrows()])
 
 
 
 
 
 
 
816
 
817
  model_mode_selector = gr.Dropdown(
818
  choices=get_model_mode_choices(CURRENT_VERSION),
819
  label="Select Model(s) [Mode] to Compare",
820
  multiselect=True,
821
- interactive=True
822
  )
823
  with gr.Column():
824
  # Prepend the "All Results" option to the categories, using display names
825
- viz_categories_display = ["All Results"] + [CATEGORY_DISPLAY_MAP.get(cat, cat) for cat in CATEGORIES]
 
 
826
  category_selector = gr.Dropdown(
827
  choices=viz_categories_display,
828
  label="Select Category",
829
  value=viz_categories_display[0],
830
- interactive=True
831
  )
832
  metric_selector = gr.Dropdown(
833
- choices=["accuracy", "f1_binary", "precision_binary", "recall_binary", "error_ratio"],
 
 
 
 
 
 
834
  label="Select Metric",
835
  value="accuracy",
836
- interactive=True
837
  )
838
 
839
  plot_output = gr.Plot()
840
 
841
  # Update visualization when any selector changes
842
- def update_visualization_with_mode(selected_model_modes, selected_category, selected_metric, version):
 
 
843
  if not selected_model_modes:
844
  return go.Figure()
845
- df = get_leaderboard_df(version=version) if selected_category == "All Results" else get_category_leaderboard_df(selected_category, version=version)
 
 
 
 
 
 
846
  if df.empty:
847
  return go.Figure()
848
  # Parse selected_model_modes into model_name and mode
849
  selected_pairs = [s.rsplit(" [", 1) for s in selected_model_modes]
850
- selected_pairs = [(name.strip(), mode.strip("] ")) for name, mode in selected_pairs]
851
- mask = df.apply(lambda row: (row['model_name'], str(row['mode'])) in selected_pairs, axis=1)
 
 
 
 
 
 
 
852
  filtered_df = df[mask]
853
- metric_cols = [col for col in filtered_df.columns if selected_metric in col]
 
 
854
  fig = go.Figure()
855
- colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C']
856
  for idx, (model_name, mode) in enumerate(selected_pairs):
857
- model_data = filtered_df[(filtered_df['model_name'] == model_name) & (filtered_df['mode'] == mode)]
 
 
 
858
  if not model_data.empty:
859
  values = model_data[metric_cols].values[0].tolist()
860
  values = values + [values[0]]
861
- categories = [col.replace(f'_{selected_metric}', '') for col in metric_cols]
 
 
 
862
  categories = categories + [categories[0]]
863
- fig.add_trace(go.Scatterpolar(
864
- r=values,
865
- theta=categories,
866
- name=f"{model_name} [{mode}]",
867
- line_color=colors[idx % len(colors)],
868
- fill='toself'
869
- ))
 
 
870
  fig.update_layout(
871
- paper_bgcolor='#000000',
872
- plot_bgcolor='#000000',
873
- font={'color': '#ffffff'},
874
  title={
875
- 'text': f'{selected_category} - {selected_metric.upper()} Score Comparison',
876
- 'font': {'color': '#ffffff', 'size': 24}
877
  },
878
  polar=dict(
879
- bgcolor='#000000',
880
  radialaxis=dict(
881
  visible=True,
882
  range=[0, 1],
883
- gridcolor='#333333',
884
- linecolor='#333333',
885
- tickfont={'color': '#ffffff'},
886
  ),
887
  angularaxis=dict(
888
- gridcolor='#333333',
889
- linecolor='#333333',
890
- tickfont={'color': '#ffffff'},
891
- )
892
  ),
893
  height=600,
894
  showlegend=True,
@@ -897,25 +1107,37 @@ with demo:
897
  y=0.99,
898
  xanchor="right",
899
  x=0.99,
900
- bgcolor='rgba(0,0,0,0.5)',
901
- font={'color': '#ffffff'}
902
- )
903
  )
904
  return fig
905
 
906
  # Connect selectors to update function
907
- for control in [viz_version_selector, model_mode_selector, category_selector, metric_selector]:
 
 
 
 
 
908
  control.change(
909
- fn=lambda smm, sc, s_metric, v: update_visualization_with_mode(smm, CATEGORY_REVERSE_MAP.get(sc, sc), s_metric, v),
910
- inputs=[model_mode_selector, category_selector, metric_selector, viz_version_selector],
911
- outputs=plot_output
 
 
 
 
 
 
 
912
  )
913
 
914
  # Update model_mode_selector choices when version changes
915
  viz_version_selector.change(
916
  fn=get_model_mode_choices,
917
  inputs=[viz_version_selector],
918
- outputs=[model_mode_selector]
919
  )
920
 
921
  # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
@@ -935,7 +1157,7 @@ with demo:
935
  value=CURRENT_VERSION,
936
  interactive=True,
937
  elem_classes="version-selector",
938
- visible=False
939
  )
940
 
941
  with gr.Row():
@@ -948,9 +1170,15 @@ with demo:
948
  value=None,
949
  interactive=True,
950
  )
951
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
 
 
952
  model_type = gr.Dropdown(
953
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
 
 
 
 
954
  label="Model type",
955
  multiselect=False,
956
  value=None,
@@ -966,7 +1194,9 @@ with demo:
966
 
967
  with gr.Column():
968
  precision = gr.Dropdown(
969
- choices=[i.name for i in Precision if i != Precision.Unknown],
 
 
970
  label="Precision",
971
  multiselect=False,
972
  value="float16",
@@ -979,12 +1209,13 @@ with demo:
979
  value="Original",
980
  interactive=True,
981
  )
982
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
 
983
 
984
  with gr.Row():
985
  file_input = gr.File(
986
- label="Upload JSONL Results File",
987
- file_types=[".jsonl"]
988
  )
989
 
990
  submit_button = gr.Button("Submit Results")
@@ -1002,25 +1233,34 @@ with demo:
1002
  mode_selector,
1003
  file_input,
1004
  submission_version_selector,
1005
- guard_model_type
1006
  ],
1007
- outputs=result_output
1008
  )
1009
 
1010
  # Version selector functionality
1011
  version_selector.change(
1012
  fn=update_leaderboards,
1013
  inputs=[version_selector],
1014
- outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
1015
- ).then(lambda version: refresh_data_with_filters(version), inputs=[version_selector], outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)])
 
 
 
 
 
 
 
 
 
 
1016
 
1017
 
1018
  # Set up the scheduler to refresh data periodically
1019
  scheduler = BackgroundScheduler()
1020
- scheduler.add_job(refresh_data, 'interval', minutes=30)
1021
  scheduler.start()
1022
 
1023
  # Launch the app
1024
  if __name__ == "__main__":
1025
-
1026
  demo.launch()
 
47
  RESULTS_DATASET_ID,
48
  SUBMITTER_TOKEN,
49
  TOKEN,
50
+ DATA_PATH,
51
  )
52
  from src.populate import get_leaderboard_df, get_category_leaderboard_df
53
  from src.submission.submit import process_submission
54
 
55
  # Configure logging
56
+ logging.basicConfig(
57
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
58
+ )
59
  logger = logging.getLogger(__name__)
60
 
61
  # Ensure data directory exists
 
78
  primary_hue=colors.slate,
79
  secondary_hue=colors.slate,
80
  neutral_hue=colors.neutral,
81
+ font=(fonts.GoogleFont("Inter"), "sans-serif"),
82
  ).set(
83
  # font_size="16px",
84
  body_background_fill="#0f0f10",
85
  body_background_fill_dark="#0f0f10",
86
  body_text_color="#f4f4f5",
87
  body_text_color_subdued="#a1a1aa",
88
+ block_background_fill="#1e1e1e", # Cooler Grey
89
+ block_border_color="#333333", # Cooler Grey
90
  block_shadow="none",
91
  # Swapped primary and secondary button styles
92
+ button_primary_background_fill="#121212", # Changed to specific color for Refresh button
93
  button_primary_text_color="#f4f4f5",
94
+ button_primary_border_color="#333333", # Keep border grey or change to #121212?
95
  button_secondary_background_fill="#f4f4f5",
96
  button_secondary_text_color="#0f0f10",
97
  button_secondary_border_color="#f4f4f5",
98
+ input_background_fill="#1e1e1e", # Cooler Grey
99
+ input_border_color="#333333", # Cooler Grey
100
  input_placeholder_color="#71717a",
101
+ table_border_color="#333333", # Cooler Grey
102
+ table_even_background_fill="#2d2d2d", # Cooler Grey (Slightly lighter)
103
+ table_odd_background_fill="#1e1e1e", # Cooler Grey
104
  table_text_color="#f4f4f5",
105
  link_text_color="#ffffff",
106
+ border_color_primary="#333333", # Cooler Grey
107
+ background_fill_secondary="#333333", # Cooler Grey
108
  color_accent="#f4f4f5",
109
+ border_color_accent="#333333", # Cooler Grey
110
+ button_primary_background_fill_hover="#424242", # Cooler Grey
111
  block_title_text_color="#f4f4f5",
112
  accordion_text_color="#f4f4f5",
113
+ panel_background_fill="#1e1e1e", # Cooler Grey
114
+ panel_border_color="#333333", # Cooler Grey
115
  # Explicitly setting primary/secondary/accent colors/borders
116
  background_fill_primary="#0f0f10",
117
  background_fill_primary_dark="#0f0f10",
118
+ background_fill_secondary_dark="#333333", # Cooler Grey
119
+ border_color_primary_dark="#333333", # Cooler Grey
120
+ border_color_accent_dark="#333333", # Cooler Grey
121
+ border_color_accent_subdued="#424242", # Cooler Grey
122
  border_color_accent_subdued_dark="#424242", # Cooler Grey
123
  color_accent_soft="#a1a1aa",
124
  color_accent_soft_dark="#a1a1aa",
125
  # Explicitly setting input hover/focus states
126
+ input_background_fill_dark="#1e1e1e", # Cooler Grey
127
+ input_background_fill_focus="#424242", # Cooler Grey
128
+ input_background_fill_focus_dark="#424242", # Cooler Grey
129
+ input_background_fill_hover="#2d2d2d", # Cooler Grey
130
+ input_background_fill_hover_dark="#2d2d2d", # Cooler Grey
131
+ input_border_color_dark="#333333", # Cooler Grey
132
  input_border_color_focus="#f4f4f5",
133
  input_border_color_focus_dark="#f4f4f5",
134
+ input_border_color_hover="#424242", # Cooler Grey
135
+ input_border_color_hover_dark="#424242", # Cooler Grey
136
  input_placeholder_color_dark="#71717a",
137
  # Explicitly set dark variants for table backgrounds
138
+ table_even_background_fill_dark="#2d2d2d", # Cooler Grey
139
+ table_odd_background_fill_dark="#1e1e1e", # Cooler Grey
140
  # Explicitly set dark text variants
141
  body_text_color_dark="#f4f4f5",
142
  body_text_color_subdued_dark="#a1a1aa",
 
144
  accordion_text_color_dark="#f4f4f5",
145
  table_text_color_dark="#f4f4f5",
146
  # Explicitly set dark panel/block variants
147
+ panel_background_fill_dark="#1e1e1e", # Cooler Grey
148
+ panel_border_color_dark="#333333", # Cooler Grey
149
+ block_background_fill_dark="#1e1e1e", # Cooler Grey
150
+ block_border_color_dark="#333333", # Cooler Grey
151
  )
152
 
153
+
154
  @dataclass
155
  class ColumnInfo:
156
  """Information about a column in the leaderboard."""
157
+
158
  name: str
159
  display_name: str
160
  type: str = "text"
 
162
  never_hidden: bool = False
163
  displayed_by_default: bool = True
164
 
165
+
166
  def update_column_choices(df):
167
  """Update column choices based on what's actually in the dataframe"""
168
  if df is None or df.empty:
 
175
  all_columns = get_all_column_choices()
176
 
177
  # Filter to only include columns that exist in the dataframe
178
+ valid_columns = [
179
+ (col_name, display_name)
180
+ for col_name, display_name in all_columns
181
+ if col_name in existing_columns
182
+ ]
183
 
184
  # Return default if there are no valid columns
185
  if not valid_columns:
 
187
 
188
  return valid_columns
189
 
190
+
191
  # Update the column_selector initialization
192
  def get_initial_columns():
193
  """Get initial columns to show in the dropdown"""
 
201
  return get_default_visible_columns()
202
 
203
  # Get default visible columns that actually exist in the dataframe
204
+ valid_defaults = [
205
+ col for col in get_default_visible_columns() if col in available_cols
206
+ ]
207
 
208
  # If none of the defaults exist, return all available columns
209
  if not valid_defaults:
 
214
  logger.error(f"Error getting initial columns: {e}")
215
  return get_default_visible_columns()
216
 
217
+
218
  def init_leaderboard(dataframe, visible_columns=None):
219
  """
220
  Initialize a standard Gradio Dataframe component for the leaderboard.
 
228
  # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
229
 
230
  # Determine which columns to display
231
+ display_column_names = [
232
+ getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS
233
+ ]
234
  hidden_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS]
235
 
236
  # Columns that should always be shown
 
239
  # Use provided visible columns if specified, otherwise use default
240
  if visible_columns is None:
241
  # Determine which columns to show initially
242
+ visible_columns = [
243
+ col for col in display_column_names if col not in hidden_column_names
244
+ ]
245
 
246
  # Always include the never-hidden columns
247
  for col in always_visible:
 
254
  # Map GuardBench column types to Gradio's expected datatype strings
255
  # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
256
  type_mapping = {
257
+ "text": "str",
258
+ "number": "number",
259
+ "bool": "bool",
260
+ "date": "date",
261
+ "markdown": "markdown",
262
+ "html": "html",
263
+ "image": "image",
264
  }
265
 
266
  # Create a list of datatypes in the format Gradio expects
 
272
  if getattr(GUARDBENCH_COLUMN, display_col).name == col:
273
  orig_type = getattr(GUARDBENCH_COLUMN, display_col).type
274
  # Map to Gradio's expected types
275
+ col_type = type_mapping.get(orig_type, "str")
276
  break
277
 
278
  # Default to 'str' if type not found or not mappable
279
  if col_type is None:
280
+ col_type = "str"
281
 
282
  datatypes.append(col_type)
283
 
284
  # Create a dummy column for search functionality if it doesn't exist
285
+ if "search_dummy" not in dataframe.columns:
286
+ dataframe["search_dummy"] = dataframe.apply(
287
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
288
+ axis=1,
289
  )
290
 
291
  # Select only the visible columns for display
292
+ visible_columns.remove("model_name")
293
 
294
+ visible_columns = ["model_name"] + visible_columns
295
  display_df = dataframe[visible_columns].copy()
296
 
297
  # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
 
304
  # Avoid rounding integer columns like counts
305
  if not pd.api.types.is_integer_dtype(display_df[col]):
306
  # Format floats to exactly 3 decimal places, preserving trailing zeros
307
+ display_df[col] = display_df[col].apply(
308
+ lambda x: f"{x:.3f}" if pd.notna(x) else None
309
+ )
310
 
311
+ column_info_map = {
312
+ f.name: getattr(GUARDBENCH_COLUMN, f.name) for f in fields(GUARDBENCH_COLUMN)
313
+ }
314
+ column_mapping = {
315
+ col: column_info_map.get(col, ColumnInfo(col, col)).display_name
316
+ for col in visible_columns
317
+ }
318
 
319
  # Rename columns in the DataFrame
320
  display_df.rename(columns=column_mapping, inplace=True)
321
 
322
  # Apply styling - note: styling might need adjustment if it relies on column names
323
+ styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
324
+ subset=["model_name"], **{"width": "100px"}
325
+ )
326
 
327
  return gr.Dataframe(
328
  value=styler,
 
331
  wrap=True,
332
  height=2500,
333
  elem_id="leaderboard-table",
334
+ row_count=len(display_df),
335
  )
336
 
337
 
338
+ def search_filter_leaderboard(
339
+ df, search_query="", model_types=None, version=CURRENT_VERSION
340
+ ):
341
  """
342
  Filter the leaderboard based on search query and model types.
343
  """
 
347
  filtered_df = df.copy()
348
 
349
  # Add search dummy column if it doesn't exist
350
+ if "search_dummy" not in filtered_df.columns:
351
+ filtered_df["search_dummy"] = filtered_df.apply(
352
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
353
+ axis=1,
354
  )
355
 
356
  # Apply model type filter
357
  if model_types and len(model_types) > 0:
358
+ filtered_df = filtered_df[
359
+ filtered_df[GUARDBENCH_COLUMN.model_type.name].isin(model_types)
360
+ ]
361
 
362
  # Apply search query
363
  if search_query:
364
+ search_terms = [
365
+ term.strip() for term in search_query.split(";") if term.strip()
366
+ ]
367
  if search_terms:
368
  combined_mask = None
369
  for term in search_terms:
370
+ mask = filtered_df["search_dummy"].str.contains(
371
+ term, case=False, na=False
372
+ )
373
  if combined_mask is None:
374
  combined_mask = mask
375
  else:
 
379
  filtered_df = filtered_df[combined_mask]
380
 
381
  # Drop the search dummy column before returning
382
+ visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
383
  return filtered_df[visible_columns]
384
 
385
 
386
+ def refresh_data_with_filters(
387
+ version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None
388
+ ):
389
  """
390
  Refresh the leaderboard data and update all components with filtering.
391
  Ensures we handle cases where dataframes might have limited columns.
 
396
  # Get new data
397
  main_df = get_leaderboard_df(version=version)
398
  LEADERBOARD_DF = main_df
399
+ category_dfs = [
400
+ get_category_leaderboard_df(category, version=version)
401
+ for category in CATEGORIES
402
+ ]
403
+ selected_columns = [
404
+ x.lower()
405
+ .replace(" ", "_")
406
+ .replace("(", "")
407
+ .replace(")", "")
408
+ .replace("_recall", "_recall_binary")
409
+ .replace("_precision", "_precision_binary")
410
+ for x in selected_columns
411
+ ]
412
 
413
  # Log the actual columns we have
414
  logger.info(f"Main dataframe columns: {list(main_df.columns)}")
415
 
416
  # Apply filters to each dataframe
417
+ filtered_main_df = search_filter_leaderboard(
418
+ main_df, search_query, model_types, version
419
+ )
420
  filtered_category_dfs = [
421
  search_filter_leaderboard(df, search_query, model_types, version)
422
  for df in category_dfs
 
428
  # Filter selected columns to only those available in the data
429
  if selected_columns:
430
  # Convert display names to internal names first
431
+ internal_selected_columns = [
432
+ x.lower()
433
+ .replace(" ", "_")
434
+ .replace("(", "")
435
+ .replace(")", "")
436
+ .replace("_recall", "_recall_binary")
437
+ .replace("_precision", "_precision_binary")
438
+ for x in selected_columns
439
+ ]
440
+ valid_selected_columns = [
441
+ col for col in internal_selected_columns if col in available_columns
442
+ ]
443
+ if not valid_selected_columns and "model_name" in available_columns:
444
  # Fallback if conversion/filtering leads to empty selection
445
+ valid_selected_columns = ["model_name"] + [
446
+ col
447
+ for col in get_default_visible_columns()
448
+ if col in available_columns
449
+ ]
450
  else:
451
  # If no columns were selected in the dropdown, use default visible columns that exist
452
+ valid_selected_columns = [
453
+ col for col in get_default_visible_columns() if col in available_columns
454
+ ]
455
 
456
  # Initialize dataframes for display with valid selected columns
457
  main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
 
460
  category_dataframes = []
461
  for df in filtered_category_dfs:
462
  df_columns = list(df.columns)
463
+ df_valid_columns = [
464
+ col for col in valid_selected_columns if col in df_columns
465
+ ]
466
+ if not df_valid_columns and "model_name" in df_columns:
467
+ df_valid_columns = ["model_name"] + get_default_visible_columns()
468
  category_dataframes.append(init_leaderboard(df, df_valid_columns))
469
 
470
  return main_dataframe, *category_dataframes
 
472
  except Exception as e:
473
  logger.error(f"Error in refresh with filters: {e}")
474
  # Return the current leaderboards on error
475
+ return leaderboard, *[
476
+ tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
477
+ ]
478
 
479
 
480
  def submit_results(
 
487
  mode: str,
488
  submission_file: tempfile._TemporaryFileWrapper,
489
  version: str,
490
+ guard_model_type: GuardModelType,
491
  ):
492
  """
493
  Handle submission of results with model metadata.
 
517
  "model_type": model_type,
518
  "mode": mode,
519
  "version": version,
520
+ "guard_model_type": guard_model_type,
521
  }
522
 
523
  # Process the submission
 
526
  # Refresh the leaderboard data
527
  global LEADERBOARD_DF
528
  try:
529
+ logger.info(
530
+ f"Refreshing leaderboard data after submission for version {version}..."
531
+ )
532
  LEADERBOARD_DF = get_leaderboard_df(version=version)
533
  logger.info("Refreshed leaderboard data after submission")
534
  except Exception as e:
 
545
  logger.info(f"Performing scheduled refresh of leaderboard data...")
546
  # Get new data
547
  main_df = get_leaderboard_df(version=version)
548
+ category_dfs = [
549
+ get_category_leaderboard_df(category, version=version)
550
+ for category in CATEGORIES
551
+ ]
552
 
553
  # For gr.Dataframe, we return the actual dataframes
554
  return main_df, *category_dfs
 
564
  """
565
  try:
566
  new_df = get_leaderboard_df(version=version)
567
+ category_dfs = [
568
+ get_category_leaderboard_df(category, version=version)
569
+ for category in CATEGORIES
570
+ ]
571
  return new_df, *category_dfs
572
  except Exception as e:
573
  logger.error(f"Error updating leaderboards for version {version}: {e}")
574
  return None, *[None for _ in CATEGORIES]
575
 
576
 
577
+ def create_performance_plot(
578
+ selected_models, category, metric="f1_binary", version=CURRENT_VERSION
579
+ ):
580
  """
581
  Create a radar plot comparing model performance for selected models.
582
  """
 
589
  return go.Figure()
590
 
591
  # Filter for selected models
592
+ df = df[df["model_name"].isin(selected_models)]
593
 
594
  # Get the relevant metric columns
595
  metric_cols = [col for col in df.columns if metric in col]
 
598
  fig = go.Figure()
599
 
600
  # Custom colors for different models
601
+ colors = [
602
+ "#8FCCCC",
603
+ "#C2A4B6",
604
+ "#98B4A6",
605
+ "#B68F7C",
606
+ ] # Pale Cyan, Pale Pink, Pale Green, Pale Orange
607
 
608
  # Add traces for each model
609
  for idx, model in enumerate(selected_models):
610
+ model_data = df[df["model_name"] == model]
611
  if not model_data.empty:
612
  values = model_data[metric_cols].values[0].tolist()
613
  # Add the first value again at the end to complete the polygon
614
  values = values + [values[0]]
615
 
616
  # Clean up test type names
617
+ categories = [col.replace(f"_{metric}", "") for col in metric_cols]
618
  # Add the first category again at the end to complete the polygon
619
  categories = categories + [categories[0]]
620
 
621
+ fig.add_trace(
622
+ go.Scatterpolar(
623
+ r=values,
624
+ theta=categories,
625
+ name=model,
626
+ line_color=colors[idx % len(colors)],
627
+ fill="toself",
628
+ )
629
+ )
630
 
631
  # Update layout with all settings at once
632
  fig.update_layout(
633
+ paper_bgcolor="#000000",
634
+ plot_bgcolor="#000000",
635
+ font={"color": "#ffffff"},
636
  title={
637
+ "text": f"{category} - {metric.upper()} Score Comparison",
638
+ "font": {"color": "#ffffff", "size": 24},
639
  },
640
  polar=dict(
641
+ bgcolor="#000000",
642
  radialaxis=dict(
643
  visible=True,
644
  range=[0, 1],
645
+ gridcolor="#333333",
646
+ linecolor="#333333",
647
+ tickfont={"color": "#ffffff"},
648
  ),
649
  angularaxis=dict(
650
+ gridcolor="#333333",
651
+ linecolor="#333333",
652
+ tickfont={"color": "#ffffff"},
653
+ ),
654
  ),
655
  height=600,
656
  showlegend=True,
 
659
  y=0.99,
660
  xanchor="right",
661
  x=0.99,
662
+ bgcolor="rgba(0,0,0,0.5)",
663
+ font={"color": "#ffffff"},
664
+ ),
665
  )
666
 
667
  return fig
 
674
  df = get_leaderboard_df(version=version)
675
  if df.empty:
676
  return []
677
+ return sorted(df["model_name"].unique().tolist())
678
 
679
 
680
  def update_visualization(selected_models, selected_category, selected_metric, version):
 
683
  """
684
  if not selected_models:
685
  return go.Figure()
686
+ return create_performance_plot(
687
+ selected_models, selected_category, selected_metric, version
688
+ )
689
 
690
 
691
  # Create Gradio app
692
  demo = gr.Blocks(css=custom_css, theme=custom_theme)
693
 
694
  CATEGORY_DISPLAY_MAP = {
695
+ "Political Corruption and Legal Evasion": "Corruption & Legal Evasion",
696
+ "Financial Fraud and Unethical Business": "Financial Fraud",
697
+ "AI Manipulation and Jailbreaking": "AI Jailbreaking",
698
+ "Child Exploitation and Abuse": "Child Exploitation",
699
+ "Hate Speech, Extremism, and Discrimination": "Hate Speech",
700
+ "Labor Exploitation and Human Trafficking": "Labor Exploitation",
701
+ "Manipulation, Deception, and Misinformation": "Misinformation",
702
+ "Environmental and Industrial Harm": "Environmental Harm",
703
+ "Academic Dishonesty and Cheating": "Academic Dishonesty",
704
+ "Self–Harm and Suicidal Ideation": "Self-Harm",
705
+ "Animal Cruelty and Exploitation": "Animal Harm",
706
+ "Criminal, Violent, and Terrorist Activity": "Crime & Violence",
707
+ "Drug– and Substance–Related Activities": "Drug Use",
708
+ "Sexual Content and Violence": "Sexual Content",
709
+ "Weapon, Explosives, and Hazardous Materials": "Weapons & Harmful Materials",
710
+ "Cybercrime, Hacking, and Digital Exploits": "Cybercrime",
711
+ "Creative Content Involving Illicit Themes": "Illicit Creative",
712
+ "Safe Prompts": "Safe Prompts",
713
  }
714
  # Create reverse mapping for lookups
715
  CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
 
722
  with gr.Row():
723
  tabs = gr.Tabs(elem_classes="tab-buttons")
724
 
 
725
  with tabs:
726
  with gr.TabItem("Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
727
  with gr.Row():
 
732
  interactive=True,
733
  elem_classes="version-selector",
734
  scale=1,
735
+ visible=False,
736
  )
737
 
738
  with gr.Row():
 
740
  placeholder="Search by models (use ; to split)",
741
  label="Search",
742
  elem_id="search-bar",
743
+ scale=2,
744
  )
745
  model_type_filter = gr.Dropdown(
746
+ choices=[
747
+ t.to_str(" : ") for t in ModelType if t != ModelType.Unknown
748
+ ],
749
  label="Access Type",
750
  multiselect=True,
751
  value=[],
752
  interactive=True,
753
+ scale=1,
754
  )
755
  column_selector = gr.Dropdown(
756
  choices=get_all_column_choices(),
 
758
  multiselect=True,
759
  value=get_initial_columns(),
760
  interactive=True,
761
+ scale=1,
762
  )
763
  with gr.Row():
764
+ refresh_button = gr.Button(
765
+ "Refresh", scale=0, elem_id="refresh-button"
766
+ )
767
 
768
  # Create tabs for each category
769
  with gr.Tabs(elem_classes="category-tabs") as category_tabs:
 
776
  display_name = CATEGORY_DISPLAY_MAP.get(category, category)
777
  elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
778
  with gr.TabItem(display_name, elem_id=elem_id):
779
+ category_df = get_category_leaderboard_df(
780
+ category, version=CURRENT_VERSION
781
+ )
782
  category_leaderboard = init_leaderboard(category_df)
783
 
784
  # Connect search and filter inputs to update function
785
+ def update_with_search_filters(
786
+ version=CURRENT_VERSION,
787
+ search_query="",
788
+ model_types=None,
789
+ selected_columns=None,
790
+ ):
791
  """
792
  Update the leaderboards with search and filter settings.
793
  """
794
+ return refresh_data_with_filters(
795
+ version, search_query, model_types, selected_columns
796
+ )
797
 
798
  # Refresh button functionality
799
+ def refresh_and_update(
800
+ version, search_query, model_types, selected_columns
801
+ ):
802
  """
803
  Refresh data, update LEADERBOARD_DF, and return updated components.
804
  """
805
  global LEADERBOARD_DF
806
  main_df = get_leaderboard_df(version=version)
807
  LEADERBOARD_DF = main_df # Update the global DataFrame
808
+ return refresh_data_with_filters(
809
+ version, search_query, model_types, selected_columns
810
+ )
811
 
812
  refresh_button.click(
813
  fn=refresh_and_update,
814
+ inputs=[
815
+ version_selector,
816
+ search_input,
817
+ model_type_filter,
818
+ column_selector,
819
+ ],
820
+ outputs=[leaderboard]
821
+ + [
822
+ category_tabs.children[i].children[0]
823
+ for i in range(1, len(CATEGORIES) + 1)
824
+ ],
825
+ )
826
  # Search input functionality
827
  search_input.change(
828
  fn=refresh_data_with_filters,
829
+ inputs=[
830
+ version_selector,
831
+ search_input,
832
+ model_type_filter,
833
+ column_selector,
834
+ ],
835
+ outputs=[leaderboard]
836
+ + [
837
+ category_tabs.children[i].children[0]
838
+ for i in range(1, len(CATEGORIES) + 1)
839
+ ],
840
  )
841
 
842
  # Model type filter functionality
843
  model_type_filter.change(
844
  fn=refresh_data_with_filters,
845
+ inputs=[
846
+ version_selector,
847
+ search_input,
848
+ model_type_filter,
849
+ column_selector,
850
+ ],
851
+ outputs=[leaderboard]
852
+ + [
853
+ category_tabs.children[i].children[0]
854
+ for i in range(1, len(CATEGORIES) + 1)
855
+ ],
856
  )
857
 
858
  # Version selector functionality
859
  version_selector.change(
860
  fn=refresh_data_with_filters,
861
+ inputs=[
862
+ version_selector,
863
+ search_input,
864
+ model_type_filter,
865
+ column_selector,
866
+ ],
867
+ outputs=[leaderboard]
868
+ + [
869
+ category_tabs.children[i].children[0]
870
+ for i in range(1, len(CATEGORIES) + 1)
871
+ ],
872
  )
873
 
874
  # Update the update_columns function to handle updating all tabs at once
 
885
  # If no columns are selected, use default visible columns
886
  if not selected_columns or len(selected_columns) == 0:
887
  selected_columns = get_default_visible_columns()
888
+ logger.info(
889
+ f"No columns selected, using defaults: {selected_columns}"
890
+ )
891
 
892
  # Convert display names to internal names
893
+ internal_selected_columns = [
894
+ x.lower()
895
+ .replace(" ", "_")
896
+ .replace("(", "")
897
+ .replace(")", "")
898
+ .replace("_recall", "_recall_binary")
899
+ .replace("_precision", "_precision_binary")
900
+ for x in selected_columns
901
+ ]
902
 
903
  # Get the current data with ALL columns preserved
904
  main_df = get_leaderboard_df(version=version_selector.value)
905
 
906
  # Get category dataframes with ALL columns preserved
907
+ category_dfs = [
908
+ get_category_leaderboard_df(
909
+ category, version=version_selector.value
910
+ )
911
+ for category in CATEGORIES
912
+ ]
913
 
914
  # Log columns for debugging
915
  logger.info(f"Main dataframe columns: {list(main_df.columns)}")
916
+ logger.info(
917
+ f"Selected columns (internal): {internal_selected_columns}"
918
+ )
919
 
920
  # IMPORTANT: Make sure model_name is always included
921
+ if (
922
+ "model_name" in main_df.columns
923
+ and "model_name" not in internal_selected_columns
924
+ ):
925
+ internal_selected_columns = [
926
+ "model_name"
927
+ ] + internal_selected_columns
928
 
929
  # Initialize the main leaderboard with the selected columns
930
  # We're passing the internal_selected_columns directly to preserve the selection
931
+ main_leaderboard = init_leaderboard(
932
+ main_df, internal_selected_columns
933
+ )
934
 
935
  # Initialize category dataframes with the same selected columns
936
  # This ensures consistency across all tabs
 
938
  for df in category_dfs:
939
  # Use the same selected columns for each category
940
  # init_leaderboard will automatically handle filtering to columns that exist
941
+ category_leaderboards.append(
942
+ init_leaderboard(df, internal_selected_columns)
943
+ )
944
 
945
  return main_leaderboard, *category_leaderboards
946
 
947
  except Exception as e:
948
  logger.error(f"Error updating columns: {e}")
949
  import traceback
950
+
951
  logger.error(traceback.format_exc())
952
+ return leaderboard, *[
953
+ tab.children[0]
954
+ for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
955
+ ]
956
 
957
  # Connect column selector to update function
958
  column_selector.change(
959
  fn=update_columns,
960
  inputs=[column_selector],
961
+ outputs=[leaderboard]
962
+ + [
963
+ category_tabs.children[i].children[0]
964
+ for i in range(1, len(CATEGORIES) + 1)
965
+ ],
966
  )
967
 
 
968
  with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1):
969
  with gr.Row():
970
  with gr.Column():
 
973
  label="Benchmark Version",
974
  value=CURRENT_VERSION,
975
  interactive=True,
976
+ visible=False,
977
  )
978
+
979
  # New: Mode selector
980
  def get_model_mode_choices(version):
981
  df = get_leaderboard_df(version=version)
982
  if df.empty:
983
  return []
984
  # Return list of tuples (model_name, mode)
985
+ return sorted(
986
+ [
987
+ f"{row['model_name']} [{row['mode']}]"
988
+ for _, row in df.drop_duplicates(
989
+ subset=["model_name", "mode"]
990
+ ).iterrows()
991
+ ]
992
+ )
993
 
994
  model_mode_selector = gr.Dropdown(
995
  choices=get_model_mode_choices(CURRENT_VERSION),
996
  label="Select Model(s) [Mode] to Compare",
997
  multiselect=True,
998
+ interactive=True,
999
  )
1000
  with gr.Column():
1001
  # Add Overall Performance to categories, use display names
1002
+ viz_categories_display = ["All Results"] + [
1003
+ CATEGORY_DISPLAY_MAP.get(cat, cat) for cat in CATEGORIES
1004
+ ]
1005
  category_selector = gr.Dropdown(
1006
  choices=viz_categories_display,
1007
  label="Select Category",
1008
  value=viz_categories_display[0],
1009
+ interactive=True,
1010
  )
1011
  metric_selector = gr.Dropdown(
1012
+ choices=[
1013
+ "accuracy",
1014
+ "f1_binary",
1015
+ "precision_binary",
1016
+ "recall_binary",
1017
+ "error_ratio",
1018
+ ],
1019
  label="Select Metric",
1020
  value="accuracy",
1021
+ interactive=True,
1022
  )
1023
 
1024
  plot_output = gr.Plot()
1025
 
1026
  # Update visualization when any selector changes
1027
+ def update_visualization_with_mode(
1028
+ selected_model_modes, selected_category, selected_metric, version
1029
+ ):
1030
  if not selected_model_modes:
1031
  return go.Figure()
1032
+ df = (
1033
+ get_leaderboard_df(version=version)
1034
+ if selected_category == "All Results"
1035
+ else get_category_leaderboard_df(
1036
+ selected_category, version=version
1037
+ )
1038
+ )
1039
  if df.empty:
1040
  return go.Figure()
1041
  # Parse selected_model_modes into model_name and mode
1042
  selected_pairs = [s.rsplit(" [", 1) for s in selected_model_modes]
1043
+ selected_pairs = [
1044
+ (name.strip(), mode.strip("] "))
1045
+ for name, mode in selected_pairs
1046
+ ]
1047
+ mask = df.apply(
1048
+ lambda row: (row["model_name"], str(row["mode"]))
1049
+ in selected_pairs,
1050
+ axis=1,
1051
+ )
1052
  filtered_df = df[mask]
1053
+ metric_cols = [
1054
+ col for col in filtered_df.columns if selected_metric in col
1055
+ ]
1056
  fig = go.Figure()
1057
+ colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
1058
  for idx, (model_name, mode) in enumerate(selected_pairs):
1059
+ model_data = filtered_df[
1060
+ (filtered_df["model_name"] == model_name)
1061
+ & (filtered_df["mode"] == mode)
1062
+ ]
1063
  if not model_data.empty:
1064
  values = model_data[metric_cols].values[0].tolist()
1065
  values = values + [values[0]]
1066
+ categories = [
1067
+ col.replace(f"_{selected_metric}", "")
1068
+ for col in metric_cols
1069
+ ]
1070
  categories = categories + [categories[0]]
1071
+ fig.add_trace(
1072
+ go.Scatterpolar(
1073
+ r=values,
1074
+ theta=categories,
1075
+ name=f"{model_name} [{mode}]",
1076
+ line_color=colors[idx % len(colors)],
1077
+ fill="toself",
1078
+ )
1079
+ )
1080
  fig.update_layout(
1081
+ paper_bgcolor="#000000",
1082
+ plot_bgcolor="#000000",
1083
+ font={"color": "#ffffff"},
1084
  title={
1085
+ "text": f"{selected_category} - {selected_metric.upper()} Score Comparison",
1086
+ "font": {"color": "#ffffff", "size": 24},
1087
  },
1088
  polar=dict(
1089
+ bgcolor="#000000",
1090
  radialaxis=dict(
1091
  visible=True,
1092
  range=[0, 1],
1093
+ gridcolor="#333333",
1094
+ linecolor="#333333",
1095
+ tickfont={"color": "#ffffff"},
1096
  ),
1097
  angularaxis=dict(
1098
+ gridcolor="#333333",
1099
+ linecolor="#333333",
1100
+ tickfont={"color": "#ffffff"},
1101
+ ),
1102
  ),
1103
  height=600,
1104
  showlegend=True,
 
1107
  y=0.99,
1108
  xanchor="right",
1109
  x=0.99,
1110
+ bgcolor="rgba(0,0,0,0.5)",
1111
+ font={"color": "#ffffff"},
1112
+ ),
1113
  )
1114
  return fig
1115
 
1116
  # Connect selectors to update function
1117
+ for control in [
1118
+ viz_version_selector,
1119
+ model_mode_selector,
1120
+ category_selector,
1121
+ metric_selector,
1122
+ ]:
1123
  control.change(
1124
+ fn=lambda smm, sc, s_metric, v: update_visualization_with_mode(
1125
+ smm, CATEGORY_REVERSE_MAP.get(sc, sc), s_metric, v
1126
+ ),
1127
+ inputs=[
1128
+ model_mode_selector,
1129
+ category_selector,
1130
+ metric_selector,
1131
+ viz_version_selector,
1132
+ ],
1133
+ outputs=plot_output,
1134
  )
1135
 
1136
  # Update model_mode_selector choices when version changes
1137
  viz_version_selector.change(
1138
  fn=get_model_mode_choices,
1139
  inputs=[viz_version_selector],
1140
+ outputs=[model_mode_selector],
1141
  )
1142
 
1143
  # with gr.TabItem("About", elem_id="guardbench-about-tab", id=2):
 
1157
  value=CURRENT_VERSION,
1158
  interactive=True,
1159
  elem_classes="version-selector",
1160
+ visible=False,
1161
  )
1162
 
1163
  with gr.Row():
 
1170
  value=None,
1171
  interactive=True,
1172
  )
1173
+ revision_name_textbox = gr.Textbox(
1174
+ label="Revision commit", placeholder="main"
1175
+ )
1176
  model_type = gr.Dropdown(
1177
+ choices=[
1178
+ t.to_str(" : ")
1179
+ for t in ModelType
1180
+ if t != ModelType.Unknown
1181
+ ],
1182
  label="Model type",
1183
  multiselect=False,
1184
  value=None,
 
1194
 
1195
  with gr.Column():
1196
  precision = gr.Dropdown(
1197
+ choices=[
1198
+ i.name for i in Precision if i != Precision.Unknown
1199
+ ],
1200
  label="Precision",
1201
  multiselect=False,
1202
  value="float16",
 
1209
  value="Original",
1210
  interactive=True,
1211
  )
1212
+ base_model_name_textbox = gr.Textbox(
1213
+ label="Base model (for delta or adapter weights)"
1214
+ )
1215
 
1216
  with gr.Row():
1217
  file_input = gr.File(
1218
+ label="Upload JSONL Results File", file_types=[".jsonl"]
 
1219
  )
1220
 
1221
  submit_button = gr.Button("Submit Results")
 
1233
  mode_selector,
1234
  file_input,
1235
  submission_version_selector,
1236
+ guard_model_type,
1237
  ],
1238
+ outputs=result_output,
1239
  )
1240
 
1241
  # Version selector functionality
1242
  version_selector.change(
1243
  fn=update_leaderboards,
1244
  inputs=[version_selector],
1245
+ outputs=[leaderboard]
1246
+ + [
1247
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1248
+ ],
1249
+ ).then(
1250
+ lambda version: refresh_data_with_filters(version),
1251
+ inputs=[version_selector],
1252
+ outputs=[leaderboard]
1253
+ + [
1254
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1255
+ ],
1256
+ )
1257
 
1258
 
1259
  # Set up the scheduler to refresh data periodically
1260
  scheduler = BackgroundScheduler()
1261
+ scheduler.add_job(refresh_data, "interval", minutes=30)
1262
  scheduler.start()
1263
 
1264
  # Launch the app
1265
  if __name__ == "__main__":
 
1266
  demo.launch()