apsys commited on
Commit
5565a34
·
1 Parent(s): ddcdcd7

new metrics

Browse files
Files changed (2) hide show
  1. src/display/utils.py +48 -3
  2. src/leaderboard/processor.py +127 -23
src/display/utils.py CHANGED
@@ -138,7 +138,7 @@ class GuardBenchColumn:
138
  name="default_prompts_f1",
139
  display_name="Default Prompts F1",
140
  type="number",
141
- displayed_by_default=True
142
  ))
143
  default_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
144
  name="default_prompts_recall_binary",
@@ -176,7 +176,7 @@ class GuardBenchColumn:
176
  name="jailbreaked_prompts_f1",
177
  display_name="Jailbreaked Prompts F1",
178
  type="number",
179
- displayed_by_default=True
180
  ))
181
  jailbreaked_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
182
  name="jailbreaked_prompts_recall_binary",
@@ -214,7 +214,7 @@ class GuardBenchColumn:
214
  name="default_answers_f1",
215
  display_name="Default Answers F1",
216
  type="number",
217
- displayed_by_default=True
218
  ))
219
  default_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
220
  name="default_answers_recall_binary",
@@ -279,6 +279,51 @@ class GuardBenchColumn:
279
  displayed_by_default=False
280
  ))
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
  # Create instances for easy access
284
  GUARDBENCH_COLUMN = GuardBenchColumn()
 
138
  name="default_prompts_f1",
139
  display_name="Default Prompts F1",
140
  type="number",
141
+ displayed_by_default=False
142
  ))
143
  default_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
144
  name="default_prompts_recall_binary",
 
176
  name="jailbreaked_prompts_f1",
177
  display_name="Jailbreaked Prompts F1",
178
  type="number",
179
+ displayed_by_default=False
180
  ))
181
  jailbreaked_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
182
  name="jailbreaked_prompts_recall_binary",
 
214
  name="default_answers_f1",
215
  display_name="Default Answers F1",
216
  type="number",
217
+ displayed_by_default=False
218
  ))
219
  default_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
220
  name="default_answers_recall_binary",
 
279
  displayed_by_default=False
280
  ))
281
 
282
+ # Calculated overall metrics (renamed)
283
+ macro_accuracy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
284
+ name="macro_accuracy",
285
+ display_name="Macro Accuracy",
286
+ type="number",
287
+ displayed_by_default=True
288
+ ))
289
+ macro_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
290
+ name="macro_recall",
291
+ display_name="Macro Recall",
292
+ type="number",
293
+ displayed_by_default=True
294
+ ))
295
+ macro_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
296
+ name="macro_precision",
297
+ display_name="Macro Precision",
298
+ type="number",
299
+ displayed_by_default=False
300
+ ))
301
+ integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
302
+ name="integral_score",
303
+ display_name="Integral Score",
304
+ type="number",
305
+ displayed_by_default=True
306
+ ))
307
+ # NEW Summary Metrics
308
+ micro_avg_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
309
+ name="micro_avg_error_ratio",
310
+ display_name="Micro Error %",
311
+ type="number",
312
+ displayed_by_default=True
313
+ ))
314
+ micro_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
315
+ name="micro_avg_runtime_ms",
316
+ display_name="Micro Avg Time (ms)",
317
+ type="number",
318
+ displayed_by_default=True
319
+ ))
320
+ total_evals_count: ColumnInfo = field(default_factory=lambda: ColumnInfo(
321
+ name="total_evals_count",
322
+ display_name="Total Evals Count",
323
+ type="number",
324
+ displayed_by_default=True
325
+ ))
326
+
327
 
328
  # Create instances for easy access
329
  GUARDBENCH_COLUMN = GuardBenchColumn()
src/leaderboard/processor.py CHANGED
@@ -7,9 +7,86 @@ import os
7
  import pandas as pd
8
  from datetime import datetime
9
  from typing import Dict, List, Any, Tuple
 
10
 
11
  from src.display.utils import CATEGORIES, TEST_TYPES, METRICS
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def load_leaderboard_data(file_path: str) -> Dict:
15
  """
@@ -133,29 +210,39 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
133
  row[f"{test_type}_f1"] = metrics[metric]
134
 
135
  # Calculate averages if not present
136
- if "average_f1" not in row:
137
  f1_values = []
138
  for test_type in TEST_TYPES:
139
- if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
140
  f1_values.append(avg_metrics[test_type]["f1_binary"])
141
  if f1_values:
142
- row["average_f1"] = sum(f1_values) / len(f1_values)
143
 
144
- if "average_recall" not in row:
145
  recall_values = []
146
  for test_type in TEST_TYPES:
147
- if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
148
  recall_values.append(avg_metrics[test_type]["recall_binary"])
149
  if recall_values:
150
- row["average_recall"] = sum(recall_values) / len(recall_values)
151
 
152
- if "average_precision" not in row:
153
- precision_values = []
 
154
  for test_type in TEST_TYPES:
155
- if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
156
- precision_values.append(avg_metrics[test_type]["precision_binary"])
157
- if precision_values:
158
- row["average_precision"] = sum(precision_values) / len(precision_values)
 
 
 
 
 
 
 
 
 
159
 
160
  rows.append(row)
161
 
@@ -164,17 +251,34 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
164
 
165
  # Ensure all expected columns exist
166
  for test_type in TEST_TYPES:
167
- if f"{test_type}_f1" not in df.columns:
168
- df[f"{test_type}_f1"] = None
169
- if f"{test_type}_f1_binary" not in df.columns:
170
- df[f"{test_type}_f1_binary"] = None
171
- if f"{test_type}_recall_binary" not in df.columns:
172
- df[f"{test_type}_recall_binary"] = None
173
- if f"{test_type}_precision_binary" not in df.columns:
174
- df[f"{test_type}_precision_binary"] = None
175
-
176
- if not df.empty and "average_f1" in df.columns:
177
- df = df.sort_values(by="average_f1", ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  return df
180
 
 
7
  import pandas as pd
8
  from datetime import datetime
9
  from typing import Dict, List, Any, Tuple
10
+ import numpy as np
11
 
12
  from src.display.utils import CATEGORIES, TEST_TYPES, METRICS
13
 
14
+ # Constants for Integral Score calculation (mirrors guardbench library)
15
+ MAX_PUNISHABLE_RUNTIME_MS = 6000.0
16
+ MIN_PUNISHABLE_RUNTIME_MS = 200.0
17
+ MAX_RUNTIME_PENALTY = 0.75 # Corresponds to 1.0 - MIN_TIME_FACTOR, library used 0.75
18
+
19
+ def calculate_integral_score(row: pd.Series) -> float:
20
+ """
21
+ Calculate the integral score for a given model entry row.
22
+ Uses F1-binary as the primary metric, error ratio, and runtime penalty.
23
+ """
24
+ integral_score = 1.0
25
+ metric_count = 0
26
+
27
+ # Primary metric (using f1_binary, could be changed to accuracy if needed)
28
+ for test_type in TEST_TYPES:
29
+ metric_col = f"{test_type}_f1_binary"
30
+ if metric_col in row and pd.notna(row[metric_col]):
31
+ integral_score *= row[metric_col]
32
+ metric_count += 1
33
+
34
+ # If no primary metrics found, return 0
35
+ if metric_count == 0:
36
+ # Check for average_f1 as a fallback
37
+ if "average_f1" in row and pd.notna(row["average_f1"]):
38
+ integral_score *= row["average_f1"]
39
+ metric_count += 1
40
+ else:
41
+ return 0.0 # Cannot calculate score without primary metrics
42
+
43
+ # Account for average errors across all test types (using a simple average for now)
44
+ # This requires micro-level error data which isn't directly in avg_metrics.
45
+ # We'll approximate using the average of available error ratios.
46
+ error_ratios = []
47
+ for test_type in TEST_TYPES:
48
+ error_col = f"{test_type}_error_ratio"
49
+ if error_col in row and pd.notna(row[error_col]):
50
+ error_ratios.append(row[error_col])
51
+
52
+ if error_ratios:
53
+ avg_error_ratio = np.mean(error_ratios)
54
+ integral_score *= (1.0 - avg_error_ratio)
55
+
56
+ # Account for average runtime across all test types (using a simple average for now)
57
+ # This requires micro-level runtime data. We'll approximate.
58
+ runtimes = []
59
+ for test_type in TEST_TYPES:
60
+ runtime_col = f"{test_type}_avg_runtime_ms"
61
+ if runtime_col in row and pd.notna(row[runtime_col]):
62
+ runtimes.append(row[runtime_col])
63
+
64
+ if runtimes:
65
+ avg_runtime_ms = np.mean(runtimes)
66
+
67
+ # Apply penalty based on runtime
68
+ runtime = max(
69
+ min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
70
+ MIN_PUNISHABLE_RUNTIME_MS,
71
+ )
72
+
73
+ if MAX_PUNISHABLE_RUNTIME_MS > MIN_PUNISHABLE_RUNTIME_MS:
74
+ normalized_time = (runtime - MIN_PUNISHABLE_RUNTIME_MS) / (
75
+ MAX_PUNISHABLE_RUNTIME_MS - MIN_PUNISHABLE_RUNTIME_MS
76
+ )
77
+ time_factor = 1.0 - MAX_RUNTIME_PENALTY * normalized_time
78
+ else:
79
+ time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY) # Assign max penalty if runtime exceeds min when max==min
80
+
81
+ # Make sure the factor is not less than the minimum value (1 - MAX_PENALTY)
82
+ time_factor = max((1.0 - MAX_RUNTIME_PENALTY), time_factor)
83
+ integral_score *= time_factor
84
+
85
+ # Root the score by the number of primary metrics used? (Optional, library did this)
86
+ # return integral_score ** (1 / metric_count) if metric_count > 0 else 0.0
87
+ # Let's skip the rooting for now to keep the scale potentially larger.
88
+ return integral_score
89
+
90
 
91
  def load_leaderboard_data(file_path: str) -> Dict:
92
  """
 
210
  row[f"{test_type}_f1"] = metrics[metric]
211
 
212
  # Calculate averages if not present
213
+ if "macro_accuracy" not in row:
214
  f1_values = []
215
  for test_type in TEST_TYPES:
216
+ if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["f1_binary"]):
217
  f1_values.append(avg_metrics[test_type]["f1_binary"])
218
  if f1_values:
219
+ row["macro_accuracy"] = sum(f1_values) / len(f1_values)
220
 
221
+ if "macro_recall" not in row:
222
  recall_values = []
223
  for test_type in TEST_TYPES:
224
+ if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["recall_binary"]):
225
  recall_values.append(avg_metrics[test_type]["recall_binary"])
226
  if recall_values:
227
+ row["macro_recall"] = sum(recall_values) / len(recall_values)
228
 
229
+ if "total_evals_count" not in row:
230
+ total_samples = 0
231
+ found_samples = False
232
  for test_type in TEST_TYPES:
233
+ if test_type in avg_metrics and "sample_count" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["sample_count"]):
234
+ total_samples += avg_metrics[test_type]["sample_count"]
235
+ found_samples = True
236
+ if found_samples:
237
+ row["total_evals_count"] = total_samples
238
+
239
+ # Extract micro averages directly from entry if they exist (like in guardbench library)
240
+ row["micro_avg_error_ratio"] = entry.get("micro_avg_error_ratio", pd.NA)
241
+ row["micro_avg_runtime_ms"] = entry.get("micro_avg_runtime_ms", pd.NA)
242
+
243
+ # Convert error ratio to percentage for consistency with display name
244
+ if pd.notna(row["micro_avg_error_ratio"]):
245
+ row["micro_avg_error_ratio"] *= 100
246
 
247
  rows.append(row)
248
 
 
251
 
252
  # Ensure all expected columns exist
253
  for test_type in TEST_TYPES:
254
+ for metric in METRICS:
255
+ col_name = f"{test_type}_{metric}"
256
+ if col_name not in df.columns:
257
+ df[col_name] = pd.NA # Use pd.NA for missing numeric data
258
+ # Add non-binary F1 if binary exists
259
+ if metric == "f1_binary" and f"{test_type}_f1" not in df.columns:
260
+ df[f"{test_type}_f1"] = df[col_name] # Copy f1_binary to f1 if f1 is missing
261
+
262
+ # Calculate Integral Score
263
+ if not df.empty:
264
+ df["integral_score"] = df.apply(calculate_integral_score, axis=1)
265
+ # Sort by Integral Score instead of average_f1
266
+ df = df.sort_values(by="integral_score", ascending=False, na_position='last')
267
+ else:
268
+ # Add the column even if empty
269
+ df["integral_score"] = pd.NA
270
+
271
+ # Ensure summary columns exist
272
+ summary_cols = ["macro_accuracy", "macro_recall", "micro_avg_error_ratio", "micro_avg_runtime_ms", "total_evals_count"]
273
+ for col in summary_cols:
274
+ if col not in df.columns:
275
+ df[col] = pd.NA
276
+
277
+ # Remove old average columns if they somehow snuck in
278
+ old_avg_cols = ["average_f1", "average_recall", "average_precision"]
279
+ for col in old_avg_cols:
280
+ if col in df.columns:
281
+ df = df.drop(columns=[col])
282
 
283
  return df
284