new metrics

src/leaderboard/processor.py  (CHANGED: +6 -25)
@@ -32,13 +32,11 @@ def calculate_integral_score(row: pd.Series) -> float:

Before (old lines 32-44; lines removed by this commit are prefixed with "-"):

         integral_score *= row[metric_col]
         metric_count += 1

-    #
     if metric_count == 0:
-
-        integral_score *= row["macro_accuracy"]
-        metric_count += 1
-    else:
-        return 0.0  # Cannot calculate score without primary metrics

     # Error Penalty
     micro_error_col = "micro_avg_error_ratio"
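Both hunks open inside the body of calculate_integral_score, after the per-test-type accuracy metrics have already been multiplied into integral_score. The loop that produces integral_score and metric_count sits above the hunk and is not part of the diff; the sketch below shows one plausible shape for it, and the TEST_TYPES values and the *_macro_accuracy column naming are assumptions rather than code from the repository.

import pandas as pd

# Assumed context only: TEST_TYPES values and the column naming scheme are guesses.
TEST_TYPES = ["classification", "extraction"]
row = pd.Series({
    "classification_macro_accuracy": 0.9,
    "extraction_macro_accuracy": 0.8,
})

integral_score = 1.0
metric_count = 0
for test_type in TEST_TYPES:
    metric_col = f"{test_type}_macro_accuracy"  # assumed naming
    if metric_col in row and pd.notna(row[metric_col]):
        integral_score *= row[metric_col]  # accuracies multiply into the score
        metric_count += 1

print(round(integral_score, 2), metric_count)  # 0.72 2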
@@ -46,32 +44,15 @@ def calculate_integral_score(row: pd.Series) -> float:

Before (old lines 46-77):

         # Micro error is stored as %, convert back to ratio
         micro_error_ratio = row[micro_error_col] / 100.0
         integral_score *= (1.0 - micro_error_ratio)
-    else:
-        # Fallback: Calculate average error from per-test-type
-        error_ratios = []
-        for test_type in TEST_TYPES:
-            error_col = f"{test_type}_error_ratio"
-            if error_col in row and pd.notna(row[error_col]):
-                error_ratios.append(row[error_col])
-        if error_ratios:
-            avg_error_ratio = np.mean(error_ratios)
-            integral_score *= (1.0 - avg_error_ratio)

     # Runtime Penalty
     micro_runtime_col = "micro_avg_runtime_ms"
     if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
         avg_runtime_ms = row[micro_runtime_col]
-    else:
-        # Fallback: Calculate average runtime from per-test-type
-        runtimes = []
-        for test_type in TEST_TYPES:
-            runtime_col = f"{test_type}_avg_runtime_ms"
-            if runtime_col in row and pd.notna(row[runtime_col]):
-                runtimes.append(row[runtime_col])
-        avg_runtime_ms = np.mean(runtimes) if runtimes else None

     if avg_runtime_ms is not None:
-        # Apply penalty based on runtime (
         runtime = max(
             min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
             MIN_PUNISHABLE_RUNTIME_MS,
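Both deleted blocks were fallbacks: when micro_avg_error_ratio or micro_avg_runtime_ms was absent from the row, the old code rebuilt the figure as the np.mean of the per-test-type columns. Only the micro-average path survives. The arithmetic of the two penalties is illustrated below; the hunk ends before the line that turns the clamped runtime into a multiplicative factor, so the constants and the linear ramp here are assumptions made purely to show concrete numbers.

# Hedged sketch of the penalty arithmetic; the constants and the linear
# runtime-to-factor mapping are assumptions, not taken from the diff.
MIN_PUNISHABLE_RUNTIME_MS = 1_000
MAX_PUNISHABLE_RUNTIME_MS = 60_000
MAX_RUNTIME_PENALTY = 0.5            # assumed cap on the runtime penalty

integral_score = 0.72                # product of the accuracy metrics
micro_avg_error_ratio = 5.0          # stored as a percentage in the table
integral_score *= 1.0 - micro_avg_error_ratio / 100.0    # 0.72 * 0.95 = 0.684

avg_runtime_ms = 120_000
runtime = max(min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
              MIN_PUNISHABLE_RUNTIME_MS)                  # clamped to 60_000
fraction = (runtime - MIN_PUNISHABLE_RUNTIME_MS) / (
    MAX_PUNISHABLE_RUNTIME_MS - MIN_PUNISHABLE_RUNTIME_MS)
integral_score *= 1.0 - MAX_RUNTIME_PENALTY * fraction    # 0.684 * 0.5 = 0.342

print(round(integral_score, 3))      # 0.342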
After (new lines 32-42; lines added by this commit are prefixed with "+"):

         integral_score *= row[metric_col]
         metric_count += 1

+    # If no accuracy metrics were found at all, the score remains 1.0 before penalties.
+    # The library returns 0.0 in this case (`return integral_score if count > 0 else 0.0`)
+    # Let's add that check back before applying penalties.
     if metric_count == 0:
+        return 0.0

     # Error Penalty
     micro_error_col = "micro_avg_error_ratio"
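The practical effect of hunk 1: as far as the removed lines show, a row with no per-test-type accuracy columns used to fall back to a bare macro_accuracy column, whereas it now returns 0.0 before any penalty is applied. A small contrast, using a hypothetical row that only carries macro_accuracy:

import pandas as pd

row = pd.Series({"macro_accuracy": 0.8})  # no per-test-type accuracy columns

# Removed fallback: the score started from row["macro_accuracy"] and the
# penalties were then applied on top of it.
old_score_before_penalties = 1.0 * row["macro_accuracy"]  # 0.8

# New behaviour: metric_count is 0, so calculate_integral_score returns 0.0.
new_score = 0.0

print(old_score_before_penalties, new_score)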
After (new lines 44-58):

         # Micro error is stored as %, convert back to ratio
         micro_error_ratio = row[micro_error_col] / 100.0
         integral_score *= (1.0 - micro_error_ratio)

     # Runtime Penalty
+    avg_runtime_ms = None  # Initialize
     micro_runtime_col = "micro_avg_runtime_ms"
     if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
         avg_runtime_ms = row[micro_runtime_col]

     if avg_runtime_ms is not None:
+        # Apply penalty based on runtime (only if micro avg runtime was found)
         runtime = max(
             min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
             MIN_PUNISHABLE_RUNTIME_MS,
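With the fallbacks gone, a submission that reports only per-test-type error and runtime columns no longer receives those penalties at all, because neither gating condition fires. A quick check of the two conditions, with the classification test type and its column values assumed for illustration:

import pandas as pd

# Row carrying only per-test-type columns, no micro averages (values assumed).
row = pd.Series({
    "classification_error_ratio": 0.10,
    "classification_avg_runtime_ms": 4_000,
})

micro_error_col = "micro_avg_error_ratio"
micro_runtime_col = "micro_avg_runtime_ms"

# After this commit neither penalty branch runs for such a row.
print(micro_error_col in row and pd.notna(row[micro_error_col]))      # False
print(micro_runtime_col in row and pd.notna(row[micro_runtime_col]))  # False
# Before this commit, the removed fallbacks would have averaged the
# per-test-type columns with np.mean and applied both penalties anyway.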