apsys committed on
Commit
75d59af
1 Parent(s): a3c3e83

new metrics

Files changed (1)
  1. src/leaderboard/processor.py +6 -25
src/leaderboard/processor.py CHANGED

@@ -32,13 +32,11 @@ def calculate_integral_score(row: pd.Series) -> float:
         integral_score *= row[metric_col]
         metric_count += 1
 
-    # Fallback if no primary metrics found
+    # If no accuracy metrics were found at all, the score would still be 1.0 at
+    # this point, which is meaningless without primary metrics. Return 0.0 here,
+    # before any penalties are applied.
     if metric_count == 0:
-        if "macro_accuracy" in row and pd.notna(row["macro_accuracy"]):
-            integral_score *= row["macro_accuracy"]
-            metric_count += 1
-        else:
-            return 0.0  # Cannot calculate score without primary metrics
+        return 0.0
 
     # Error Penalty
     micro_error_col = "micro_avg_error_ratio"
@@ -46,32 +44,15 @@ def calculate_integral_score(row: pd.Series) -> float:
         # Micro error is stored as %, convert back to ratio
         micro_error_ratio = row[micro_error_col] / 100.0
         integral_score *= (1.0 - micro_error_ratio)
-    else:
-        # Fallback: Calculate average error from per-test-type
-        error_ratios = []
-        for test_type in TEST_TYPES:
-            error_col = f"{test_type}_error_ratio"
-            if error_col in row and pd.notna(row[error_col]):
-                error_ratios.append(row[error_col])
-        if error_ratios:
-            avg_error_ratio = np.mean(error_ratios)
-            integral_score *= (1.0 - avg_error_ratio)
 
     # Runtime Penalty
+    avg_runtime_ms = None  # Initialize
     micro_runtime_col = "micro_avg_runtime_ms"
     if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
         avg_runtime_ms = row[micro_runtime_col]
-    else:
-        # Fallback: Calculate average runtime from per-test-type
-        runtimes = []
-        for test_type in TEST_TYPES:
-            runtime_col = f"{test_type}_avg_runtime_ms"
-            if runtime_col in row and pd.notna(row[runtime_col]):
-                runtimes.append(row[runtime_col])
-        avg_runtime_ms = np.mean(runtimes) if runtimes else None
 
     if avg_runtime_ms is not None:
-        # Apply penalty based on runtime (using micro or calculated average)
+        # Apply penalty based on runtime (only if micro avg runtime was found)
         runtime = max(
             min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
             MIN_PUNISHABLE_RUNTIME_MS,
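
The net effect of the commit: `calculate_integral_score` now trusts the micro-averaged columns exclusively. The `macro_accuracy` fallback and the per-test-type error/runtime averaging (the only uses of `np.mean` visible in this diff) are gone, and a row without any primary metric simply scores 0.0. Below is a minimal sketch of the post-commit logic for reference; the loop header, the `PRIMARY_METRIC_COLS` name, the constant values, and the runtime-penalty formula (the hunk is truncated before it) are assumptions, not code from this repository.

```python
import pandas as pd

# Hypothetical values; the real constants and metric columns are defined
# elsewhere in src/leaderboard/processor.py and are not shown in this diff.
MIN_PUNISHABLE_RUNTIME_MS = 100.0
MAX_PUNISHABLE_RUNTIME_MS = 10_000.0
PRIMARY_METRIC_COLS = ["code_accuracy", "math_accuracy"]


def calculate_integral_score(row: pd.Series) -> float:
    integral_score = 1.0
    metric_count = 0

    # Multiply every primary metric that is present into the score
    # (matching the `integral_score *= row[metric_col]` context lines).
    for metric_col in PRIMARY_METRIC_COLS:
        if metric_col in row and pd.notna(row[metric_col]):
            integral_score *= row[metric_col]
            metric_count += 1

    # Post-commit behavior: no primary metrics, no score.
    if metric_count == 0:
        return 0.0

    # Error penalty: micro_avg_error_ratio is stored as a percentage.
    if "micro_avg_error_ratio" in row and pd.notna(row["micro_avg_error_ratio"]):
        micro_error_ratio = row["micro_avg_error_ratio"] / 100.0
        integral_score *= 1.0 - micro_error_ratio

    # Runtime penalty, now taken only from the micro average; the
    # per-test-type fallback is gone.
    avg_runtime_ms = None
    if "micro_avg_runtime_ms" in row and pd.notna(row["micro_avg_runtime_ms"]):
        avg_runtime_ms = row["micro_avg_runtime_ms"]

    if avg_runtime_ms is not None:
        # Clamp into the punishable window, exactly as in the diff.
        runtime = max(
            min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
            MIN_PUNISHABLE_RUNTIME_MS,
        )
        # Hypothetical penalty shape: linear from no penalty at the fast end
        # of the window down to a 0.5x multiplier at the slow end. The diff
        # is cut off before the real formula.
        frac = (runtime - MIN_PUNISHABLE_RUNTIME_MS) / (
            MAX_PUNISHABLE_RUNTIME_MS - MIN_PUNISHABLE_RUNTIME_MS
        )
        integral_score *= 1.0 - 0.5 * frac

    return integral_score
```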
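A quick check of the arithmetic under those assumptions:

```python
row = pd.Series({
    "code_accuracy": 0.80,          # hypothetical primary metric
    "math_accuracy": 0.90,          # hypothetical primary metric
    "micro_avg_error_ratio": 5.0,   # stored as a percent -> ratio 0.05
    "micro_avg_runtime_ms": 100.0,  # at the window floor -> no runtime penalty
})

# 0.80 * 0.90 = 0.72, then * (1 - 0.05) = 0.684; a runtime at the window
# floor leaves the score unchanged under the hypothetical penalty above.
print(calculate_integral_score(row))  # ~0.684
```

Note that a row missing all primary metric columns no longer falls back to `macro_accuracy`: with `metric_count == 0` the function now returns 0.0 outright.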