akera committed
Commit 83243ea · verified · 1 Parent(s): f54baf8

Update src/leaderboard.py

Files changed (1):
  1. src/leaderboard.py +51 -311
src/leaderboard.py CHANGED
@@ -11,15 +11,13 @@ from config import (
     HF_TOKEN,
     EVALUATION_TRACKS,
     MODEL_CATEGORIES,
-    STATISTICAL_CONFIG,
     METRICS_CONFIG,
-    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 from src.utils import create_submission_id, sanitize_model_name
 
 
-def initialize_scientific_leaderboard() -> pd.DataFrame:
-    """Initialize empty scientific leaderboard DataFrame with all required columns."""
+def initialize_leaderboard() -> pd.DataFrame:
+    """Initialize empty leaderboard DataFrame with all required columns."""
 
     columns = {
         # Basic information
@@ -33,104 +31,76 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
         # Track-specific quality scores
         "google_comparable_quality": [],
         "ug40_complete_quality": [],
-        "language_pair_matrix_quality": [],
 
         # Track-specific BLEU scores
         "google_comparable_bleu": [],
         "ug40_complete_bleu": [],
-        "language_pair_matrix_bleu": [],
 
         # Track-specific ChrF scores
         "google_comparable_chrf": [],
         "ug40_complete_chrf": [],
-        "language_pair_matrix_chrf": [],
 
-        # Statistical metadata
+        # Confidence intervals
         "google_comparable_ci_lower": [],
         "google_comparable_ci_upper": [],
         "ug40_complete_ci_lower": [],
         "ug40_complete_ci_upper": [],
-        "language_pair_matrix_ci_lower": [],
-        "language_pair_matrix_ci_upper": [],
 
         # Coverage information
         "google_comparable_samples": [],
         "ug40_complete_samples": [],
-        "language_pair_matrix_samples": [],
         "google_comparable_pairs": [],
         "ug40_complete_pairs": [],
-        "language_pair_matrix_pairs": [],
-
-        # Statistical adequacy flags
-        "google_comparable_adequate": [],
-        "ug40_complete_adequate": [],
-        "language_pair_matrix_adequate": [],
 
         # Detailed results (JSON strings)
         "detailed_google_comparable": [],
         "detailed_ug40_complete": [],
-        "detailed_language_pair_matrix": [],
-        "cross_track_analysis": [],
 
         # Metadata
         "evaluation_date": [],
-        "leaderboard_version": [],
-        "scientific_adequacy_score": [],
     }
 
     return pd.DataFrame(columns)
 
 
-def load_scientific_leaderboard() -> pd.DataFrame:
-    """Load current scientific leaderboard from HuggingFace dataset."""
+def load_leaderboard() -> pd.DataFrame:
+    """Load current leaderboard from HuggingFace dataset."""
 
     try:
-        print("📥 Loading scientific leaderboard...")
-        dataset = load_dataset(LEADERBOARD_DATASET + "-scientific", split="train")
+        print("📥 Loading leaderboard...")
+        dataset = load_dataset(LEADERBOARD_DATASET, split="train", token=HF_TOKEN)
         df = dataset.to_pandas()
 
         # Ensure all required columns exist
-        required_columns = list(initialize_scientific_leaderboard().columns)
+        required_columns = list(initialize_leaderboard().columns)
         for col in required_columns:
             if col not in df.columns:
                 if "quality" in col or "bleu" in col or "chrf" in col or "ci_" in col:
                     df[col] = 0.0
                 elif "samples" in col or "pairs" in col:
                     df[col] = 0
-                elif "adequate" in col:
-                    df[col] = False
-                elif col == "scientific_adequacy_score":
-                    df[col] = 0.0
-                elif col == "leaderboard_version":
-                    df[col] = 2  # Scientific version
                 else:
                     df[col] = ""
 
-        # Ensure proper data types for boolean columns
-        boolean_columns = [col for col in df.columns if "adequate" in col]
-        for col in boolean_columns:
-            df[col] = df[col].fillna(False).astype(bool)
-
         # Ensure proper data types for numeric columns
         numeric_columns = [
             col for col in df.columns
-            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs", "adequacy"])
-            and "adequate" not in col
+            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
         ]
         for col in numeric_columns:
             df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
 
-        print(f"✅ Loaded scientific leaderboard with {len(df)} entries")
+        print(f"✅ Loaded leaderboard with {len(df)} entries")
         return df
 
     except Exception as e:
-        print(f"⚠️ Could not load scientific leaderboard: {e}")
-        print("🔄 Initializing empty scientific leaderboard...")
-        return initialize_scientific_leaderboard()
+        print(f"⚠️ Could not load leaderboard: {e}")
+        print("🔄 Initializing empty leaderboard...")
+        return initialize_leaderboard()
 
 
-def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
-    """Save scientific leaderboard to HuggingFace dataset."""
+def save_leaderboard(df: pd.DataFrame) -> bool:
+    """Save leaderboard to HuggingFace dataset."""
 
     try:
         # Clean data before saving
@@ -139,45 +109,42 @@ def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
         # Ensure numeric columns are proper types
         numeric_columns = [
             col for col in df_clean.columns
-            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs", "adequacy"])
+            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
         ]
 
         for col in numeric_columns:
             if col in df_clean.columns:
-                if "adequate" in col:
-                    df_clean[col] = df_clean[col].astype(bool)
-                else:
-                    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)
+                df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)
 
         # Convert to dataset
         dataset = Dataset.from_pandas(df_clean)
 
         # Push to hub
         dataset.push_to_hub(
-            LEADERBOARD_DATASET + "-scientific",
+            LEADERBOARD_DATASET,
             token=HF_TOKEN,
-            commit_message=f"Update scientific leaderboard - {datetime.datetime.now().isoformat()[:19]}",
+            commit_message=f"Update leaderboard - {datetime.datetime.now().isoformat()[:19]}",
         )
 
-        print("✅ Scientific leaderboard saved successfully!")
+        print("✅ Leaderboard saved successfully!")
        return True
 
     except Exception as e:
-        print(f"❌ Error saving scientific leaderboard: {e}")
+        print(f"❌ Error saving leaderboard: {e}")
         return False
 
 
-def add_model_to_scientific_leaderboard(
+def add_model_to_leaderboard(
     model_name: str,
     author: str,
     evaluation_results: Dict,
     model_category: str = "community",
     description: str = "",
 ) -> pd.DataFrame:
-    """Add new model results to scientific leaderboard."""
+    """Add new model results to leaderboard."""
 
     # Load current leaderboard
-    df = load_scientific_leaderboard()
+    df = load_leaderboard()
 
     # Remove existing entry if present
     existing_mask = df["model_name"] == model_name
@@ -186,10 +153,6 @@ def add_model_to_scientific_leaderboard(
 
     # Extract track results
     tracks = evaluation_results.get("tracks", {})
-    cross_track = evaluation_results.get("cross_track_analysis", {})
-
-    # Calculate scientific adequacy score
-    adequacy_score = calculate_scientific_adequacy_score(evaluation_results)
 
     # Prepare new entry
     new_entry = {
@@ -203,22 +166,17 @@ def add_model_to_scientific_leaderboard(
         # Extract track-specific metrics
         **extract_track_metrics(tracks),
 
-        # Statistical metadata
-        **extract_statistical_metadata(tracks),
+        # Confidence intervals
+        **extract_confidence_intervals(tracks),
 
         # Coverage information
         **extract_coverage_information(tracks),
 
-        # Adequacy flags
-        **extract_adequacy_flags(tracks),
-
         # Detailed results (JSON strings)
-        **serialize_detailed_results(tracks, cross_track),
+        **serialize_detailed_results(tracks),
 
         # Metadata
         "evaluation_date": datetime.datetime.now().isoformat(),
-        "leaderboard_version": 2,
-        "scientific_adequacy_score": adequacy_score,
     }
 
     # Convert to DataFrame and append
@@ -226,7 +184,7 @@ def add_model_to_scientific_leaderboard(
     updated_df = pd.concat([df, new_row_df], ignore_index=True)
 
     # Save to hub
-    save_scientific_leaderboard(updated_df)
+    save_leaderboard(updated_df)
 
     return updated_df
 
@@ -252,20 +210,20 @@ def extract_track_metrics(tracks: Dict) -> Dict:
     return metrics
 
 
-def extract_statistical_metadata(tracks: Dict) -> Dict:
+def extract_confidence_intervals(tracks: Dict) -> Dict:
     """Extract confidence intervals from each track."""
 
-    metadata = {}
+    ci_data = {}
 
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
-        track_statistics = track_data.get("track_statistics", {})
+        track_confidence = track_data.get("track_confidence", {})
 
-        quality_stats = track_statistics.get("quality_score", {})
-        metadata[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
-        metadata[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))
+        quality_stats = track_confidence.get("quality_score", {})
+        ci_data[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
+        ci_data[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))
 
-    return metadata
+    return ci_data
 
 
 def extract_coverage_information(tracks: Dict) -> Dict:
@@ -283,24 +241,7 @@ def extract_coverage_information(tracks: Dict) -> Dict:
     return coverage
 
 
-def extract_adequacy_flags(tracks: Dict) -> Dict:
-    """Extract statistical adequacy flags for each track."""
-
-    adequacy = {}
-
-    for track_name in EVALUATION_TRACKS.keys():
-        track_data = tracks.get(track_name, {})
-        summary = track_data.get("summary", {})
-
-        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
-        is_adequate = summary.get("total_samples", 0) >= min_required
-
-        adequacy[f"{track_name}_adequate"] = bool(is_adequate)
-
-    return adequacy
-
-
-def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
+def serialize_detailed_results(tracks: Dict) -> Dict:
     """Serialize detailed results for storage."""
 
     detailed = {}
@@ -308,55 +249,24 @@ def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
 
-        # Remove non-serializable data
-        safe_track_data = {}
-        for key, value in track_data.items():
-            if key != "sample_metrics":  # Skip large DataFrames
-                safe_track_data[key] = value
+        # Create simplified detailed results for storage
+        simple_track_data = {
+            "pair_metrics": track_data.get("pair_metrics", {}),
+            "track_averages": track_data.get("track_averages", {}),
+            "track_confidence": track_data.get("track_confidence", {}),
+            "summary": track_data.get("summary", {})
+        }
 
-        detailed[f"detailed_{track_name}"] = json.dumps(safe_track_data)
-
-        detailed["cross_track_analysis"] = json.dumps(cross_track)
+        detailed[f"detailed_{track_name}"] = json.dumps(simple_track_data)
 
     return detailed
 
 
-def calculate_scientific_adequacy_score(evaluation_results: Dict) -> float:
-    """Calculate overall scientific adequacy score (0-1)."""
-
-    tracks = evaluation_results.get("tracks", {})
-
-    adequacy_scores = []
-
-    for track_name in EVALUATION_TRACKS.keys():
-        track_data = tracks.get(track_name, {})
-        summary = track_data.get("summary", {})
-
-        if track_data.get("error"):
-            adequacy_scores.append(0.0)
-            continue
-
-        # Sample size adequacy
-        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
-        sample_adequacy = min(summary.get("total_samples", 0) / max(min_required, 1), 1.0)
-
-        # Coverage adequacy
-        total_possible_pairs = len(EVALUATION_TRACKS[track_name]["languages"]) * (len(EVALUATION_TRACKS[track_name]["languages"]) - 1)
-        coverage_adequacy = summary.get("language_pairs_evaluated", 0) / max(total_possible_pairs, 1)
-
-        # Track adequacy
-        track_adequacy = (sample_adequacy + coverage_adequacy) / 2
-        adequacy_scores.append(track_adequacy)
-
-    return float(np.mean(adequacy_scores))
-
-
 def get_track_leaderboard(
     df: pd.DataFrame,
     track: str,
     metric: str = "quality",
-    category_filter: str = "all",
-    min_adequacy: float = 0.0
+    category_filter: str = "all"
 ) -> pd.DataFrame:
     """Get leaderboard for a specific track with filtering."""
 
@@ -364,29 +274,19 @@ def get_track_leaderboard(
         return df
 
     track_quality_col = f"{track}_{metric}"
-    track_adequate_col = f"{track}_adequate"
 
     # Ensure columns exist
-    if track_quality_col not in df.columns or track_adequate_col not in df.columns:
-        print(f"Warning: Missing columns for track {track}")
+    if track_quality_col not in df.columns:
+        print(f"Warning: Missing column for track {track}")
         return pd.DataFrame()
 
-    # Filter by adequacy
-    if min_adequacy > 0:
-        adequacy_mask = df["scientific_adequacy_score"] >= min_adequacy
-        df = df[adequacy_mask]
-
     # Filter by category
     if category_filter != "all":
         df = df[df["model_category"] == category_filter]
 
-    # Filter to models that have this track - fix boolean operation
-    # Convert to proper boolean and handle NaN values
+    # Filter to models that have this track
     quality_mask = pd.to_numeric(df[track_quality_col], errors='coerce') > 0
-    adequate_mask = df[track_adequate_col].fillna(False).astype(bool)
-
-    valid_mask = quality_mask & adequate_mask
-    df = df[valid_mask]
+    df = df[quality_mask]
 
     if df.empty:
         return df
@@ -397,7 +297,7 @@ def get_track_leaderboard(
     return df
 
 
-def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
+def prepare_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
     """Prepare track-specific leaderboard for display."""
 
     if df.empty:
@@ -414,7 +314,6 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
         f"{track}_ci_upper",
         f"{track}_samples",
         f"{track}_pairs",
-        f"{track}_adequate",
     ]
 
     # Only include columns that exist
@@ -461,167 +360,8 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
         f"{track}_confidence_interval": "95% CI",
         f"{track}_samples": "Samples",
         f"{track}_pairs": "Pairs",
-        f"{track}_adequate": "Adequate",
     }
 
     display_df = display_df.rename(columns=column_renames)
 
-    return display_df
-
-
-def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
-    """Get comprehensive statistics for the scientific leaderboard."""
-
-    if df.empty:
-        return {
-            "total_models": 0,
-            "models_by_category": {},
-            "track_statistics": {},
-            "adequacy_distribution": {},
-            "best_models_by_track": {},
-        }
-
-    stats = {
-        "total_models": len(df),
-        "models_by_category": df["model_category"].value_counts().to_dict(),
-        "adequacy_distribution": {},
-        "track_statistics": {},
-        "best_models_by_track": {},
-    }
-
-    # Adequacy distribution
-    adequacy_bins = pd.cut(
-        df["scientific_adequacy_score"],
-        bins=[0, 0.3, 0.6, 0.8, 1.0],
-        labels=["Poor", "Fair", "Good", "Excellent"]
-    )
-    stats["adequacy_distribution"] = adequacy_bins.value_counts().to_dict()
-
-    # Track-specific statistics
-    for track_name in EVALUATION_TRACKS.keys():
-        quality_col = f"{track_name}_quality"
-        adequate_col = f"{track_name}_adequate"
-
-        if quality_col in df.columns and adequate_col in df.columns:
-            track_models = df[df[adequate_col] & (df[quality_col] > 0)]
-
-            if len(track_models) > 0:
-                stats["track_statistics"][track_name] = {
-                    "participating_models": len(track_models),
-                    "avg_quality": float(track_models[quality_col].mean()),
-                    "std_quality": float(track_models[quality_col].std()),
-                    "best_quality": float(track_models[quality_col].max()),
-                }
-
-                # Best model for this track
-                best_model = track_models.loc[track_models[quality_col].idxmax()]
-                stats["best_models_by_track"][track_name] = {
-                    "name": best_model["model_name"],
-                    "category": best_model["model_category"],
-                    "quality": float(best_model[quality_col]),
-                }
-
-    return stats
-
-
-def perform_fair_comparison(
-    df: pd.DataFrame,
-    model_names: List[str],
-    shared_pairs_only: bool = True
-) -> Dict:
-    """Perform fair comparison between models using only shared language pairs."""
-
-    models = df[df["model_name"].isin(model_names)]
-
-    if len(models) == 0:
-        return {"error": "No models found"}
-
-    comparison = {
-        "models": list(models["model_name"]),
-        "fair_comparison_possible": True,
-        "track_comparisons": {},
-        "statistical_significance": {},
-        "recommendations": [],
-    }
-
-    # Check if fair comparison is possible
-    categories = models["model_category"].unique()
-    if len(categories) > 1:
-        comparison["recommendations"].append(
-            "⚠️ Comparing models from different categories - interpret results carefully"
-        )
-
-    # For each track, compare models
-    for track_name in EVALUATION_TRACKS.keys():
-        quality_col = f"{track_name}_quality"
-        adequate_col = f"{track_name}_adequate"
-
-        track_models = models[models[adequate_col] & (models[quality_col] > 0)]
-
-        if len(track_models) >= 2:
-            comparison["track_comparisons"][track_name] = {
-                "participating_models": len(track_models),
-                "quality_scores": dict(zip(track_models["model_name"], track_models[quality_col])),
-                "confidence_intervals": {},
-            }
-
-            # Extract confidence intervals
-            for _, model in track_models.iterrows():
-                ci_lower = model.get(f"{track_name}_ci_lower", 0)
-                ci_upper = model.get(f"{track_name}_ci_upper", 0)
-                comparison["track_comparisons"][track_name]["confidence_intervals"][model["model_name"]] = [ci_lower, ci_upper]
-
-    return comparison
-
-
-def export_scientific_leaderboard(
-    df: pd.DataFrame,
-    track: str = "all",
-    format: str = "csv",
-    include_detailed: bool = False
-) -> str:
-    """Export scientific leaderboard in specified format."""
-
-    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    if track != "all":
-        # Export specific track
-        export_df = prepare_track_leaderboard_display(df, track)
-        filename_prefix = f"salt_leaderboard_{track}_{timestamp}"
-    else:
-        # Export all tracks
-        if include_detailed:
-            export_df = df.copy()
-        else:
-            # Select essential columns
-            essential_columns = [
-                "model_name", "author", "submission_date", "model_category",
-                "scientific_adequacy_score"
-            ]
-
-            # Add track-specific quality scores
-            for track_name in EVALUATION_TRACKS.keys():
-                essential_columns.extend([
-                    f"{track_name}_quality",
-                    f"{track_name}_adequate",
-                ])
-
-            available_columns = [col for col in essential_columns if col in df.columns]
-            export_df = df[available_columns].copy()
-
-        filename_prefix = f"salt_leaderboard_scientific_{timestamp}"
-
-    # Export in specified format
-    if format == "csv":
-        filename = f"{filename_prefix}.csv"
-        export_df.to_csv(filename, index=False)
-    elif format == "json":
-        filename = f"{filename_prefix}.json"
-        export_df.to_json(filename, orient="records", indent=2)
-    elif format == "xlsx":
-        filename = f"{filename_prefix}.xlsx"
-        export_df.to_excel(filename, index=False)
-    else:
-        raise ValueError(f"Unsupported format: {format}")
-
-    return filename
+    return display_df
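
For reference, a minimal usage sketch of the simplified API after this commit. The function names and signatures come from the diff above; the shape of `evaluation_results` (a "tracks" dict whose per-track entries carry "pair_metrics", "track_averages", "track_confidence", and "summary") is inferred from serialize_detailed_results and extract_confidence_intervals, and the concrete field values are made up for illustration. Note that add_model_to_leaderboard reads from and pushes to the Hub dataset, so it needs a valid HF_TOKEN and LEADERBOARD_DATASET in config.

# Hypothetical example — evaluation_results structure is an assumption, not confirmed by the repo.
from src.leaderboard import (
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display,
)

example_results = {
    "tracks": {
        "google_comparable": {
            # Per-track aggregates consumed by extract_track_metrics (assumed keys)
            "track_averages": {"quality_score": 0.71, "bleu": 28.4, "chrf": 51.2},
            # Confidence intervals read by extract_confidence_intervals
            "track_confidence": {"quality_score": {"ci_lower": 0.69, "ci_upper": 0.73}},
            # Stored verbatim inside the detailed_* JSON columns
            "pair_metrics": {},
            "summary": {"total_samples": 5000, "language_pairs_evaluated": 10},
        },
    },
}

# Append (or replace) the model's row and push the updated dataset to the Hub
df = add_model_to_leaderboard(
    model_name="my-org/my-mt-model",
    author="akera",
    evaluation_results=example_results,
    model_category="community",
)

# Rank models on one track and render the display columns for that track
top = get_track_leaderboard(df, track="google_comparable", metric="quality")
print(prepare_leaderboard_display(top, track="google_comparable"))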