akera committed
Commit 4a955b1 · verified · 1 Parent(s): e3a10db

Update src/leaderboard.py

Files changed (1)
  1. src/leaderboard.py +149 -182
src/leaderboard.py CHANGED
@@ -13,13 +13,14 @@ from config import (
     MODEL_CATEGORIES,
     STATISTICAL_CONFIG,
     METRICS_CONFIG,
+    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 from src.utils import create_submission_id, sanitize_model_name


 def initialize_scientific_leaderboard() -> pd.DataFrame:
     """Initialize empty scientific leaderboard DataFrame with all required columns."""
-
+
     columns = {
         # Basic information
         "submission_id": [],
@@ -28,18 +29,22 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
         "submission_date": [],
         "model_category": [],
         "description": [],
+
         # Track-specific quality scores
         "google_comparable_quality": [],
         "ug40_complete_quality": [],
         "language_pair_matrix_quality": [],
+
         # Track-specific BLEU scores
         "google_comparable_bleu": [],
         "ug40_complete_bleu": [],
         "language_pair_matrix_bleu": [],
+
         # Track-specific ChrF scores
         "google_comparable_chrf": [],
         "ug40_complete_chrf": [],
         "language_pair_matrix_chrf": [],
+
         # Statistical metadata
         "google_comparable_ci_lower": [],
         "google_comparable_ci_upper": [],
@@ -47,6 +52,7 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
         "ug40_complete_ci_upper": [],
         "language_pair_matrix_ci_lower": [],
         "language_pair_matrix_ci_upper": [],
+
         # Coverage information
         "google_comparable_samples": [],
         "ug40_complete_samples": [],
@@ -54,32 +60,35 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
         "google_comparable_pairs": [],
         "ug40_complete_pairs": [],
         "language_pair_matrix_pairs": [],
+
         # Statistical adequacy flags
         "google_comparable_adequate": [],
         "ug40_complete_adequate": [],
         "language_pair_matrix_adequate": [],
+
         # Detailed results (JSON strings)
         "detailed_google_comparable": [],
         "detailed_ug40_complete": [],
         "detailed_language_pair_matrix": [],
         "cross_track_analysis": [],
+
         # Metadata
         "evaluation_date": [],
         "leaderboard_version": [],
         "scientific_adequacy_score": [],
     }
-
+
     return pd.DataFrame(columns)


 def load_scientific_leaderboard() -> pd.DataFrame:
     """Load current scientific leaderboard from HuggingFace dataset."""
-
+
     try:
         print("📥 Loading scientific leaderboard...")
         dataset = load_dataset(LEADERBOARD_DATASET + "-scientific", split="train")
         df = dataset.to_pandas()
-
+
         # Ensure all required columns exist
         required_columns = list(initialize_scientific_leaderboard().columns)
         for col in required_columns:
@@ -96,10 +105,10 @@ def load_scientific_leaderboard() -> pd.DataFrame:
             df[col] = 2  # Scientific version
         else:
             df[col] = ""
-
+
         print(f"✅ Loaded scientific leaderboard with {len(df)} entries")
         return df
-
+
     except Exception as e:
         print(f"⚠️ Could not load scientific leaderboard: {e}")
         print("🔄 Initializing empty scientific leaderboard...")
@@ -108,51 +117,37 @@ def load_scientific_leaderboard() -> pd.DataFrame:

 def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
     """Save scientific leaderboard to HuggingFace dataset."""
-
+
     try:
         # Clean data before saving
         df_clean = df.copy()
-
+
         # Ensure numeric columns are proper types
         numeric_columns = [
-            col
-            for col in df_clean.columns
-            if any(
-                x in col
-                for x in [
-                    "quality",
-                    "bleu",
-                    "chrf",
-                    "ci_",
-                    "samples",
-                    "pairs",
-                    "adequacy",
-                ]
-            )
+            col for col in df_clean.columns
+            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs", "adequacy"])
         ]
-
+
         for col in numeric_columns:
             if col in df_clean.columns:
                 if "adequate" in col:
                     df_clean[col] = df_clean[col].astype(bool)
                 else:
-                    df_clean[col] = pd.to_numeric(
-                        df_clean[col], errors="coerce"
-                    ).fillna(0.0)
-
+                    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)
+
         # Convert to dataset
         dataset = Dataset.from_pandas(df_clean)
-
+
         # Push to hub
         dataset.push_to_hub(
             LEADERBOARD_DATASET + "-scientific",
             token=HF_TOKEN,
             commit_message=f"Update scientific leaderboard - {datetime.datetime.now().isoformat()[:19]}",
         )
-
+
         print("✅ Scientific leaderboard saved successfully!")
         return True
-
+
     except Exception as e:
         print(f"❌ Error saving scientific leaderboard: {e}")
         return False
@@ -166,241 +161,231 @@ def add_model_to_scientific_leaderboard(
     description: str = "",
 ) -> pd.DataFrame:
     """Add new model results to scientific leaderboard."""
-
+
     # Load current leaderboard
     df = load_scientific_leaderboard()
-
+
     # Remove existing entry if present
     existing_mask = df["model_name"] == model_name
     if existing_mask.any():
         df = df[~existing_mask]
-
+
     # Extract track results
     tracks = evaluation_results.get("tracks", {})
     cross_track = evaluation_results.get("cross_track_analysis", {})
-
+
     # Calculate scientific adequacy score
     adequacy_score = calculate_scientific_adequacy_score(evaluation_results)
-
+
     # Prepare new entry
     new_entry = {
         "submission_id": create_submission_id(),
         "model_name": sanitize_model_name(model_name),
         "author": author[:100] if author else "Anonymous",
         "submission_date": datetime.datetime.now().isoformat(),
-        "model_category": (
-            model_category if model_category in MODEL_CATEGORIES else "community"
-        ),
+        "model_category": model_category if model_category in MODEL_CATEGORIES else "community",
         "description": description[:500] if description else "",
+
         # Extract track-specific metrics
         **extract_track_metrics(tracks),
+
         # Statistical metadata
         **extract_statistical_metadata(tracks),
+
         # Coverage information
         **extract_coverage_information(tracks),
+
         # Adequacy flags
         **extract_adequacy_flags(tracks),
+
         # Detailed results (JSON strings)
         **serialize_detailed_results(tracks, cross_track),
+
         # Metadata
         "evaluation_date": datetime.datetime.now().isoformat(),
         "leaderboard_version": 2,
         "scientific_adequacy_score": adequacy_score,
     }
-
+
     # Convert to DataFrame and append
     new_row_df = pd.DataFrame([new_entry])
     updated_df = pd.concat([df, new_row_df], ignore_index=True)
-
+
     # Save to hub
     save_scientific_leaderboard(updated_df)
-
+
     return updated_df


 def extract_track_metrics(tracks: Dict) -> Dict:
     """Extract primary metrics from each track."""
-
+
     metrics = {}
-
+
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
         track_averages = track_data.get("track_averages", {})
-
+
         # Quality score
-        metrics[f"{track_name}_quality"] = float(
-            track_averages.get("quality_score", 0.0)
-        )
-
+        metrics[f"{track_name}_quality"] = float(track_averages.get("quality_score", 0.0))
+
         # BLEU score
         metrics[f"{track_name}_bleu"] = float(track_averages.get("bleu", 0.0))
-
+
         # ChrF score
         metrics[f"{track_name}_chrf"] = float(track_averages.get("chrf", 0.0))
-
+
     return metrics


 def extract_statistical_metadata(tracks: Dict) -> Dict:
     """Extract confidence intervals from each track."""
-
+
     metadata = {}
-
+
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
         track_statistics = track_data.get("track_statistics", {})
-
+
         quality_stats = track_statistics.get("quality_score", {})
         metadata[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
         metadata[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))
-
+
     return metadata


 def extract_coverage_information(tracks: Dict) -> Dict:
     """Extract coverage information from each track."""
-
+
     coverage = {}
-
+
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
         summary = track_data.get("summary", {})
-
+
         coverage[f"{track_name}_samples"] = int(summary.get("total_samples", 0))
-        coverage[f"{track_name}_pairs"] = int(
-            summary.get("language_pairs_evaluated", 0)
-        )
-
+        coverage[f"{track_name}_pairs"] = int(summary.get("language_pairs_evaluated", 0))
+
     return coverage


 def extract_adequacy_flags(tracks: Dict) -> Dict:
     """Extract statistical adequacy flags for each track."""
-
+
     adequacy = {}
-
+
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
         summary = track_data.get("summary", {})
-
-        min_required = EVALUATION_TRACKS[track_name][
-            "min_samples_per_pair"
-        ] * summary.get("language_pairs_evaluated", 0)
+
+        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
         is_adequate = summary.get("total_samples", 0) >= min_required
-
+
         adequacy[f"{track_name}_adequate"] = bool(is_adequate)
-
+
     return adequacy


 def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
     """Serialize detailed results for storage."""
-
+
     detailed = {}
-
+
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
-
+
         # Remove non-serializable data
         safe_track_data = {}
         for key, value in track_data.items():
             if key != "sample_metrics":  # Skip large DataFrames
                 safe_track_data[key] = value
-
+
         detailed[f"detailed_{track_name}"] = json.dumps(safe_track_data)
-
+
     detailed["cross_track_analysis"] = json.dumps(cross_track)
-
+
     return detailed


 def calculate_scientific_adequacy_score(evaluation_results: Dict) -> float:
     """Calculate overall scientific adequacy score (0-1)."""
-
+
     tracks = evaluation_results.get("tracks", {})
-
+
     adequacy_scores = []
-
+
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
         summary = track_data.get("summary", {})
-
+
         if track_data.get("error"):
             adequacy_scores.append(0.0)
             continue
-
+
         # Sample size adequacy
-        min_required = EVALUATION_TRACKS[track_name][
-            "min_samples_per_pair"
-        ] * summary.get("language_pairs_evaluated", 0)
-        sample_adequacy = min(
-            summary.get("total_samples", 0) / max(min_required, 1), 1.0
-        )
-
+        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
+        sample_adequacy = min(summary.get("total_samples", 0) / max(min_required, 1), 1.0)
+
         # Coverage adequacy
-        total_possible_pairs = len(EVALUATION_TRACKS[track_name]["languages"]) * (
-            len(EVALUATION_TRACKS[track_name]["languages"]) - 1
-        )
-        coverage_adequacy = summary.get("language_pairs_evaluated", 0) / max(
-            total_possible_pairs, 1
-        )
-
+        total_possible_pairs = len(EVALUATION_TRACKS[track_name]["languages"]) * (len(EVALUATION_TRACKS[track_name]["languages"]) - 1)
+        coverage_adequacy = summary.get("language_pairs_evaluated", 0) / max(total_possible_pairs, 1)
+
         # Track adequacy
         track_adequacy = (sample_adequacy + coverage_adequacy) / 2
         adequacy_scores.append(track_adequacy)
-
+
     return float(np.mean(adequacy_scores))


 def get_track_leaderboard(
-    df: pd.DataFrame,
-    track: str,
+    df: pd.DataFrame,
+    track: str,
     metric: str = "quality",
     category_filter: str = "all",
-    min_adequacy: float = 0.0,
+    min_adequacy: float = 0.0
 ) -> pd.DataFrame:
     """Get leaderboard for a specific track with filtering."""
-
+
     if df.empty:
         return df
-
+
     track_quality_col = f"{track}_{metric}"
     track_adequate_col = f"{track}_adequate"
-
+
     # Filter by adequacy
     if min_adequacy > 0:
         adequacy_mask = df["scientific_adequacy_score"] >= min_adequacy
         df = df[adequacy_mask]
-
+
     # Filter by category
     if category_filter != "all":
         df = df[df["model_category"] == category_filter]
-
+
     # Filter to models that have this track
     valid_mask = (df[track_quality_col] > 0) & df[track_adequate_col]
     df = df[valid_mask]
-
+
     if df.empty:
         return df
-
+
     # Sort by track-specific metric
     df = df.sort_values(track_quality_col, ascending=False).reset_index(drop=True)
-
+
     return df


 def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
     """Prepare track-specific leaderboard for display."""
-
+
     if df.empty:
         return df
-
+
     # Select relevant columns for this track
     base_columns = ["model_name", "author", "submission_date", "model_category"]
-
+
     track_columns = [
         f"{track}_quality",
-        f"{track}_bleu",
+        f"{track}_bleu",
         f"{track}_chrf",
         f"{track}_ci_lower",
         f"{track}_ci_upper",
@@ -408,13 +393,11 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
         f"{track}_pairs",
         f"{track}_adequate",
     ]
-
+
     # Only include columns that exist
-    available_columns = [
-        col for col in base_columns + track_columns if col in df.columns
-    ]
+    available_columns = [col for col in base_columns + track_columns if col in df.columns]
     display_df = df[available_columns].copy()
-
+
     # Format numeric columns
     numeric_format = {
         f"{track}_quality": "{:.4f}",
@@ -423,34 +406,25 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
         f"{track}_ci_lower": "{:.4f}",
         f"{track}_ci_upper": "{:.4f}",
     }
-
+
     for col, fmt in numeric_format.items():
         if col in display_df.columns:
             display_df[col] = display_df[col].apply(
                 lambda x: fmt.format(float(x)) if pd.notnull(x) else "0.0000"
             )
-
+
     # Format confidence intervals
-    if (
-        f"{track}_ci_lower" in display_df.columns
-        and f"{track}_ci_upper" in display_df.columns
-    ):
+    if f"{track}_ci_lower" in display_df.columns and f"{track}_ci_upper" in display_df.columns:
         display_df[f"{track}_confidence_interval"] = (
-            "["
-            + display_df[f"{track}_ci_lower"]
-            + ", "
-            + display_df[f"{track}_ci_upper"]
-            + "]"
+            "[" + display_df[f"{track}_ci_lower"] + ", " + display_df[f"{track}_ci_upper"] + "]"
         )
         # Remove individual CI columns for cleaner display
         display_df = display_df.drop(columns=[f"{track}_ci_lower", f"{track}_ci_upper"])
-
+
     # Format submission date
     if "submission_date" in display_df.columns:
-        display_df["submission_date"] = pd.to_datetime(
-            display_df["submission_date"]
-        ).dt.strftime("%Y-%m-%d")
-
+        display_df["submission_date"] = pd.to_datetime(display_df["submission_date"]).dt.strftime("%Y-%m-%d")
+
     # Rename columns for better display
     track_name = EVALUATION_TRACKS[track]["name"].split()[0]  # First word
     column_renames = {
@@ -466,15 +440,15 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
         f"{track}_pairs": "Pairs",
         f"{track}_adequate": "Adequate",
     }
-
+
     display_df = display_df.rename(columns=column_renames)
-
+
     return display_df


 def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
     """Get comprehensive statistics for the scientific leaderboard."""
-
+
     if df.empty:
         return {
             "total_models": 0,
@@ -483,7 +457,7 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
             "adequacy_distribution": {},
             "best_models_by_track": {},
         }
-
+
     stats = {
         "total_models": len(df),
         "models_by_category": df["model_category"].value_counts().to_dict(),
@@ -491,23 +465,23 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
         "track_statistics": {},
         "best_models_by_track": {},
     }
-
+
     # Adequacy distribution
     adequacy_bins = pd.cut(
-        df["scientific_adequacy_score"],
+        df["scientific_adequacy_score"],
         bins=[0, 0.3, 0.6, 0.8, 1.0],
-        labels=["Poor", "Fair", "Good", "Excellent"],
+        labels=["Poor", "Fair", "Good", "Excellent"]
     )
     stats["adequacy_distribution"] = adequacy_bins.value_counts().to_dict()
-
+
     # Track-specific statistics
     for track_name in EVALUATION_TRACKS.keys():
         quality_col = f"{track_name}_quality"
         adequate_col = f"{track_name}_adequate"
-
+
         if quality_col in df.columns and adequate_col in df.columns:
             track_models = df[df[adequate_col] & (df[quality_col] > 0)]
-
+
             if len(track_models) > 0:
                 stats["track_statistics"][track_name] = {
                     "participating_models": len(track_models),
@@ -515,7 +489,7 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
                     "std_quality": float(track_models[quality_col].std()),
                     "best_quality": float(track_models[quality_col].max()),
                 }
-
+
                 # Best model for this track
                 best_model = track_models.loc[track_models[quality_col].idxmax()]
                 stats["best_models_by_track"][track_name] = {
@@ -523,20 +497,22 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
                     "category": best_model["model_category"],
                     "quality": float(best_model[quality_col]),
                 }
-
+
     return stats


 def perform_fair_comparison(
-    df: pd.DataFrame, model_names: List[str], shared_pairs_only: bool = True
+    df: pd.DataFrame,
+    model_names: List[str],
+    shared_pairs_only: bool = True
 ) -> Dict:
     """Perform fair comparison between models using only shared language pairs."""
-
+
     models = df[df["model_name"].isin(model_names)]
-
+
     if len(models) == 0:
         return {"error": "No models found"}
-
+
     comparison = {
         "models": list(models["model_name"]),
         "fair_comparison_possible": True,
@@ -544,51 +520,47 @@ def perform_fair_comparison(
         "statistical_significance": {},
         "recommendations": [],
     }
-
+
     # Check if fair comparison is possible
     categories = models["model_category"].unique()
     if len(categories) > 1:
         comparison["recommendations"].append(
             "⚠️ Comparing models from different categories - interpret results carefully"
         )
-
+
     # For each track, compare models
     for track_name in EVALUATION_TRACKS.keys():
         quality_col = f"{track_name}_quality"
         adequate_col = f"{track_name}_adequate"
-
+
         track_models = models[models[adequate_col] & (models[quality_col] > 0)]
-
+
         if len(track_models) >= 2:
             comparison["track_comparisons"][track_name] = {
                 "participating_models": len(track_models),
-                "quality_scores": dict(
-                    zip(track_models["model_name"], track_models[quality_col])
-                ),
+                "quality_scores": dict(zip(track_models["model_name"], track_models[quality_col])),
                 "confidence_intervals": {},
             }
-
+
             # Extract confidence intervals
             for _, model in track_models.iterrows():
                 ci_lower = model.get(f"{track_name}_ci_lower", 0)
                 ci_upper = model.get(f"{track_name}_ci_upper", 0)
-                comparison["track_comparisons"][track_name]["confidence_intervals"][
-                    model["model_name"]
-                ] = [ci_lower, ci_upper]
-
+                comparison["track_comparisons"][track_name]["confidence_intervals"][model["model_name"]] = [ci_lower, ci_upper]
+
     return comparison


 def export_scientific_leaderboard(
-    df: pd.DataFrame,
-    track: str = "all",
-    format: str = "csv",
-    include_detailed: bool = False,
+    df: pd.DataFrame,
+    track: str = "all",
+    format: str = "csv",
+    include_detailed: bool = False
 ) -> str:
     """Export scientific leaderboard in specified format."""
-
+
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
+
     if track != "all":
         # Export specific track
         export_df = prepare_track_leaderboard_display(df, track)
@@ -600,27 +572,22 @@ def export_scientific_leaderboard(
     else:
         # Select essential columns
         essential_columns = [
-            "model_name",
-            "author",
-            "submission_date",
-            "model_category",
-            "scientific_adequacy_score",
+            "model_name", "author", "submission_date", "model_category",
+            "scientific_adequacy_score"
         ]
-
+
         # Add track-specific quality scores
         for track_name in EVALUATION_TRACKS.keys():
-            essential_columns.extend(
-                [
-                    f"{track_name}_quality",
-                    f"{track_name}_adequate",
-                ]
-            )
-
+            essential_columns.extend([
+                f"{track_name}_quality",
+                f"{track_name}_adequate",
+            ])
+
         available_columns = [col for col in essential_columns if col in df.columns]
         export_df = df[available_columns].copy()
-
+
     filename_prefix = f"salt_leaderboard_scientific_{timestamp}"
-
+
     # Export in specified format
     if format == "csv":
         filename = f"{filename_prefix}.csv"
@@ -633,5 +600,5 @@ def export_scientific_leaderboard(
         export_df.to_excel(filename, index=False)
     else:
         raise ValueError(f"Unsupported format: {format}")
-
-    return filename
+
+    return filename
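
For orientation, a minimal usage sketch of the helpers touched by this commit follows. It is not part of the commit: the full parameter list of add_model_to_scientific_leaderboard is not visible in these hunks, so keyword arguments are used throughout, the evaluation_results payload is invented only to mirror the keys the functions read via .get(), and actually running it would require the Space's config module (EVALUATION_TRACKS, LEADERBOARD_DATASET, HF_TOKEN) plus write access to the leaderboard dataset.

# Hypothetical driver for src/leaderboard.py; argument names are inferred from
# the function bodies above, not from a documented API.
from src.leaderboard import (
    add_model_to_scientific_leaderboard,
    get_track_leaderboard,
    prepare_track_leaderboard_display,
    export_scientific_leaderboard,
)

# Illustrative payload shaped like what the helpers read with .get();
# real values come from the evaluation pipeline, not a hand-written dict.
evaluation_results = {
    "tracks": {
        "google_comparable": {
            "track_averages": {"quality_score": 0.61, "bleu": 24.3, "chrf": 48.7},
            "track_statistics": {"quality_score": {"ci_lower": 0.58, "ci_upper": 0.64}},
            "summary": {"total_samples": 5000, "language_pairs_evaluated": 10},
        },
    },
    "cross_track_analysis": {},
}

# Appends (or replaces) the model row and pushes the updated dataset to the Hub.
updated = add_model_to_scientific_leaderboard(
    evaluation_results=evaluation_results,
    model_name="example-nmt-model",
    author="Example Author",
    model_category="community",
    description="Illustrative submission",
)

# Rank adequate models on one track and format the table for display.
ranked = get_track_leaderboard(updated, track="google_comparable", metric="quality")
print(prepare_track_leaderboard_display(ranked, track="google_comparable"))

# Export the cross-track summary as CSV; the function returns the filename it wrote.
print(export_scientific_leaderboard(updated, track="all", format="csv"))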