Update src/leaderboard.py
Browse files · src/leaderboard.py: +51 −311

src/leaderboard.py CHANGED
@@ -11,15 +11,13 @@ from config import (
     HF_TOKEN,
     EVALUATION_TRACKS,
     MODEL_CATEGORIES,
-    STATISTICAL_CONFIG,
     METRICS_CONFIG,
-    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 from src.utils import create_submission_id, sanitize_model_name
 
 
-def initialize_scientific_leaderboard() -> pd.DataFrame:
-    """Initialize empty scientific leaderboard DataFrame."""
+def initialize_leaderboard() -> pd.DataFrame:
+    """Initialize empty leaderboard DataFrame with all required columns."""
 
     columns = {
         # Basic information
@@ -33,104 +31,76 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
         # Track-specific quality scores
         "google_comparable_quality": [],
         "ug40_complete_quality": [],
-        "language_pair_matrix_quality": [],
 
         # Track-specific BLEU scores
         "google_comparable_bleu": [],
         "ug40_complete_bleu": [],
-        "language_pair_matrix_bleu": [],
 
         # Track-specific ChrF scores
         "google_comparable_chrf": [],
         "ug40_complete_chrf": [],
-        "language_pair_matrix_chrf": [],
 
-        # Statistical confidence intervals
+        # Confidence intervals
         "google_comparable_ci_lower": [],
         "google_comparable_ci_upper": [],
         "ug40_complete_ci_lower": [],
         "ug40_complete_ci_upper": [],
-        "language_pair_matrix_ci_lower": [],
-        "language_pair_matrix_ci_upper": [],
 
         # Coverage information
         "google_comparable_samples": [],
         "ug40_complete_samples": [],
-        "language_pair_matrix_samples": [],
         "google_comparable_pairs": [],
         "ug40_complete_pairs": [],
-        "language_pair_matrix_pairs": [],
-
-        # Statistical adequacy flags
-        "google_comparable_adequate": [],
-        "ug40_complete_adequate": [],
-        "language_pair_matrix_adequate": [],
 
         # Detailed results (JSON strings)
         "detailed_google_comparable": [],
         "detailed_ug40_complete": [],
-        "detailed_language_pair_matrix": [],
-        "cross_track_analysis": [],
 
         # Metadata
         "evaluation_date": [],
-        "leaderboard_version": [],
-        "scientific_adequacy_score": [],
     }
 
     return pd.DataFrame(columns)
 
 
-def load_scientific_leaderboard() -> pd.DataFrame:
-    """Load current scientific leaderboard from HuggingFace dataset."""
+def load_leaderboard() -> pd.DataFrame:
+    """Load current leaderboard from HuggingFace dataset."""
 
     try:
-        print("📥 Loading scientific leaderboard...")
-        dataset = load_dataset(LEADERBOARD_DATASET…)
+        print("📥 Loading leaderboard...")
+        dataset = load_dataset(LEADERBOARD_DATASET, split="train", token=HF_TOKEN)
         df = dataset.to_pandas()
 
         # Ensure all required columns exist
-        required_columns = list(initialize_scientific_leaderboard().columns)
+        required_columns = list(initialize_leaderboard().columns)
         for col in required_columns:
             if col not in df.columns:
                 if "quality" in col or "bleu" in col or "chrf" in col or "ci_" in col:
                     df[col] = 0.0
                 elif "samples" in col or "pairs" in col:
                     df[col] = 0
-                elif "adequate" in col:
-                    df[col] = False
-                elif col == "scientific_adequacy_score":
-                    df[col] = 0.0
-                elif col == "leaderboard_version":
-                    df[col] = 2  # Scientific version
                 else:
                     df[col] = ""
 
-        # Ensure proper data types for boolean columns
-        boolean_columns = [col for col in df.columns if "adequate" in col]
-        for col in boolean_columns:
-            df[col] = df[col].fillna(False).astype(bool)
-
         # Ensure proper data types for numeric columns
         numeric_columns = [
             col for col in df.columns
-            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
-            and "adequate" not in col
+            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
         ]
         for col in numeric_columns:
             df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
 
-        print(f"✅ Loaded scientific leaderboard with {len(df)} entries")
+        print(f"✅ Loaded leaderboard with {len(df)} entries")
         return df
 
     except Exception as e:
-        print(f"⚠️ Could not load scientific leaderboard: {e}")
-        print("🔄 Initializing empty scientific leaderboard...")
-        return initialize_scientific_leaderboard()
+        print(f"⚠️ Could not load leaderboard: {e}")
+        print("🔄 Initializing empty leaderboard...")
+        return initialize_leaderboard()
 
 
-def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
-    """Save scientific leaderboard to HuggingFace dataset."""
+def save_leaderboard(df: pd.DataFrame) -> bool:
+    """Save leaderboard to HuggingFace dataset."""
 
     try:
         # Clean data before saving
@@ -139,45 +109,42 @@ def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
         # Ensure numeric columns are proper types
         numeric_columns = [
             col for col in df_clean.columns
-            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs", …])
+            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
         ]
 
         for col in numeric_columns:
             if col in df_clean.columns:
-                if "adequate" in col:
-                    df_clean[col] = df_clean[col].astype(bool)
-                else:
-                    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)
+                df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)
 
         # Convert to dataset
         dataset = Dataset.from_pandas(df_clean)
 
         # Push to hub
         dataset.push_to_hub(
-            LEADERBOARD_DATASET…
+            LEADERBOARD_DATASET,
             token=HF_TOKEN,
-            commit_message=f"Update scientific leaderboard - {datetime.datetime.now().isoformat()[:19]}",
+            commit_message=f"Update leaderboard - {datetime.datetime.now().isoformat()[:19]}",
         )
 
-        print("✅ Scientific leaderboard saved successfully!")
+        print("✅ Leaderboard saved successfully!")
         return True
 
     except Exception as e:
-        print(f"❌ Error saving scientific leaderboard: {e}")
+        print(f"❌ Error saving leaderboard: {e}")
         return False
 
 
-def add_model_to_scientific_leaderboard(
+def add_model_to_leaderboard(
     model_name: str,
     author: str,
     evaluation_results: Dict,
     model_category: str = "community",
     description: str = "",
 ) -> pd.DataFrame:
-    """Add new model results to scientific leaderboard."""
+    """Add new model results to leaderboard."""
 
     # Load current leaderboard
-    df = load_scientific_leaderboard()
+    df = load_leaderboard()
 
     # Remove existing entry if present
     existing_mask = df["model_name"] == model_name
@@ -186,10 +153,6 @@ def add_model_to_scientific_leaderboard(
 
     # Extract track results
    tracks = evaluation_results.get("tracks", {})
-    cross_track = evaluation_results.get("cross_track_analysis", {})
-
-    # Calculate scientific adequacy score
-    adequacy_score = calculate_scientific_adequacy_score(evaluation_results)
 
     # Prepare new entry
     new_entry = {
@@ -203,22 +166,17 @@
         # Extract track-specific metrics
         **extract_track_metrics(tracks),
 
-        # Statistical confidence intervals
-        **extract_statistical_confidence_intervals(tracks),
+        # Confidence intervals
+        **extract_confidence_intervals(tracks),
 
         # Coverage information
         **extract_coverage_information(tracks),
 
-        # Adequacy flags
-        **extract_adequacy_flags(tracks),
-
         # Detailed results (JSON strings)
-        **serialize_detailed_results(tracks, cross_track),
+        **serialize_detailed_results(tracks),
 
         # Metadata
         "evaluation_date": datetime.datetime.now().isoformat(),
-        "leaderboard_version": 2,
-        "scientific_adequacy_score": adequacy_score,
     }
 
     # Convert to DataFrame and append
@@ -226,7 +184,7 @@
     updated_df = pd.concat([df, new_row_df], ignore_index=True)
 
     # Save to hub
-    save_scientific_leaderboard(updated_df)
+    save_leaderboard(updated_df)
 
     return updated_df
 
@@ -252,20 +210,20 @@ def extract_track_metrics(tracks: Dict) -> Dict:
     return metrics
 
 
-def extract_statistical_confidence_intervals(tracks: Dict) -> Dict:
+def extract_confidence_intervals(tracks: Dict) -> Dict:
     """Extract confidence intervals from each track."""
 
-    …
+    ci_data = {}
 
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
-        …
+        track_confidence = track_data.get("track_confidence", {})
 
-        quality_stats = …
-        …
-        …
+        quality_stats = track_confidence.get("quality_score", {})
+        ci_data[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
+        ci_data[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))
 
-    return …
+    return ci_data
 
 
 def extract_coverage_information(tracks: Dict) -> Dict:
@@ -283,24 +241,7 @@ def extract_coverage_information(tracks: Dict) -> Dict:
     return coverage
 
 
-def extract_adequacy_flags(tracks: Dict) -> Dict:
-    """Extract statistical adequacy flags for each track."""
-
-    adequacy = {}
-
-    for track_name in EVALUATION_TRACKS.keys():
-        track_data = tracks.get(track_name, {})
-        summary = track_data.get("summary", {})
-
-        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
-        is_adequate = summary.get("total_samples", 0) >= min_required
-
-        adequacy[f"{track_name}_adequate"] = bool(is_adequate)
-
-    return adequacy
-
-
-def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
+def serialize_detailed_results(tracks: Dict) -> Dict:
     """Serialize detailed results for storage."""
 
     detailed = {}
@@ -308,55 +249,24 @@ def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
     for track_name in EVALUATION_TRACKS.keys():
         track_data = tracks.get(track_name, {})
 
-        # …
-        …
-        …
-        …
+        # Create simplified detailed results for storage
+        simple_track_data = {
+            "pair_metrics": track_data.get("pair_metrics", {}),
+            "track_averages": track_data.get("track_averages", {}),
+            "track_confidence": track_data.get("track_confidence", {}),
+            "summary": track_data.get("summary", {})
+        }
 
-        detailed[f"detailed_{track_name}"] = json.dumps(…)
-
-    detailed["cross_track_analysis"] = json.dumps(cross_track)
+        detailed[f"detailed_{track_name}"] = json.dumps(simple_track_data)
 
     return detailed
 
 
-def calculate_scientific_adequacy_score(evaluation_results: Dict) -> float:
-    """Calculate overall scientific adequacy score (0-1)."""
-
-    tracks = evaluation_results.get("tracks", {})
-
-    adequacy_scores = []
-
-    for track_name in EVALUATION_TRACKS.keys():
-        track_data = tracks.get(track_name, {})
-        summary = track_data.get("summary", {})
-
-        if track_data.get("error"):
-            adequacy_scores.append(0.0)
-            continue
-
-        # Sample size adequacy
-        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
-        sample_adequacy = min(summary.get("total_samples", 0) / max(min_required, 1), 1.0)
-
-        # Coverage adequacy
-        total_possible_pairs = len(EVALUATION_TRACKS[track_name]["languages"]) * (len(EVALUATION_TRACKS[track_name]["languages"]) - 1)
-        coverage_adequacy = summary.get("language_pairs_evaluated", 0) / max(total_possible_pairs, 1)
-
-        # Track adequacy
-        track_adequacy = (sample_adequacy + coverage_adequacy) / 2
-        adequacy_scores.append(track_adequacy)
-
-    return float(np.mean(adequacy_scores))
 
 
 def get_track_leaderboard(
     df: pd.DataFrame,
     track: str,
     metric: str = "quality",
-    category_filter: str = "all",
-    min_adequacy: float = 0.0
+    category_filter: str = "all"
 ) -> pd.DataFrame:
     """Get leaderboard for a specific track with filtering."""
 
@@ -364,29 +274,19 @@ def get_track_leaderboard(
         return df
 
     track_quality_col = f"{track}_{metric}"
-    track_adequate_col = f"{track}_adequate"
 
     # Ensure columns exist
-    if track_quality_col not in df.columns or track_adequate_col not in df.columns:
-        print(f"Warning: Missing columns for track {track}")
+    if track_quality_col not in df.columns:
+        print(f"Warning: Missing column for track {track}")
         return pd.DataFrame()
 
-    # Filter by adequacy
-    if min_adequacy > 0:
-        adequacy_mask = df["scientific_adequacy_score"] >= min_adequacy
-        df = df[adequacy_mask]
-
     # Filter by category
     if category_filter != "all":
         df = df[df["model_category"] == category_filter]
 
-    # Filter to models that have this track
-    # Convert to proper boolean and handle NaN values
+    # Filter to models that have this track
     quality_mask = pd.to_numeric(df[track_quality_col], errors='coerce') > 0
-
-    adequate_mask = …
-    valid_mask = quality_mask & adequate_mask
-    df = df[valid_mask]
+    df = df[quality_mask]
 
     if df.empty:
         return df
@@ -397,7 +297,7 @@
     return df
 
 
-def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
+def prepare_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
     """Prepare track-specific leaderboard for display."""
 
     if df.empty:
@@ -414,7 +314,6 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
         f"{track}_ci_upper",
         f"{track}_samples",
         f"{track}_pairs",
-        f"{track}_adequate",
     ]
 
     # Only include columns that exist
@@ -461,167 +360,8 @@
         f"{track}_confidence_interval": "95% CI",
         f"{track}_samples": "Samples",
         f"{track}_pairs": "Pairs",
-        f"{track}_adequate": "Adequate",
     }
 
     display_df = display_df.rename(columns=column_renames)
 
-    return display_df
-
-
-def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
-    """Get comprehensive statistics for the scientific leaderboard."""
-
-    if df.empty:
-        return {
-            "total_models": 0,
-            "models_by_category": {},
-            "track_statistics": {},
-            "adequacy_distribution": {},
-            "best_models_by_track": {},
-        }
-
-    stats = {
-        "total_models": len(df),
-        "models_by_category": df["model_category"].value_counts().to_dict(),
-        "adequacy_distribution": {},
-        "track_statistics": {},
-        "best_models_by_track": {},
-    }
-
-    # Adequacy distribution
-    adequacy_bins = pd.cut(
-        df["scientific_adequacy_score"],
-        bins=[0, 0.3, 0.6, 0.8, 1.0],
-        labels=["Poor", "Fair", "Good", "Excellent"]
-    )
-    stats["adequacy_distribution"] = adequacy_bins.value_counts().to_dict()
-
-    # Track-specific statistics
-    for track_name in EVALUATION_TRACKS.keys():
-        quality_col = f"{track_name}_quality"
-        adequate_col = f"{track_name}_adequate"
-
-        if quality_col in df.columns and adequate_col in df.columns:
-            track_models = df[df[adequate_col] & (df[quality_col] > 0)]
-
-            if len(track_models) > 0:
-                stats["track_statistics"][track_name] = {
-                    "participating_models": len(track_models),
-                    "avg_quality": float(track_models[quality_col].mean()),
-                    "std_quality": float(track_models[quality_col].std()),
-                    "best_quality": float(track_models[quality_col].max()),
-                }
-
-                # Best model for this track
-                best_model = track_models.loc[track_models[quality_col].idxmax()]
-                stats["best_models_by_track"][track_name] = {
-                    "name": best_model["model_name"],
-                    "category": best_model["model_category"],
-                    "quality": float(best_model[quality_col]),
-                }
-
-    return stats
-
-
-def perform_fair_comparison(
-    df: pd.DataFrame,
-    model_names: List[str],
-    shared_pairs_only: bool = True
-) -> Dict:
-    """Perform fair comparison between models using only shared language pairs."""
-
-    models = df[df["model_name"].isin(model_names)]
-
-    if len(models) == 0:
-        return {"error": "No models found"}
-
-    comparison = {
-        "models": list(models["model_name"]),
-        "fair_comparison_possible": True,
-        "track_comparisons": {},
-        "statistical_significance": {},
-        "recommendations": [],
-    }
-
-    # Check if fair comparison is possible
-    categories = models["model_category"].unique()
-    if len(categories) > 1:
-        comparison["recommendations"].append(
-            "⚠️ Comparing models from different categories - interpret results carefully"
-        )
-
-    # For each track, compare models
-    for track_name in EVALUATION_TRACKS.keys():
-        quality_col = f"{track_name}_quality"
-        adequate_col = f"{track_name}_adequate"
-
-        track_models = models[models[adequate_col] & (models[quality_col] > 0)]
-
-        if len(track_models) >= 2:
-            comparison["track_comparisons"][track_name] = {
-                "participating_models": len(track_models),
-                "quality_scores": dict(zip(track_models["model_name"], track_models[quality_col])),
-                "confidence_intervals": {},
-            }
-
-            # Extract confidence intervals
-            for _, model in track_models.iterrows():
-                ci_lower = model.get(f"{track_name}_ci_lower", 0)
-                ci_upper = model.get(f"{track_name}_ci_upper", 0)
-                comparison["track_comparisons"][track_name]["confidence_intervals"][model["model_name"]] = [ci_lower, ci_upper]
-
-    return comparison
-
-
-def export_scientific_leaderboard(
-    df: pd.DataFrame,
-    track: str = "all",
-    format: str = "csv",
-    include_detailed: bool = False
-) -> str:
-    """Export scientific leaderboard in specified format."""
-
-    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    if track != "all":
-        # Export specific track
-        export_df = prepare_track_leaderboard_display(df, track)
-        filename_prefix = f"salt_leaderboard_{track}_{timestamp}"
-    else:
-        # Export all tracks
-        if include_detailed:
-            export_df = df.copy()
-        else:
-            # Select essential columns
-            essential_columns = [
-                "model_name", "author", "submission_date", "model_category",
-                "scientific_adequacy_score"
-            ]
-
-            # Add track-specific quality scores
-            for track_name in EVALUATION_TRACKS.keys():
-                essential_columns.extend([
-                    f"{track_name}_quality",
-                    f"{track_name}_adequate",
-                ])
-
-            available_columns = [col for col in essential_columns if col in df.columns]
-            export_df = df[available_columns].copy()
-
-        filename_prefix = f"salt_leaderboard_scientific_{timestamp}"
-
-    # Export in specified format
-    if format == "csv":
-        filename = f"{filename_prefix}.csv"
-        export_df.to_csv(filename, index=False)
-    elif format == "json":
-        filename = f"{filename_prefix}.json"
-        export_df.to_json(filename, orient="records", indent=2)
-    elif format == "xlsx":
-        filename = f"{filename_prefix}.xlsx"
-        export_df.to_excel(filename, index=False)
-    else:
-        raise ValueError(f"Unsupported format: {format}")
-
-    return filename
+    return display_df
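
For context on how the slimmed-down API fits together after this change, here is a minimal write-path sketch. The shape of evaluation_results is inferred from what extract_confidence_intervals and serialize_detailed_results read in this diff; the field values and model names are illustrative only, and the src.leaderboard import path assumes the Space's repo layout.

    from src.leaderboard import add_model_to_leaderboard

    # Hypothetical payload; keys mirror what this module reads.
    evaluation_results = {
        "tracks": {
            "google_comparable": {
                "pair_metrics": {"eng_lug": {"quality_score": 0.61, "bleu": 28.4, "chrf": 51.2}},
                "track_averages": {"quality_score": 0.62, "bleu": 28.4, "chrf": 51.2},
                "track_confidence": {
                    "quality_score": {"ci_lower": 0.58, "ci_upper": 0.66},
                },
                "summary": {"total_samples": 500, "language_pairs_evaluated": 1},
            },
            # "ug40_complete" would follow the same shape.
        },
    }

    # Loads the current table, drops any existing row for this model,
    # appends the new entry, and pushes the result back to the Hub.
    updated_df = add_model_to_leaderboard(
        model_name="my-org/my-translation-model",
        author="my-org",
        evaluation_results=evaluation_results,
        model_category="community",
    )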
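
And the read path, again as a sketch under the same assumptions; the json.loads call simply mirrors the json.dumps in serialize_detailed_results.

    import json

    from src.leaderboard import (
        load_leaderboard,
        get_track_leaderboard,
        prepare_leaderboard_display,
    )

    df = load_leaderboard()

    # Filter the google_comparable track to community models with a nonzero quality score.
    ranked = get_track_leaderboard(
        df, track="google_comparable", metric="quality", category_filter="community"
    )

    # Human-readable column names ("95% CI", "Samples", "Pairs", ...) for display.
    display_df = prepare_leaderboard_display(ranked, track="google_comparable")

    # Detailed per-track results round-trip through JSON strings.
    if not ranked.empty:
        detailed = json.loads(ranked.iloc[0]["detailed_google_comparable"])
        print(detailed["summary"])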