Update src/leaderboard.py

src/leaderboard.py (CHANGED: +149, -182)
@@ -13,13 +13,14 @@ from config import (
    MODEL_CATEGORIES,
    STATISTICAL_CONFIG,
    METRICS_CONFIG,
    SAMPLE_SIZE_RECOMMENDATIONS,
)
from src.utils import create_submission_id, sanitize_model_name


def initialize_scientific_leaderboard() -> pd.DataFrame:
    """Initialize empty scientific leaderboard DataFrame with all required columns."""

    columns = {
        # Basic information
        "submission_id": [],
@@ -28,18 +29,22 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
        "submission_date": [],
        "model_category": [],
        "description": [],

        # Track-specific quality scores
        "google_comparable_quality": [],
        "ug40_complete_quality": [],
        "language_pair_matrix_quality": [],

        # Track-specific BLEU scores
        "google_comparable_bleu": [],
        "ug40_complete_bleu": [],
        "language_pair_matrix_bleu": [],

        # Track-specific ChrF scores
        "google_comparable_chrf": [],
        "ug40_complete_chrf": [],
        "language_pair_matrix_chrf": [],

        # Statistical metadata
        "google_comparable_ci_lower": [],
        "google_comparable_ci_upper": [],
@@ -47,6 +52,7 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
        "ug40_complete_ci_upper": [],
        "language_pair_matrix_ci_lower": [],
        "language_pair_matrix_ci_upper": [],

        # Coverage information
        "google_comparable_samples": [],
        "ug40_complete_samples": [],
@@ -54,32 +60,35 @@ def initialize_scientific_leaderboard() -> pd.DataFrame:
        "google_comparable_pairs": [],
        "ug40_complete_pairs": [],
        "language_pair_matrix_pairs": [],

        # Statistical adequacy flags
        "google_comparable_adequate": [],
        "ug40_complete_adequate": [],
        "language_pair_matrix_adequate": [],

        # Detailed results (JSON strings)
        "detailed_google_comparable": [],
        "detailed_ug40_complete": [],
        "detailed_language_pair_matrix": [],
        "cross_track_analysis": [],

        # Metadata
        "evaluation_date": [],
        "leaderboard_version": [],
        "scientific_adequacy_score": [],
    }

    return pd.DataFrame(columns)


def load_scientific_leaderboard() -> pd.DataFrame:
    """Load current scientific leaderboard from HuggingFace dataset."""

    try:
        print("📥 Loading scientific leaderboard...")
        dataset = load_dataset(LEADERBOARD_DATASET + "-scientific", split="train")
        df = dataset.to_pandas()

        # Ensure all required columns exist
        required_columns = list(initialize_scientific_leaderboard().columns)
        for col in required_columns:
@@ -96,10 +105,10 @@ def load_scientific_leaderboard() -> pd.DataFrame:
                    df[col] = 2  # Scientific version
                else:
                    df[col] = ""

        print(f"✅ Loaded scientific leaderboard with {len(df)} entries")
        return df

    except Exception as e:
        print(f"⚠️ Could not load scientific leaderboard: {e}")
        print("🔄 Initializing empty scientific leaderboard...")
@@ -108,51 +117,37 @@ def load_scientific_leaderboard() -> pd.DataFrame:

def save_scientific_leaderboard(df: pd.DataFrame) -> bool:
    """Save scientific leaderboard to HuggingFace dataset."""

    try:
        # Clean data before saving
        df_clean = df.copy()

        # Ensure numeric columns are proper types
        numeric_columns = [
            col for col in df_clean.columns
            if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs", "adequacy"])
        ]

        for col in numeric_columns:
            if col in df_clean.columns:
                if "adequate" in col:
                    df_clean[col] = df_clean[col].astype(bool)
                else:
                    df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)

        # Convert to dataset
        dataset = Dataset.from_pandas(df_clean)

        # Push to hub
        dataset.push_to_hub(
            LEADERBOARD_DATASET + "-scientific",
            token=HF_TOKEN,
            commit_message=f"Update scientific leaderboard - {datetime.datetime.now().isoformat()[:19]}",
        )

        print("✅ Scientific leaderboard saved successfully!")
        return True

    except Exception as e:
        print(f"❌ Error saving scientific leaderboard: {e}")
        return False
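
# A minimal illustrative sketch of the column cleaning in
# save_scientific_leaderboard() above; the toy column name and values are
# hypothetical. pd.to_numeric(..., errors="coerce") turns unparseable values
# into NaN and .fillna(0.0) replaces them, so the column ends up as a clean
# float column instead of a mixed-type object column.
def _example_numeric_cleaning() -> pd.DataFrame:
    toy = pd.DataFrame({"ug40_complete_bleu": ["31.2", "not-a-number", None]})
    toy["ug40_complete_bleu"] = pd.to_numeric(toy["ug40_complete_bleu"], errors="coerce").fillna(0.0)
    return toy  # values become 31.2, 0.0, 0.0
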
@@ -166,241 +161,231 @@ def add_model_to_scientific_leaderboard(
    description: str = "",
) -> pd.DataFrame:
    """Add new model results to scientific leaderboard."""

    # Load current leaderboard
    df = load_scientific_leaderboard()

    # Remove existing entry if present
    existing_mask = df["model_name"] == model_name
    if existing_mask.any():
        df = df[~existing_mask]

    # Extract track results
    tracks = evaluation_results.get("tracks", {})
    cross_track = evaluation_results.get("cross_track_analysis", {})

    # Calculate scientific adequacy score
    adequacy_score = calculate_scientific_adequacy_score(evaluation_results)

    # Prepare new entry
    new_entry = {
        "submission_id": create_submission_id(),
        "model_name": sanitize_model_name(model_name),
        "author": author[:100] if author else "Anonymous",
        "submission_date": datetime.datetime.now().isoformat(),
        "model_category": model_category if model_category in MODEL_CATEGORIES else "community",
        "description": description[:500] if description else "",

        # Extract track-specific metrics
        **extract_track_metrics(tracks),

        # Statistical metadata
        **extract_statistical_metadata(tracks),

        # Coverage information
        **extract_coverage_information(tracks),

        # Adequacy flags
        **extract_adequacy_flags(tracks),

        # Detailed results (JSON strings)
        **serialize_detailed_results(tracks, cross_track),

        # Metadata
        "evaluation_date": datetime.datetime.now().isoformat(),
        "leaderboard_version": 2,
        "scientific_adequacy_score": adequacy_score,
    }

    # Convert to DataFrame and append
    new_row_df = pd.DataFrame([new_entry])
    updated_df = pd.concat([df, new_row_df], ignore_index=True)

    # Save to hub
    save_scientific_leaderboard(updated_df)

    return updated_df


def extract_track_metrics(tracks: Dict) -> Dict:
    """Extract primary metrics from each track."""

    metrics = {}

    for track_name in EVALUATION_TRACKS.keys():
        track_data = tracks.get(track_name, {})
        track_averages = track_data.get("track_averages", {})

        # Quality score
        metrics[f"{track_name}_quality"] = float(track_averages.get("quality_score", 0.0))

        # BLEU score
        metrics[f"{track_name}_bleu"] = float(track_averages.get("bleu", 0.0))

        # ChrF score
        metrics[f"{track_name}_chrf"] = float(track_averages.get("chrf", 0.0))

    return metrics


def extract_statistical_metadata(tracks: Dict) -> Dict:
    """Extract confidence intervals from each track."""

    metadata = {}

    for track_name in EVALUATION_TRACKS.keys():
        track_data = tracks.get(track_name, {})
        track_statistics = track_data.get("track_statistics", {})

        quality_stats = track_statistics.get("quality_score", {})
        metadata[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
        metadata[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))

    return metadata


def extract_coverage_information(tracks: Dict) -> Dict:
    """Extract coverage information from each track."""

    coverage = {}

    for track_name in EVALUATION_TRACKS.keys():
        track_data = tracks.get(track_name, {})
        summary = track_data.get("summary", {})

        coverage[f"{track_name}_samples"] = int(summary.get("total_samples", 0))
        coverage[f"{track_name}_pairs"] = int(summary.get("language_pairs_evaluated", 0))

    return coverage


def extract_adequacy_flags(tracks: Dict) -> Dict:
    """Extract statistical adequacy flags for each track."""

    adequacy = {}

    for track_name in EVALUATION_TRACKS.keys():
        track_data = tracks.get(track_name, {})
        summary = track_data.get("summary", {})

        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
        is_adequate = summary.get("total_samples", 0) >= min_required

        adequacy[f"{track_name}_adequate"] = bool(is_adequate)

    return adequacy


def serialize_detailed_results(tracks: Dict, cross_track: Dict) -> Dict:
    """Serialize detailed results for storage."""

    detailed = {}

    for track_name in EVALUATION_TRACKS.keys():
        track_data = tracks.get(track_name, {})

        # Remove non-serializable data
        safe_track_data = {}
        for key, value in track_data.items():
            if key != "sample_metrics":  # Skip large DataFrames
                safe_track_data[key] = value

        detailed[f"detailed_{track_name}"] = json.dumps(safe_track_data)

    detailed["cross_track_analysis"] = json.dumps(cross_track)

    return detailed


def calculate_scientific_adequacy_score(evaluation_results: Dict) -> float:
    """Calculate overall scientific adequacy score (0-1)."""

    tracks = evaluation_results.get("tracks", {})

    adequacy_scores = []

    for track_name in EVALUATION_TRACKS.keys():
        track_data = tracks.get(track_name, {})
        summary = track_data.get("summary", {})

        if track_data.get("error"):
            adequacy_scores.append(0.0)
            continue

        # Sample size adequacy
        min_required = EVALUATION_TRACKS[track_name]["min_samples_per_pair"] * summary.get("language_pairs_evaluated", 0)
        sample_adequacy = min(summary.get("total_samples", 0) / max(min_required, 1), 1.0)

        # Coverage adequacy
        total_possible_pairs = len(EVALUATION_TRACKS[track_name]["languages"]) * (len(EVALUATION_TRACKS[track_name]["languages"]) - 1)
        coverage_adequacy = summary.get("language_pairs_evaluated", 0) / max(total_possible_pairs, 1)

        # Track adequacy
        track_adequacy = (sample_adequacy + coverage_adequacy) / 2
        adequacy_scores.append(track_adequacy)

    return float(np.mean(adequacy_scores))
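
# An illustrative walk-through of the adequacy arithmetic above for a
# hypothetical track with 5 languages and min_samples_per_pair = 50:
# 5 * 4 = 20 directed pairs are possible; with 16 pairs evaluated and 600
# samples in total, min_required = 50 * 16 = 800, sample_adequacy =
# min(600 / 800, 1.0) = 0.75, coverage_adequacy = 16 / 20 = 0.8, and
# track_adequacy = (0.75 + 0.8) / 2 = 0.775. The numbers are made up.
def _example_track_adequacy() -> float:
    summary = {"total_samples": 600, "language_pairs_evaluated": 16}
    track_cfg = {"languages": ["a", "b", "c", "d", "e"], "min_samples_per_pair": 50}
    min_required = track_cfg["min_samples_per_pair"] * summary["language_pairs_evaluated"]
    sample_adequacy = min(summary["total_samples"] / max(min_required, 1), 1.0)
    total_possible_pairs = len(track_cfg["languages"]) * (len(track_cfg["languages"]) - 1)
    coverage_adequacy = summary["language_pairs_evaluated"] / max(total_possible_pairs, 1)
    return (sample_adequacy + coverage_adequacy) / 2  # 0.775
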

def get_track_leaderboard(
    df: pd.DataFrame,
    track: str,
    metric: str = "quality",
    category_filter: str = "all",
    min_adequacy: float = 0.0
) -> pd.DataFrame:
    """Get leaderboard for a specific track with filtering."""

    if df.empty:
        return df

    track_quality_col = f"{track}_{metric}"
    track_adequate_col = f"{track}_adequate"

    # Filter by adequacy
    if min_adequacy > 0:
        adequacy_mask = df["scientific_adequacy_score"] >= min_adequacy
        df = df[adequacy_mask]

    # Filter by category
    if category_filter != "all":
        df = df[df["model_category"] == category_filter]

    # Filter to models that have this track
    valid_mask = (df[track_quality_col] > 0) & df[track_adequate_col]
    df = df[valid_mask]

    if df.empty:
        return df

    # Sort by track-specific metric
    df = df.sort_values(track_quality_col, ascending=False).reset_index(drop=True)

    return df


def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
    """Prepare track-specific leaderboard for display."""

    if df.empty:
        return df

    # Select relevant columns for this track
    base_columns = ["model_name", "author", "submission_date", "model_category"]

    track_columns = [
        f"{track}_quality",
        f"{track}_bleu",
        f"{track}_chrf",
        f"{track}_ci_lower",
        f"{track}_ci_upper",
@@ -408,13 +393,11 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
        f"{track}_pairs",
        f"{track}_adequate",
    ]

    # Only include columns that exist
    available_columns = [col for col in base_columns + track_columns if col in df.columns]
    display_df = df[available_columns].copy()

    # Format numeric columns
    numeric_format = {
        f"{track}_quality": "{:.4f}",
@@ -423,34 +406,25 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
        f"{track}_ci_lower": "{:.4f}",
        f"{track}_ci_upper": "{:.4f}",
    }

    for col, fmt in numeric_format.items():
        if col in display_df.columns:
            display_df[col] = display_df[col].apply(
                lambda x: fmt.format(float(x)) if pd.notnull(x) else "0.0000"
            )

    # Format confidence intervals
    if f"{track}_ci_lower" in display_df.columns and f"{track}_ci_upper" in display_df.columns:
        display_df[f"{track}_confidence_interval"] = (
            "[" + display_df[f"{track}_ci_lower"] + ", " + display_df[f"{track}_ci_upper"] + "]"
        )
        # Remove individual CI columns for cleaner display
        display_df = display_df.drop(columns=[f"{track}_ci_lower", f"{track}_ci_upper"])

    # Format submission date
    if "submission_date" in display_df.columns:
        display_df["submission_date"] = pd.to_datetime(display_df["submission_date"]).dt.strftime("%Y-%m-%d")

    # Rename columns for better display
    track_name = EVALUATION_TRACKS[track]["name"].split()[0]  # First word
    column_renames = {
@@ -466,15 +440,15 @@ def prepare_track_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
        f"{track}_pairs": "Pairs",
        f"{track}_adequate": "Adequate",
    }

    display_df = display_df.rename(columns=column_renames)

    return display_df


def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
    """Get comprehensive statistics for the scientific leaderboard."""

    if df.empty:
        return {
            "total_models": 0,
@@ -483,7 +457,7 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
            "adequacy_distribution": {},
            "best_models_by_track": {},
        }

    stats = {
        "total_models": len(df),
        "models_by_category": df["model_category"].value_counts().to_dict(),
@@ -491,23 +465,23 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
        "track_statistics": {},
        "best_models_by_track": {},
    }

    # Adequacy distribution
    adequacy_bins = pd.cut(
        df["scientific_adequacy_score"],
        bins=[0, 0.3, 0.6, 0.8, 1.0],
        labels=["Poor", "Fair", "Good", "Excellent"]
    )
    stats["adequacy_distribution"] = adequacy_bins.value_counts().to_dict()

    # Track-specific statistics
    for track_name in EVALUATION_TRACKS.keys():
        quality_col = f"{track_name}_quality"
        adequate_col = f"{track_name}_adequate"

        if quality_col in df.columns and adequate_col in df.columns:
            track_models = df[df[adequate_col] & (df[quality_col] > 0)]

            if len(track_models) > 0:
                stats["track_statistics"][track_name] = {
                    "participating_models": len(track_models),
@@ -515,7 +489,7 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
                    "std_quality": float(track_models[quality_col].std()),
                    "best_quality": float(track_models[quality_col].max()),
                }

                # Best model for this track
                best_model = track_models.loc[track_models[quality_col].idxmax()]
                stats["best_models_by_track"][track_name] = {
@@ -523,20 +497,22 @@ def get_scientific_leaderboard_stats(df: pd.DataFrame, track: str = None) -> Dict:
                    "category": best_model["model_category"],
                    "quality": float(best_model[quality_col]),
                }

    return stats
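
# An illustrative sketch of the adequacy binning used in the stats above,
# with made-up scores: pd.cut with bins=[0, 0.3, 0.6, 0.8, 1.0] uses
# lower-exclusive intervals, so 0.25 -> "Poor", 0.5 -> "Fair", 0.7 -> "Good",
# 0.9 -> "Excellent", while a score of exactly 0.0 falls outside every bin
# (it becomes NaN and is therefore dropped by value_counts()).
def _example_adequacy_binning() -> pd.Series:
    scores = pd.Series([0.0, 0.25, 0.5, 0.7, 0.9])
    return pd.cut(scores, bins=[0, 0.3, 0.6, 0.8, 1.0],
                  labels=["Poor", "Fair", "Good", "Excellent"])
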

def perform_fair_comparison(
    df: pd.DataFrame,
    model_names: List[str],
    shared_pairs_only: bool = True
) -> Dict:
    """Perform fair comparison between models using only shared language pairs."""

    models = df[df["model_name"].isin(model_names)]

    if len(models) == 0:
        return {"error": "No models found"}

    comparison = {
        "models": list(models["model_name"]),
        "fair_comparison_possible": True,
@@ -544,51 +520,47 @@ def perform_fair_comparison(
        "statistical_significance": {},
        "recommendations": [],
    }

    # Check if fair comparison is possible
    categories = models["model_category"].unique()
    if len(categories) > 1:
        comparison["recommendations"].append(
            "⚠️ Comparing models from different categories - interpret results carefully"
        )

    # For each track, compare models
    for track_name in EVALUATION_TRACKS.keys():
        quality_col = f"{track_name}_quality"
        adequate_col = f"{track_name}_adequate"

        track_models = models[models[adequate_col] & (models[quality_col] > 0)]

        if len(track_models) >= 2:
            comparison["track_comparisons"][track_name] = {
                "participating_models": len(track_models),
                "quality_scores": dict(zip(track_models["model_name"], track_models[quality_col])),
                "confidence_intervals": {},
            }

            # Extract confidence intervals
            for _, model in track_models.iterrows():
                ci_lower = model.get(f"{track_name}_ci_lower", 0)
                ci_upper = model.get(f"{track_name}_ci_upper", 0)
                comparison["track_comparisons"][track_name]["confidence_intervals"][model["model_name"]] = [ci_lower, ci_upper]

    return comparison


def export_scientific_leaderboard(
    df: pd.DataFrame,
    track: str = "all",
    format: str = "csv",
    include_detailed: bool = False
) -> str:
    """Export scientific leaderboard in specified format."""

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    if track != "all":
        # Export specific track
        export_df = prepare_track_leaderboard_display(df, track)
@@ -600,27 +572,22 @@ def export_scientific_leaderboard(
    else:
        # Select essential columns
        essential_columns = [
            "model_name", "author", "submission_date", "model_category",
            "scientific_adequacy_score"
        ]

        # Add track-specific quality scores
        for track_name in EVALUATION_TRACKS.keys():
            essential_columns.extend([
                f"{track_name}_quality",
                f"{track_name}_adequate",
            ])

        available_columns = [col for col in essential_columns if col in df.columns]
        export_df = df[available_columns].copy()

    filename_prefix = f"salt_leaderboard_scientific_{timestamp}"

    # Export in specified format
    if format == "csv":
        filename = f"{filename_prefix}.csv"
@@ -633,5 +600,5 @@ def export_scientific_leaderboard(
        export_df.to_excel(filename, index=False)
    else:
        raise ValueError(f"Unsupported format: {format}")

    return filename
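
# A minimal end-to-end sketch of how the helpers above compose, assuming an
# `evaluation_results` dict shaped like the one consumed by
# add_model_to_scientific_leaderboard() (with "tracks" and
# "cross_track_analysis" keys), valid Hub credentials in config, and that
# "ug40_complete" is one of the EVALUATION_TRACKS keys (it matches the column
# naming above). The model name, author and description are placeholders.
def _example_submission_flow(evaluation_results: Dict) -> str:
    updated = add_model_to_scientific_leaderboard(
        model_name="example/translator",
        author="Example Author",
        evaluation_results=evaluation_results,
        model_category="community",
        description="Hypothetical submission used for illustration.",
    )
    ranked = get_track_leaderboard(updated, track="ug40_complete", metric="quality")
    print(prepare_track_leaderboard_display(ranked, track="ug40_complete").head())
    return export_scientific_leaderboard(updated, track="all", format="csv")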