# src/leaderboard.py
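"""Leaderboard storage and display helpers.

Loads and saves the leaderboard as a HuggingFace dataset, and converts
per-track evaluation results into flat rows for filtering and display.
"""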
import pandas as pd
from datasets import Dataset, load_dataset
import json
import datetime
from typing import Dict
from config import (
LEADERBOARD_DATASET,
HF_TOKEN,
EVALUATION_TRACKS,
MODEL_CATEGORIES,
METRICS_CONFIG,
)
from src.utils import create_submission_id, sanitize_model_name
def initialize_leaderboard() -> pd.DataFrame:
"""Initialize empty leaderboard DataFrame with all required columns."""
columns = {
# Basic information
"submission_id": [],
"model_name": [],
"author": [],
"submission_date": [],
"model_category": [],
"description": [],
# Track-specific quality scores
"google_comparable_quality": [],
"ug40_complete_quality": [],
# Track-specific BLEU scores
"google_comparable_bleu": [],
"ug40_complete_bleu": [],
# Track-specific ChrF scores
"google_comparable_chrf": [],
"ug40_complete_chrf": [],
# Confidence intervals
"google_comparable_ci_lower": [],
"google_comparable_ci_upper": [],
"ug40_complete_ci_lower": [],
"ug40_complete_ci_upper": [],
# Coverage information
"google_comparable_samples": [],
"ug40_complete_samples": [],
"google_comparable_pairs": [],
"ug40_complete_pairs": [],
# Detailed results (JSON strings)
"detailed_google_comparable": [],
"detailed_ug40_complete": [],
# Metadata
"evaluation_date": [],
}
return pd.DataFrame(columns)
def load_leaderboard() -> pd.DataFrame:
"""Load current leaderboard from HuggingFace dataset."""
try:
print("πŸ“₯ Loading leaderboard...")
dataset = load_dataset(LEADERBOARD_DATASET, split="train", token=HF_TOKEN)
df = dataset.to_pandas()
# Ensure all required columns exist
required_columns = list(initialize_leaderboard().columns)
for col in required_columns:
if col not in df.columns:
if "quality" in col or "bleu" in col or "chrf" in col or "ci_" in col:
df[col] = 0.0
elif "samples" in col or "pairs" in col:
df[col] = 0
else:
df[col] = ""
# Ensure proper data types for numeric columns
numeric_columns = [
col for col in df.columns
if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
]
for col in numeric_columns:
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
print(f"βœ… Loaded leaderboard with {len(df)} entries")
return df
except Exception as e:
print(f"⚠️ Could not load leaderboard: {e}")
print("πŸ”„ Initializing empty leaderboard...")
return initialize_leaderboard()
def save_leaderboard(df: pd.DataFrame) -> bool:
"""Save leaderboard to HuggingFace dataset."""
try:
# Clean data before saving
df_clean = df.copy()
# Ensure numeric columns are proper types
numeric_columns = [
col for col in df_clean.columns
if any(x in col for x in ["quality", "bleu", "chrf", "ci_", "samples", "pairs"])
]
for col in numeric_columns:
if col in df_clean.columns:
df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce").fillna(0.0)
        # Convert to a HF Dataset; drop the pandas index so it is not stored as a column
        dataset = Dataset.from_pandas(df_clean, preserve_index=False)
# Push to hub
dataset.push_to_hub(
LEADERBOARD_DATASET,
token=HF_TOKEN,
commit_message=f"Update leaderboard - {datetime.datetime.now().isoformat()[:19]}",
)
print("βœ… Leaderboard saved successfully!")
return True
except Exception as e:
print(f"❌ Error saving leaderboard: {e}")
return False
def add_model_to_leaderboard(
model_name: str,
author: str,
evaluation_results: Dict,
model_category: str = "community",
description: str = "",
) -> pd.DataFrame:
"""Add new model results to leaderboard."""
# Load current leaderboard
df = load_leaderboard()
    # Remove any existing entry for this model (compare against the sanitized
    # name, since that is the form stored in the leaderboard)
    existing_mask = df["model_name"] == sanitize_model_name(model_name)
if existing_mask.any():
df = df[~existing_mask]
# Extract track results
tracks = evaluation_results.get("tracks", {})
# Prepare new entry
new_entry = {
"submission_id": create_submission_id(),
"model_name": sanitize_model_name(model_name),
"author": author[:100] if author else "Anonymous",
"submission_date": datetime.datetime.now().isoformat(),
"model_category": model_category if model_category in MODEL_CATEGORIES else "community",
"description": description[:500] if description else "",
# Extract track-specific metrics
**extract_track_metrics(tracks),
# Confidence intervals
**extract_confidence_intervals(tracks),
# Coverage information
**extract_coverage_information(tracks),
# Detailed results (JSON strings)
**serialize_detailed_results(tracks),
# Metadata
"evaluation_date": datetime.datetime.now().isoformat(),
}
# Convert to DataFrame and append
new_row_df = pd.DataFrame([new_entry])
updated_df = pd.concat([df, new_row_df], ignore_index=True)
# Save to hub
save_leaderboard(updated_df)
return updated_df
def extract_track_metrics(tracks: Dict) -> Dict:
"""Extract primary metrics from each track."""
metrics = {}
for track_name in EVALUATION_TRACKS.keys():
track_data = tracks.get(track_name, {})
track_averages = track_data.get("track_averages", {})
# Quality score
metrics[f"{track_name}_quality"] = float(track_averages.get("quality_score", 0.0))
# BLEU score
metrics[f"{track_name}_bleu"] = float(track_averages.get("bleu", 0.0))
# ChrF score
metrics[f"{track_name}_chrf"] = float(track_averages.get("chrf", 0.0))
return metrics
def extract_confidence_intervals(tracks: Dict) -> Dict:
"""Extract confidence intervals from each track."""
ci_data = {}
for track_name in EVALUATION_TRACKS.keys():
track_data = tracks.get(track_name, {})
track_confidence = track_data.get("track_confidence", {})
quality_stats = track_confidence.get("quality_score", {})
ci_data[f"{track_name}_ci_lower"] = float(quality_stats.get("ci_lower", 0.0))
ci_data[f"{track_name}_ci_upper"] = float(quality_stats.get("ci_upper", 0.0))
return ci_data
def extract_coverage_information(tracks: Dict) -> Dict:
"""Extract coverage information from each track."""
coverage = {}
for track_name in EVALUATION_TRACKS.keys():
track_data = tracks.get(track_name, {})
summary = track_data.get("summary", {})
coverage[f"{track_name}_samples"] = int(summary.get("total_samples", 0))
coverage[f"{track_name}_pairs"] = int(summary.get("language_pairs_evaluated", 0))
return coverage
def serialize_detailed_results(tracks: Dict) -> Dict:
"""Serialize detailed results for storage."""
detailed = {}
for track_name in EVALUATION_TRACKS.keys():
track_data = tracks.get(track_name, {})
# Create simplified detailed results for storage
simple_track_data = {
"pair_metrics": track_data.get("pair_metrics", {}),
"track_averages": track_data.get("track_averages", {}),
"track_confidence": track_data.get("track_confidence", {}),
"summary": track_data.get("summary", {})
}
detailed[f"detailed_{track_name}"] = json.dumps(simple_track_data)
return detailed
def get_track_leaderboard(
df: pd.DataFrame,
track: str,
metric: str = "quality",
category_filter: str = "all"
) -> pd.DataFrame:
"""Get leaderboard for a specific track with filtering."""
if df.empty:
return df
    track_metric_col = f"{track}_{metric}"
    # Ensure the requested metric column exists
    if track_metric_col not in df.columns:
        print(f"Warning: missing column '{track_metric_col}' for track '{track}'")
        return pd.DataFrame()
    # Filter by category
    if category_filter != "all":
        df = df[df["model_category"] == category_filter]
    # Keep only models that have results for this track (metric > 0)
    metric_mask = pd.to_numeric(df[track_metric_col], errors="coerce") > 0
    df = df[metric_mask]
    if df.empty:
        return df
    # Sort by the track-specific metric, best first
    df = df.sort_values(track_metric_col, ascending=False).reset_index(drop=True)
    return df
def prepare_leaderboard_display(df: pd.DataFrame, track: str) -> pd.DataFrame:
"""Prepare track-specific leaderboard for display."""
if df.empty:
return df
# Select relevant columns for this track
base_columns = ["model_name", "author", "submission_date", "model_category"]
track_columns = [
f"{track}_quality",
f"{track}_bleu",
f"{track}_chrf",
f"{track}_ci_lower",
f"{track}_ci_upper",
f"{track}_samples",
f"{track}_pairs",
]
# Only include columns that exist
available_columns = [col for col in base_columns + track_columns if col in df.columns]
display_df = df[available_columns].copy()
# Format numeric columns
numeric_format = {
f"{track}_quality": "{:.4f}",
f"{track}_bleu": "{:.2f}",
f"{track}_chrf": "{:.4f}",
f"{track}_ci_lower": "{:.4f}",
f"{track}_ci_upper": "{:.4f}",
}
for col, fmt in numeric_format.items():
if col in display_df.columns:
            display_df[col] = display_df[col].apply(
                lambda x: fmt.format(float(x)) if pd.notnull(x) else fmt.format(0.0)
            )
# Format confidence intervals
if f"{track}_ci_lower" in display_df.columns and f"{track}_ci_upper" in display_df.columns:
display_df[f"{track}_confidence_interval"] = (
"[" + display_df[f"{track}_ci_lower"] + ", " + display_df[f"{track}_ci_upper"] + "]"
)
# Remove individual CI columns for cleaner display
display_df = display_df.drop(columns=[f"{track}_ci_lower", f"{track}_ci_upper"])
# Format submission date
if "submission_date" in display_df.columns:
display_df["submission_date"] = pd.to_datetime(display_df["submission_date"]).dt.strftime("%Y-%m-%d")
# Rename columns for better display
track_name = EVALUATION_TRACKS[track]["name"].split()[0] # First word
column_renames = {
"model_name": "Model Name",
"author": "Author",
"submission_date": "Submitted",
"model_category": "Category",
f"{track}_quality": f"{track_name} Quality",
f"{track}_bleu": f"{track_name} BLEU",
f"{track}_chrf": f"{track_name} ChrF",
f"{track}_confidence_interval": "95% CI",
f"{track}_samples": "Samples",
f"{track}_pairs": "Pairs",
}
display_df = display_df.rename(columns=column_renames)
return display_df
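

if __name__ == "__main__":
    # Minimal offline sanity check (no Hub access). The shape of `fake_results`
    # below is an assumption inferred from how the extract_* helpers read
    # evaluation results, not a documented schema; the track key is assumed to
    # match an entry in EVALUATION_TRACKS (e.g. "google_comparable").
    fake_results = {
        "tracks": {
            "google_comparable": {
                "pair_metrics": {},
                "track_averages": {"quality_score": 0.42, "bleu": 18.5, "chrf": 0.51},
                "track_confidence": {"quality_score": {"ci_lower": 0.40, "ci_upper": 0.44}},
                "summary": {"total_samples": 1000, "language_pairs_evaluated": 10},
            }
        }
    }
    tracks = fake_results["tracks"]
    print(extract_track_metrics(tracks))
    print(extract_confidence_intervals(tracks))
    print(extract_coverage_information(tracks))
    print(list(serialize_detailed_results(tracks).keys()))
    # An empty leaderboard should simply come back empty from the filtering helper
    empty_df = initialize_leaderboard()
    print(get_track_leaderboard(empty_df, "google_comparable", metric="quality"))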