# app.py
import subprocess
import sys
import os
from pathlib import Path

def setup_salt():
    """Clone and set up the SALT library, as in the Colab workflow."""
    try:
        # Check if salt is already available
        import salt.dataset
        print("✅ SALT library already available")
        return True
    except ImportError:
        pass
    print("📥 Setting up SALT library...")
    try:
        # Clone the SALT repo if it does not exist yet
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("🔄 Cloning SALT repository...")
            subprocess.check_call(
                ["git", "clone", "https://github.com/sunbirdai/salt.git"]
            )
        else:
            print("📁 SALT repository already exists")
        # Install SALT requirements
        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("📦 Installing SALT requirements...")
            subprocess.check_call(
                [
                    sys.executable,
                    "-m",
                    "pip",
                    "install",
                    "-q",
                    "-r",
                    str(salt_requirements),
                ]
            )
        # Add the SALT directory to the Python path
        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"🔗 Added {salt_path} to Python path")
        # Test the import
        import salt.dataset
        print("✅ SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"❌ Failed to set up SALT: {e}")
        return False


# Set up SALT on startup
print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
if not setup_salt():
    print("❌ Cannot continue without SALT library")
    print("💡 Please check that git is available and GitHub is accessible")
    sys.exit(1)
import gradio as gr
import pandas as pd
import json
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List
# Import our enhanced modules
from src.test_set import (
get_public_test_set_scientific,
get_complete_test_set_scientific,
create_test_set_download_scientific,
validate_test_set_integrity_scientific,
get_track_test_set,
)
from src.validation import validate_submission_scientific
from src.evaluation import (
evaluate_predictions_scientific,
generate_scientific_report,
compare_models_statistically,
)
from src.leaderboard import (
load_scientific_leaderboard,
add_model_to_scientific_leaderboard,
get_scientific_leaderboard_stats,
get_track_leaderboard,
prepare_track_leaderboard_display,
perform_fair_comparison,
export_scientific_leaderboard,
)
from src.plotting import (
create_scientific_leaderboard_plot,
create_language_pair_heatmap_scientific,
create_statistical_comparison_plot,
create_category_comparison_plot,
create_adequacy_analysis_plot,
create_cross_track_analysis_plot,
create_scientific_model_detail_plot,
)
from src.utils import (
sanitize_model_name,
get_all_language_pairs,
get_google_comparable_pairs,
get_track_language_pairs,
format_metric_value,
)
from config import *
# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None
test_set_stats = None
def initialize_scientific_data():
"""Initialize scientific test sets and leaderboard data."""
global public_test_set, complete_test_set, current_leaderboard, test_set_stats
try:
print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
# Load scientific test sets
print("πŸ“₯ Loading scientific test sets...")
public_test_set = get_public_test_set_scientific()
complete_test_set = get_complete_test_set_scientific()
# Load scientific leaderboard
print("πŸ† Loading scientific leaderboard...")
current_leaderboard = load_scientific_leaderboard()
# Validate test set integrity
print("πŸ” Validating test set integrity...")
test_set_stats = validate_test_set_integrity_scientific()
print(f"βœ… Scientific initialization complete!")
print(f" - Test set: {len(public_test_set):,} samples")
print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
print(
f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}"
)
print(f" - Current models: {len(current_leaderboard)}")
return True
except Exception as e:
print(f"❌ Scientific initialization failed: {e}")
traceback.print_exc()
return False
def download_scientific_test_set() -> Tuple[str, str]:
"""Create downloadable scientific test set and return file path and info."""
try:
global public_test_set
if public_test_set is None:
public_test_set = get_public_test_set_scientific()
# Create download file
download_path, stats = create_test_set_download_scientific()
# Create comprehensive info message
adequacy = stats.get("adequacy_assessment", "unknown")
        adequacy_emoji = {
            "excellent": "🟢",
            "good": "🟡",
            "fair": "🟠",
            "insufficient": "🔴",
            "unknown": "⚪",
        }.get(adequacy, "⚪")
        info_msg = f"""
## 📥 SALT Scientific Test Set Downloaded Successfully!
### 🔬 Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
### 📊 Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
- **Domains**: {', '.join(stats.get('domains', ['general']))}
### 🏁 Track Breakdown:
"""
        track_breakdown = stats.get("track_breakdown", {})
        for track_name, track_info in track_breakdown.items():
            status_emoji = (
                "✅" if track_info.get("statistical_adequacy", False) else "⚠️"
            )
            info_msg += f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
"""
        info_msg += f"""
### 📋 Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)
### 🔬 Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction` (see the sketch below)
3. **Optional**: Add a `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals
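For example, if `outputs` is a list of your model's translations aligned row-by-row with the downloaded file, a minimal pandas sketch (file names here are placeholders) is:
```python
import pandas as pd

test = pd.read_csv("salt_test_set.csv")  # the file downloaded above
test.assign(prediction=outputs)[["sample_id", "prediction"]].to_csv(
    "predictions.csv", index=False
)
```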
### 💡 Tips for Best Results:
- Ensure coverage of all language pairs for your chosen track
- Include confidence scores if available
- Provide a detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
"""
return download_path, info_msg
except Exception as e:
error_msg = f"❌ Error creating scientific test set download: {str(e)}"
return None, error_msg
def validate_scientific_submission(
file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
"""Validate uploaded prediction file with scientific rigor."""
try:
if file is None:
return "❌ Please upload a predictions file", None, "community"
if not model_name.strip():
return "❌ Please provide a model name", None, "community"
# Handle different file input types
if isinstance(file, bytes):
file_content = file
elif isinstance(file, str):
if os.path.exists(file):
with open(file, "rb") as f:
file_content = f.read()
else:
file_content = file.encode("utf-8")
elif hasattr(file, "name") and os.path.exists(file.name):
with open(file.name, "rb") as f:
file_content = f.read()
else:
return "❌ Could not read uploaded file", None, "community"
# Determine filename
filename = (
getattr(file, "name", None)
or getattr(file, "filename", None)
or "predictions.csv"
)
# Load test set if needed
global complete_test_set
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run enhanced scientific validation
validation_result = validate_submission_scientific(
file_content, filename, complete_test_set, model_name, author, description
)
detected_category = validation_result.get("category", "community")
if validation_result["valid"]:
return (
validation_result["report"],
validation_result["predictions"],
detected_category,
)
else:
return validation_result["report"], None, detected_category
except Exception as e:
return (
f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
None,
"community",
)
def evaluate_scientific_submission(
predictions_df: pd.DataFrame,
model_name: str,
author: str,
description: str,
detected_category: str,
validation_info: Dict,
) -> Tuple[str, pd.DataFrame, object, object]:
"""Evaluate validated predictions using scientific methodology."""
try:
if predictions_df is None:
return "❌ No valid predictions to evaluate", None, None, None
# Get complete test set with targets
global complete_test_set, current_leaderboard
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run scientific evaluation across all tracks
print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
evaluation_results = evaluate_predictions_scientific(
predictions_df, complete_test_set, detected_category
)
if any(
track_data.get("error")
for track_data in evaluation_results.get("tracks", {}).values()
):
errors = [
track_data["error"]
for track_data in evaluation_results["tracks"].values()
if track_data.get("error")
]
return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
# Add to scientific leaderboard
print("πŸ† Adding to scientific leaderboard...")
updated_leaderboard = add_model_to_scientific_leaderboard(
model_name=sanitize_model_name(model_name),
author=author or "Anonymous",
evaluation_results=evaluation_results,
model_category=detected_category,
description=description or "",
)
# Update global leaderboard
current_leaderboard = updated_leaderboard
# Generate scientific report
report = generate_scientific_report(evaluation_results, model_name)
# Create visualizations
summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
# Prepare display leaderboard (Google-comparable track by default)
google_leaderboard = get_track_leaderboard(
updated_leaderboard, "google_comparable"
)
display_leaderboard = prepare_track_leaderboard_display(
google_leaderboard, "google_comparable"
)
# Format success message with track-specific results
success_msg = f"""
## πŸŽ‰ Scientific Evaluation Complete!
### πŸ“Š Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}
### πŸ† Track Performance Summary:
"""
tracks = evaluation_results.get("tracks", {})
for track_name, track_data in tracks.items():
if not track_data.get("error"):
track_config = EVALUATION_TRACKS[track_name]
track_averages = track_data.get("track_averages", {})
summary = track_data.get("summary", {})
# Get rank in this track
track_leaderboard = get_track_leaderboard(
updated_leaderboard, track_name
)
if not track_leaderboard.empty:
model_row = track_leaderboard[
track_leaderboard["model_name"]
== sanitize_model_name(model_name)
]
rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
total_models = len(track_leaderboard)
else:
rank = "N/A"
total_models = 0
quality_score = track_averages.get("quality_score", 0)
bleu_score = track_averages.get("bleu", 0)
samples = summary.get("total_samples", 0)
success_msg += f"""
**🏁 {track_config['name']}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
"""
success_msg += f"""
### 🔬 Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
{report}
"""
return success_msg, display_leaderboard, summary_plot, cross_track_plot
except Exception as e:
error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
return error_msg, None, None, None
def refresh_track_leaderboard(
track: str,
search_query: str = "",
category_filter: str = "all",
min_adequacy: float = 0.0,
show_ci: bool = True,
) -> Tuple[pd.DataFrame, object, object, str]:
"""Refresh leaderboard for a specific track with filters."""
try:
global current_leaderboard
if current_leaderboard is None:
current_leaderboard = load_scientific_leaderboard()
# Get track-specific leaderboard
track_leaderboard = get_track_leaderboard(
current_leaderboard,
track,
category_filter=category_filter,
min_adequacy=min_adequacy,
)
# Apply search filter
if search_query:
query_lower = search_query.lower()
mask = track_leaderboard["model_name"].str.lower().str.contains(
query_lower, na=False
) | track_leaderboard["author"].str.lower().str.contains(
query_lower, na=False
)
track_leaderboard = track_leaderboard[mask]
# Prepare for display
display_df = prepare_track_leaderboard_display(track_leaderboard, track)
# Create plots
ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
# Get track statistics
track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
track_config = EVALUATION_TRACKS[track]
stats_text = f"""
### 📊 {track_config['name']} Statistics
- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
### 🔬 Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""
return display_df, ranking_plot, comparison_plot, stats_text
except Exception as e:
error_msg = f"Error loading {track} leaderboard: {str(e)}"
empty_df = pd.DataFrame()
return empty_df, None, None, error_msg
def get_scientific_model_details(
model_name: str, track: str
) -> Tuple[str, object, object]:
"""Get detailed scientific analysis for a specific model."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None, None
# Find model
model_row = current_leaderboard[current_leaderboard["model_name"] == model_name]
if model_row.empty:
return f"Model '{model_name}' not found", None, None
model_info = model_row.iloc[0]
# Parse detailed metrics for the requested track
try:
detailed_results = json.loads(model_info[f"detailed_{track}"])
        except (KeyError, TypeError, json.JSONDecodeError):
            detailed_results = {}
# Create detailed plots
detail_plot = create_scientific_model_detail_plot(
detailed_results, model_name, track
)
# Create language pair heatmap
heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
# Format model details with scientific information
track_config = EVALUATION_TRACKS[track]
category_info = MODEL_CATEGORIES.get(model_info["model_category"], {})
# Extract track-specific metrics
quality_col = f"{track}_quality"
bleu_col = f"{track}_bleu"
chrf_col = f"{track}_chrf"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
samples_col = f"{track}_samples"
pairs_col = f"{track}_pairs"
adequate_col = f"{track}_adequate"
details_text = f"""
## πŸ”¬ Scientific Model Analysis: {model_name}
### πŸ“‹ Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {model_info['submission_date'][:10]}
- **Description**: {model_info['description'] or 'No description provided'}
### 🏁 {track_config['name']} Performance:
- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}
### πŸ“Š Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'βœ… Yes' if model_info.get(adequate_col, False) else '❌ No'}
### πŸ”¬ Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
### πŸ“ˆ Cross-Track Performance:
"""
# Add other track performances for comparison
for other_track in EVALUATION_TRACKS.keys():
if other_track != track:
other_quality_col = f"{other_track}_quality"
other_adequate_col = f"{other_track}_adequate"
if model_info.get(other_adequate_col, False):
other_quality = model_info.get(other_quality_col, 0)
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
else:
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
details_text += f"""
### 💡 Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
"""
return details_text, detail_plot, heatmap_plot
except Exception as e:
error_msg = f"Error getting model details: {str(e)}"
return error_msg, None, None
def perform_model_comparison(
model_names: List[str], track: str, comparison_type: str = "statistical"
) -> Tuple[str, object]:
"""Perform scientific comparison between selected models."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None
if len(model_names) < 2:
return "Please select at least 2 models for comparison", None
# Get models
models = current_leaderboard[
current_leaderboard["model_name"].isin(model_names)
]
if len(models) < 2:
return "Selected models not found in leaderboard", None
# Perform fair comparison
comparison_result = perform_fair_comparison(current_leaderboard, model_names)
if comparison_result.get("error"):
return f"Comparison error: {comparison_result['error']}", None
# Create comparison visualization
if comparison_type == "statistical":
comparison_plot = create_statistical_comparison_plot(models, track)
else:
comparison_plot = create_category_comparison_plot(models, track)
# Format comparison report
track_config = EVALUATION_TRACKS[track]
comparison_text = f"""
## πŸ”¬ Scientific Model Comparison - {track_config['name']}
### πŸ“Š Models Compared:
"""
quality_col = f"{track}_quality"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
# Sort models by performance
models_sorted = models.sort_values(quality_col, ascending=False)
for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
category_info = MODEL_CATEGORIES.get(model["model_category"], {})
comparison_text += f"""
**#{i}. {model['model_name']}**
- Category: {category_info.get('name', 'Unknown')}
- Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
- Author: {model['author']}
"""
# Add statistical analysis
track_comparison = comparison_result.get("track_comparisons", {}).get(track, {})
if track_comparison:
comparison_text += f"""
### πŸ”¬ Statistical Analysis:
- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
- **Confidence intervals available**: Yes (95% level)
- **Fair comparison possible**: {'βœ… Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
"""
# Check for statistical significance (simplified)
quality_scores = list(track_comparison.get("quality_scores", {}).values())
if len(quality_scores) >= 2:
score_range = max(quality_scores) - min(quality_scores)
if score_range > 0.05: # 5% difference threshold
comparison_text += (
"- **Performance differences**: Potentially significant\n"
)
else:
comparison_text += "- **Performance differences**: Minimal\n"
# Add recommendations
recommendations = comparison_result.get("recommendations", [])
if recommendations:
            comparison_text += "\n### 💡 Recommendations:\n"
for rec in recommendations:
comparison_text += f"- {rec}\n"
return comparison_text, comparison_plot
except Exception as e:
error_msg = f"Error performing comparison: {str(e)}"
return error_msg, None

# Initialize data on startup (the startup banner was already printed above)
initialization_success = initialize_scientific_data()
# Create Gradio interface with scientific design
with gr.Blocks(
title=UI_CONFIG["title"],
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1600px !important;
margin: 0 auto;
}
.scientific-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.track-tab {
border-radius: 8px;
margin: 0.5rem;
padding: 1rem;
border: 2px solid transparent;
}
.track-tab.google-comparable {
border-color: #1f77b4;
background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
}
.track-tab.ug40-complete {
border-color: #ff7f0e;
background: linear-gradient(45deg, #fff7ed, #fed7aa);
}
.track-tab.language-pair-matrix {
border-color: #2ca02c;
background: linear-gradient(45deg, #f0fdf4, #dcfce7);
}
.metric-box {
background: #f8fafc;
padding: 1rem;
border-radius: 8px;
margin: 0.5rem 0;
border-left: 4px solid #3b82f6;
}
.scientific-note {
background: #fef3c7;
border: 1px solid #f59e0b;
border-radius: 8px;
padding: 1rem;
margin: 1rem 0;
}
.adequacy-excellent { border-left-color: #22c55e; }
.adequacy-good { border-left-color: #eab308; }
.adequacy-fair { border-left-color: #f97316; }
.adequacy-insufficient { border-left-color: #ef4444; }
""",
) as demo:
# Scientific Header
gr.HTML(
f"""
<div class="scientific-header">
<h1>🏆 SALT Translation Leaderboard - Scientific Edition</h1>
<p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
<p>Three-tier evaluation tracks • 95% confidence intervals • Research-grade analysis</p>
<p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
</div>
"""
)
# Status indicator
if initialization_success:
        status_msg = "✅ Scientific system initialized successfully"
adequacy_info = test_set_stats.get("scientific_adequacy", {}).get(
"overall_adequacy", "unknown"
)
status_msg += f" | Test set adequacy: {adequacy_info.title()}"
else:
status_msg = "❌ System initialization failed - some features may not work"
gr.Markdown(f"**System Status**: {status_msg}")
# Add scientific overview
gr.Markdown(
"""
## 🔬 Scientific Evaluation Framework
This leaderboard implements rigorous scientific methodology for translation model evaluation:
- **Three Evaluation Tracks**: Fair comparison across different model capabilities
- **Statistical Significance**: 95% confidence intervals and effect size analysis
- **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
- **Cross-Track Consistency**: Validate model performance across language sets
"""
)
with gr.Tabs():
# Tab 1: Download Test Set
        with gr.Tab("📥 Download Test Set", id="download"):
gr.Markdown(
"""
## 📋 Get the SALT Scientific Test Set
Download our scientifically designed test set with stratified sampling and statistical weighting.
"""
)
with gr.Row():
download_btn = gr.Button(
"πŸ“₯ Download Scientific Test Set", variant="primary", size="lg"
)
with gr.Row():
with gr.Column():
                    download_file = gr.File(label="📂 Test Set File", interactive=False)
with gr.Column():
download_info = gr.Markdown(label="ℹ️ Test Set Information")
# Tab 2: Submit Predictions
        with gr.Tab("🚀 Submit Predictions", id="submit"):
gr.Markdown(
"""
## 🎯 Submit Your Model's Predictions for Scientific Evaluation
Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
"""
)
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📝 Model Information")
                    model_name_input = gr.Textbox(
                        label="🤖 Model Name",
                        placeholder="e.g., MyTranslator-v2.0",
                        info="Unique name for your model",
                    )
                    author_input = gr.Textbox(
                        label="👤 Author/Organization",
                        placeholder="Your name or organization",
                        value="Anonymous",
                    )
                    description_input = gr.Textbox(
                        label="📄 Model Description",
                        placeholder="Architecture, training data, special features...",
                        lines=4,
                        info="Detailed description helps with proper categorization",
                    )
                    gr.Markdown("### 📤 Upload Predictions")
                    predictions_file = gr.File(
                        label="📂 Predictions File",
                        file_types=[".csv", ".tsv", ".json"],
                    )
                    validate_btn = gr.Button(
                        "✅ Validate Submission", variant="secondary"
                    )
                    submit_btn = gr.Button(
                        "🚀 Submit for Scientific Evaluation",
                        variant="primary",
                        interactive=False,
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Validation Results")
                    validation_output = gr.Markdown()
            # Results section
            gr.Markdown("### 🏆 Scientific Evaluation Results")
            with gr.Row():
                evaluation_output = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    submission_plot = gr.Plot(label="📈 Submission Analysis")
                with gr.Column():
                    cross_track_plot = gr.Plot(label="🔄 Cross-Track Analysis")
            with gr.Row():
                results_table = gr.Dataframe(
                    label="📊 Updated Leaderboard (Google-Comparable Track)",
                    interactive=False,
                )
# Tab 3: Google-Comparable Track
with gr.Tab(
"πŸ€– Google-Comparable Track",
id="google_track",
elem_classes=["track-tab", "google-comparable"],
):
gr.Markdown(
f"""
## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
**Fair comparison with commercial translation systems**
This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
enabling direct comparison with commercial baselines.
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Purpose**: Commercial system comparison and baseline establishment
- **Statistical Power**: High (optimized sample sizes)
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    google_search = gr.Textbox(
                        label="🔍 Search Models",
                        placeholder="Search by model name, author...",
                    )
                with gr.Column(scale=1):
                    google_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all",
                    )
                with gr.Column(scale=1):
                    google_adequacy = gr.Slider(
                        label="📊 Min Adequacy",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                    )
                with gr.Column(scale=1):
                    google_refresh = gr.Button("🔄 Refresh", variant="secondary")
            with gr.Row():
                google_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    google_ranking_plot = gr.Plot(label="🏆 Google-Comparable Rankings")
                with gr.Column():
                    google_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
            with gr.Row():
                google_leaderboard = gr.Dataframe(
                    label="📈 Google-Comparable Leaderboard", interactive=False
                )
# Tab 4: UG40-Complete Track
with gr.Tab(
"🌍 UG40-Complete Track",
id="ug40_track",
elem_classes=["track-tab", "ug40-complete"],
):
gr.Markdown(
f"""
## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
**Comprehensive evaluation across all Ugandan languages**
This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
providing the most comprehensive assessment of Ugandan language translation capabilities.
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Coverage**: Complete linguistic landscape of Uganda
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    ug40_search = gr.Textbox(
                        label="🔍 Search Models",
                        placeholder="Search by model name, author...",
                    )
                with gr.Column(scale=1):
                    ug40_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all",
                    )
                with gr.Column(scale=1):
                    ug40_adequacy = gr.Slider(
                        label="📊 Min Adequacy",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                    )
                with gr.Column(scale=1):
                    ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")
            with gr.Row():
                ug40_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    ug40_ranking_plot = gr.Plot(label="🏆 UG40-Complete Rankings")
                with gr.Column():
                    ug40_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
            with gr.Row():
                ug40_leaderboard = gr.Dataframe(
                    label="📈 UG40-Complete Leaderboard", interactive=False
                )
# Tab 5: Language-Pair Matrix
with gr.Tab(
"πŸ“Š Language-Pair Matrix",
id="matrix_track",
elem_classes=["track-tab", "language-pair-matrix"],
):
gr.Markdown(
f"""
## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
**Detailed language pair analysis with statistical significance**
This view provides granular analysis of model performance across individual language pairs
with statistical significance testing and effect size analysis.
- **Resolution**: Individual language pair performance
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing available
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    matrix_search = gr.Textbox(
                        label="🔍 Search Models",
                        placeholder="Search by model name, author...",
                    )
                with gr.Column(scale=1):
                    matrix_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all",
                    )
                with gr.Column(scale=1):
                    matrix_adequacy = gr.Slider(
                        label="📊 Min Adequacy",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                    )
                with gr.Column(scale=1):
                    matrix_refresh = gr.Button("🔄 Refresh", variant="secondary")
            with gr.Row():
                matrix_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    matrix_ranking_plot = gr.Plot(
                        label="🏆 Language-Pair Matrix Rankings"
                    )
                with gr.Column():
                    matrix_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
            with gr.Row():
                matrix_leaderboard = gr.Dataframe(
                    label="📈 Language-Pair Matrix Leaderboard", interactive=False
                )
# Tab 6: Model Analysis
        with gr.Tab("🔍 Scientific Model Analysis", id="analysis"):
gr.Markdown(
"""
## 🔬 Detailed Scientific Model Analysis
Comprehensive analysis of individual models with statistical confidence intervals,
cross-track performance, and detailed language pair breakdowns.
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    model_select = gr.Dropdown(
                        label="🤖 Select Model",
                        choices=[],
                        value=None,
                        info="Choose a model for detailed scientific analysis",
                    )
                with gr.Column(scale=1):
                    track_select = gr.Dropdown(
                        label="🏁 Analysis Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable",
                        info="Track for detailed analysis",
                    )
                with gr.Column(scale=1):
                    analyze_btn = gr.Button("🔍 Analyze", variant="primary")
            with gr.Row():
                model_details = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    model_analysis_plot = gr.Plot(
                        label="📊 Detailed Performance Analysis"
                    )
                with gr.Column():
                    model_heatmap_plot = gr.Plot(label="🗺️ Language Pair Heatmap")
# Tab 7: Model Comparison
        with gr.Tab("⚖️ Scientific Model Comparison", id="comparison"):
gr.Markdown(
"""
## 🔬 Scientific Model Comparison
Compare multiple models with statistical significance testing and fair comparison analysis.
Only models evaluated on the same language pairs are compared for scientific validity.
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    comparison_models = gr.CheckboxGroup(
                        label="🤖 Select Models to Compare",
                        choices=[],
                        value=[],
                        info="Select 2-6 models for comparison",
                    )
                with gr.Column(scale=1):
                    comparison_track = gr.Dropdown(
                        label="🏁 Comparison Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable",
                    )
                    comparison_type = gr.Radio(
                        label="📊 Comparison Type",
                        choices=["statistical", "category"],
                        value="statistical",
                    )
                    compare_btn = gr.Button("⚖️ Compare Models", variant="primary")
            with gr.Row():
                comparison_output = gr.Markdown()
            with gr.Row():
                comparison_plot = gr.Plot(label="📊 Model Comparison Analysis")
# Tab 8: Documentation
        with gr.Tab("📚 Scientific Documentation", id="docs"):
gr.Markdown(
f"""
# 📖 SALT Translation Leaderboard - Scientific Edition Documentation
## 🎯 Overview
The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
for translation models on Ugandan languages, designed for research publication and scientific analysis.
## 🔬 Scientific Methodology
### Three-Tier Evaluation System
**1. 🤖 Google-Comparable Track**
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Pairs**: {len(get_google_comparable_pairs())} language pairs
- **Purpose**: Fair comparison with commercial translation systems
- **Statistical Power**: High (≥200 samples per pair recommended)
**2. 🌍 UG40-Complete Track**
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Pairs**: {len(get_all_language_pairs())} language pairs
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Statistical Power**: Moderate (≥100 samples per pair recommended)
**3. 📊 Language-Pair Matrix**
- **Resolution**: Individual language pair analysis
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing with multiple comparison correction
### Statistical Rigor
- **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples; see the sketch below)
- **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
- **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
- **Statistical Power**: Estimated based on sample sizes and effect sizes
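For intuition, the percentile bootstrap behind these intervals can be sketched as follows; the resample count and seed here are illustrative assumptions, not the leaderboard's exact implementation:
```python
import numpy as np

def bootstrap_ci(scores, n_resamples=10_000, seed=0):
    # Resample per-sample metric values with replacement and take the
    # 2.5th/97.5th percentiles of the resampled means as a 95% CI.
    scores = np.asarray(scores, dtype=float)
    rng = np.random.default_rng(seed)
    idx = rng.integers(0, len(scores), size=(n_resamples, len(scores)))
    boot_means = scores[idx].mean(axis=1)
    return np.percentile(boot_means, [2.5, 97.5])
```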
### Model Categories
Models are automatically categorized for fair comparison:
- **🏢 Commercial**: Production translation systems (Google Translate, Azure, etc.)
- **🔬 Research**: Academic and research institution models (NLLB, M2M-100, etc.)
- **📊 Baseline**: Simple baseline and reference models
- **👥 Community**: User-submitted models and fine-tuned variants
## 📊 Evaluation Metrics
### Primary Metrics
- **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
- **BLEU**: Bilingual Evaluation Understudy (0-100)
- **ChrF**: Character-level F-score (0-1)
### Secondary Metrics
- **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
- **CER/WER**: Character/Word Error Rate (lower is better)
- **Length Ratio**: Prediction/reference length ratio
All metrics include 95% confidence intervals for statistical reliability.
## 🔄 Submission Process
### Step 1: Download Scientific Test Set
1. Click "Download Scientific Test Set" in the first tab
2. Review test set adequacy and track breakdown
3. Save the enhanced test set with statistical weights
### Step 2: Generate Predictions
1. Load the test set in your evaluation pipeline
2. For each row, translate `source_text` from `source_language` to `target_language`
3. Save results as CSV with columns: `sample_id`, `prediction` (a minimal sketch follows this list)
4. Optional: Add `category` column for automatic classification
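A minimal sketch of this loop, assuming pandas and a `translate()` function of your own (both are placeholders, not part of the leaderboard API):
```python
import pandas as pd

test = pd.read_csv("salt_test_set.csv")
# translate() stands in for your model's inference call
test["prediction"] = [
    translate(row.source_text, row.source_language, row.target_language)
    for row in test.itertuples()
]
test[["sample_id", "prediction"]].to_csv("predictions.csv", index=False)
```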
### Step 3: Submit & Evaluate
1. Fill in detailed model information (improves categorization)
2. Upload your predictions file
3. Review validation report with track-specific adequacy assessment
4. Submit for scientific evaluation across all tracks
## 📋 Enhanced File Formats
### Scientific Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
```
### Predictions Format
```csv
sample_id,prediction,category
salt_000001,"Amakuru ensi","community"
salt_000002,"Ibino nining?","community"
salt_000003,"Ejok nanu","community"
```
## 🏆 Scientific Leaderboard Features
### Fair Comparison
- Models only compared within the same category and track
- Statistical significance testing prevents misleading rankings
- Confidence intervals show measurement uncertainty
### Cross-Track Analysis
- Consistency analysis across evaluation tracks
- Identification of model strengths and weaknesses
- Language-specific performance patterns
### Publication Quality
- All visualizations include error bars and statistical annotations
- Comprehensive methodology documentation
- Reproducible evaluation pipeline
## 🔬 Statistical Interpretation Guide
### Confidence Intervals
- **Non-overlapping CIs**: Likely significant difference
- **Overlapping CIs**: May or may not be significant (requires formal testing)
- **Wide CIs**: High uncertainty (need more data)
### Effect Sizes
- **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
- **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
- **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
- **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
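These thresholds refer to the standard pooled-standard-deviation form of Cohen's d, sketched here (the leaderboard's exact implementation may differ):
```python
import numpy as np

def cohens_d(a, b):
    # Effect size between two arrays of per-sample scores.
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    pooled_var = ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) / (
        len(a) + len(b) - 2
    )
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)
```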
### Statistical Adequacy
- **Excellent**: High statistical power (>0.8) for all comparisons
- **Good**: Adequate power for most comparisons
- **Fair**: Limited power, interpret with caution
- **Insufficient**: Results not reliable for scientific conclusions
## 🤝 Contributing to Science
This leaderboard is designed for the research community. When using results:
1. **Always report confidence intervals** along with point estimates
2. **Acknowledge statistical adequacy** when interpreting results
3. **Use the appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
4. **Consider effect sizes**, not just statistical significance
## 📄 Citation
If you use this leaderboard in your research, please cite:
```bibtex
@misc{{salt_leaderboard_scientific_2024,
  title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
  author={{Sunbird AI}},
  year={{2024}},
  url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
  note={{Three-tier evaluation system with statistical significance testing}}
}}
```
## 🔗 Related Resources
- **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
- **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
- **Statistical Methodology**: See our technical paper on rigorous MT evaluation
- **Open Source Code**: Available on GitHub for reproducibility
---
*For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
"""
)
# Event handlers with enhanced scientific functionality
predictions_validated = gr.State(value=None)
validation_info_state = gr.State(value=None)
detected_category_state = gr.State(value="community")
# Download test set
download_btn.click(
fn=download_scientific_test_set, outputs=[download_file, download_info]
)
# Validate predictions
def handle_scientific_validation(file, model_name, author, description):
report, predictions, category = validate_scientific_submission(
file, model_name, author, description
)
# Enable button if predictions are available and format is valid
# This allows "can be evaluated with limitations" cases
can_evaluate = predictions is not None
# Additional check: ensure we have some basic validity
if can_evaluate and "❌ **Final Verdict**: Please address issues" in report:
can_evaluate = False
return (
report,
predictions,
{"category": category, "validation_passed": can_evaluate},
category,
gr.update(interactive=can_evaluate),
)
validate_btn.click(
fn=handle_scientific_validation,
inputs=[predictions_file, model_name_input, author_input, description_input],
outputs=[
validation_output,
predictions_validated,
validation_info_state,
detected_category_state,
submit_btn,
],
)
# Submit for evaluation
def handle_scientific_submission(
predictions, model_name, author, description, category, validation_info
):
if predictions is None:
return "❌ Please validate your submission first", None, None, None
return evaluate_scientific_submission(
predictions, model_name, author, description, category, validation_info
)
submit_btn.click(
fn=handle_scientific_submission,
inputs=[
predictions_validated,
model_name_input,
author_input,
description_input,
detected_category_state,
validation_info_state,
],
outputs=[evaluation_output, results_table, submission_plot, cross_track_plot],
)
# Track leaderboard refresh functions
def refresh_google_track(*args):
return refresh_track_leaderboard("google_comparable", *args)
def refresh_ug40_track(*args):
return refresh_track_leaderboard("ug40_complete", *args)
def refresh_matrix_track(*args):
return refresh_track_leaderboard("language_pair_matrix", *args)
# Google-Comparable Track
google_refresh.click(
fn=refresh_google_track,
inputs=[google_search, google_category, google_adequacy],
outputs=[
google_leaderboard,
google_ranking_plot,
google_comparison_plot,
google_stats,
],
)
# UG40-Complete Track
ug40_refresh.click(
fn=refresh_ug40_track,
inputs=[ug40_search, ug40_category, ug40_adequacy],
outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats],
)
# Language-Pair Matrix Track
matrix_refresh.click(
fn=refresh_matrix_track,
inputs=[matrix_search, matrix_category, matrix_adequacy],
outputs=[
matrix_leaderboard,
matrix_ranking_plot,
matrix_comparison_plot,
matrix_stats,
],
)
# Model analysis
analyze_btn.click(
fn=get_scientific_model_details,
inputs=[model_select, track_select],
outputs=[model_details, model_analysis_plot, model_heatmap_plot],
)
# Model comparison
compare_btn.click(
fn=perform_model_comparison,
inputs=[comparison_models, comparison_track, comparison_type],
outputs=[comparison_output, comparison_plot],
)
# Load initial data and update dropdowns
def load_initial_data():
# Load initial Google track data
google_data = refresh_google_track("", "all", 0.0)
# Update dropdown choices
if current_leaderboard is not None and not current_leaderboard.empty:
model_choices = current_leaderboard["model_name"].tolist()
else:
model_choices = []
return (
google_data[0], # google_leaderboard
google_data[1], # google_ranking_plot
google_data[2], # google_comparison_plot
google_data[3], # google_stats
gr.Dropdown(choices=model_choices), # model_select
gr.CheckboxGroup(choices=model_choices), # comparison_models
)
demo.load(
fn=load_initial_data,
outputs=[
google_leaderboard,
google_ranking_plot,
google_comparison_plot,
google_stats,
model_select,
comparison_models,
],
)
# Launch the scientific application
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)