Spaces:
Sleeping
Sleeping
# app.py | |
import subprocess | |
import sys | |
import os | |
from pathlib import Path | |
def setup_salt():
    """Make the SALT library importable, cloning and installing it on demand.

    First tries ``import salt.dataset``. If that fails, clones the SALT
    repository into ``./salt`` (unless that directory already exists),
    installs its ``requirements.txt`` with pip when the file is present, puts
    the checkout at the front of ``sys.path`` and retries the import.

    Returns:
        bool: True if ``salt.dataset`` could be imported, False if any setup
        step raised (the error is printed, not propagated).
    """
    import importlib

    try:
        # Fast path: nothing to do when SALT is already importable.
        import salt.dataset  # noqa: F401

        print("β SALT library already available")
        return True
    except ImportError:
        pass

    print("π₯ Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("π Cloning SALT repository...")
            # Name the target explicitly so the clone location always matches
            # the directory that was just checked.
            subprocess.check_call(
                [
                    "git",
                    "clone",
                    "https://github.com/sunbirdai/salt.git",
                    str(salt_dir),
                ]
            )
        else:
            print("π SALT repository already exists")

        # Install SALT requirements with the interpreter running this app.
        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("π¦ Installing SALT requirements...")
            subprocess.check_call(
                [
                    sys.executable,
                    "-m",
                    "pip",
                    "install",
                    "-q",
                    "-r",
                    str(salt_requirements),
                ]
            )

        # Add SALT directory to Python path
        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"π Added {salt_path} to Python path")

        # The failed import above may have left a partially imported ``salt``
        # in sys.modules (for example a namespace package resolved from the
        # bare ./salt directory). Drop it so the retry resolves against the
        # updated sys.path instead of the stale entry.
        stale_modules = [
            name
            for name in sys.modules
            if name == "salt" or name.startswith("salt.")
        ]
        for name in stale_modules:
            del sys.modules[name]
        # Files were created (clone, pip install) after the interpreter
        # started, so the import system's finder caches must be refreshed.
        importlib.invalidate_caches()

        # Test import
        import salt.dataset  # noqa: F401

        print("β SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"β Failed to setup SALT: {e}")
        return False
# SALT must be importable before the project modules below are imported;
# abort startup with a non-zero exit code when it cannot be set up.
print("π Starting SALT Translation Leaderboard - Scientific Edition...")
if not setup_salt():
    print(
        "β Cannot continue without SALT library",
        "π‘ Please check that git is available and GitHub is accessible",
        sep="\n",
    )
    sys.exit(1)
import gradio as gr | |
import pandas as pd | |
import json | |
import traceback | |
from datetime import datetime | |
from typing import Optional, Dict, Tuple, List | |
# Import our enhanced modules | |
from src.test_set import ( | |
get_public_test_set_scientific, | |
get_complete_test_set_scientific, | |
create_test_set_download_scientific, | |
validate_test_set_integrity_scientific, | |
get_track_test_set, | |
) | |
from src.validation import validate_submission_scientific | |
from src.evaluation import ( | |
evaluate_predictions_scientific, | |
generate_scientific_report, | |
compare_models_statistically, | |
) | |
from src.leaderboard import ( | |
load_scientific_leaderboard, | |
add_model_to_scientific_leaderboard, | |
get_scientific_leaderboard_stats, | |
get_track_leaderboard, | |
prepare_track_leaderboard_display, | |
perform_fair_comparison, | |
export_scientific_leaderboard, | |
) | |
from src.plotting import ( | |
create_scientific_leaderboard_plot, | |
create_language_pair_heatmap_scientific, | |
create_statistical_comparison_plot, | |
create_category_comparison_plot, | |
create_adequacy_analysis_plot, | |
create_cross_track_analysis_plot, | |
create_scientific_model_detail_plot, | |
) | |
from src.utils import ( | |
sanitize_model_name, | |
get_all_language_pairs, | |
get_google_comparable_pairs, | |
get_track_language_pairs, | |
format_metric_value, | |
) | |
from config import * | |
# Module-level caches. Each starts as None and is filled in by
# initialize_scientific_data() or lazily by the handler that first needs it.
current_leaderboard = None  # result of load_scientific_leaderboard()
public_test_set = None  # result of get_public_test_set_scientific()
complete_test_set = None  # result of get_complete_test_set_scientific()
test_set_stats = None  # result of validate_test_set_integrity_scientific()
def initialize_scientific_data():
    """Load the test sets and leaderboard into the module-level caches.

    Fills ``public_test_set``, ``complete_test_set``, ``current_leaderboard``
    and ``test_set_stats``, then prints a short summary of what was loaded.

    Returns:
        bool: True when every step succeeded; False when any step raised
        (the exception is printed together with its traceback).
    """
    global public_test_set, complete_test_set, current_leaderboard, test_set_stats

    try:
        print("π¬ Initializing SALT Translation Leaderboard - Scientific Edition...")

        # Test sets: the public one for download, the complete one for scoring.
        print("π₯ Loading scientific test sets...")
        public_test_set = get_public_test_set_scientific()
        complete_test_set = get_complete_test_set_scientific()

        # Previously submitted models.
        print("π Loading scientific leaderboard...")
        current_leaderboard = load_scientific_leaderboard()

        # Integrity statistics used for the summary below.
        print("π Validating test set integrity...")
        test_set_stats = validate_test_set_integrity_scientific()

        print("β Scientific initialization complete!")
        print(f" - Test set: {len(public_test_set):,} samples")
        print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
        adequacy_section = test_set_stats.get("scientific_adequacy", {})
        print(
            f" - Scientific adequacy: {adequacy_section.get('overall_adequacy', 'unknown')}"
        )
        print(f" - Current models: {len(current_leaderboard)}")
    except Exception as e:
        print(f"β Scientific initialization failed: {e}")
        traceback.print_exc()
        return False
    return True
def download_scientific_test_set() -> Tuple[str, str]:
    """Build the downloadable test set file and a Markdown summary of it.

    Returns:
        A ``(download_path, info_markdown)`` pair. If anything raises, the
        pair is ``(None, error_message)`` instead.
    """
    global public_test_set

    try:
        # Lazily populate the cache if startup initialisation did not.
        if public_test_set is None:
            public_test_set = get_public_test_set_scientific()

        download_path, stats = create_test_set_download_scientific()

        # Map the adequacy label onto a status marker for the summary.
        adequacy = stats.get("adequacy_assessment", "unknown")
        emoji_by_adequacy = {
            "excellent": "π’",
            "good": "π‘",
            "fair": "π ",
            "insufficient": "π΄",
            "unknown": "βͺ",
        }
        adequacy_emoji = emoji_by_adequacy.get(adequacy, "βͺ")

        # The message is assembled from sections and joined once at the end.
        sections = [
            f"""
## π₯ SALT Scientific Test Set Downloaded Successfully!
### π¬ Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
### π Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
- **Domains**: {', '.join(stats.get('domains', ['general']))}
### π Track Breakdown:
"""
        ]

        # One sub-section per evaluation track.
        for track_name, track_info in stats.get("track_breakdown", {}).items():
            if track_info.get("statistical_adequacy", False):
                status_emoji = "β "
            else:
                status_emoji = "β οΈ"
            sections.append(
                f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
"""
            )

        # Static description of the file format and the submission workflow.
        sections.append(
            """
### π Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)
### π¬ Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Optional**: Add `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals
### π‘ Tips for Best Results:
- Ensure coverage of all language pairs for chosen track
- Include confidence scores if available
- Provide detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
"""
        )

        return download_path, "".join(sections)
    except Exception as e:
        return None, f"β Error creating scientific test set download: {str(e)}"
def validate_scientific_submission(
    file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate an uploaded predictions file against the complete test set.

    Args:
        file: The upload, in any of the shapes handled below: raw ``bytes``,
            a ``str`` (a path to an existing file, otherwise treated as the
            file's text content), or an object with a ``name`` attribute
            pointing at an existing file.
        model_name: Name of the submitted model; must not be blank.
        author: Author or organisation, forwarded to the validator.
        description: Free-text model description, forwarded to the validator.

    Returns:
        ``(report, predictions, category)``. ``predictions`` is None unless
        the validator reports the submission as valid. ``category`` falls
        back to ``"community"`` when nothing was detected or on any error.
    """
    global complete_test_set

    try:
        if file is None:
            return "β Please upload a predictions file", None, "community"
        # Treat a missing name like a blank one instead of crashing on None.
        if not (model_name or "").strip():
            return "β Please provide a model name", None, "community"

        # Read the raw bytes, remembering the real path when we have one.
        filename = None
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
                # A plain string has no ``name`` attribute, so keep the path
                # here; otherwise the extension (.tsv/.json) would be lost
                # to the "predictions.csv" fallback below.
                filename = file
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "β Could not read uploaded file", None, "community"

        # Determine filename
        if filename is None:
            filename = (
                getattr(file, "name", None)
                or getattr(file, "filename", None)
                or "predictions.csv"
            )

        # Load test set if needed
        if complete_test_set is None:
            complete_test_set = get_complete_test_set_scientific()

        # Run enhanced scientific validation
        validation_result = validate_submission_scientific(
            file_content, filename, complete_test_set, model_name, author, description
        )
        detected_category = validation_result.get("category", "community")

        # Predictions are only handed on when the submission is valid.
        predictions = (
            validation_result["predictions"] if validation_result["valid"] else None
        )
        return validation_result["report"], predictions, detected_category
    except Exception as e:
        return (
            f"β Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
            None,
            "community",
        )
def evaluate_scientific_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
    validation_info: Dict,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Score validated predictions, add the model to the leaderboard, report.

    Args:
        predictions_df: Validated predictions; None short-circuits with an
            error message.
        model_name: Display name of the model (sanitised before it is stored
            or looked up in the leaderboard).
        author: Author or organisation; blank becomes "Anonymous".
        description: Free-text description; blank becomes "".
        detected_category: Category key passed to the evaluator and stored.
        validation_info: Mapping read for ``["adequacy"]["overall_adequate"]``
            in the final report; None is treated as empty.

    Returns:
        ``(message, display_leaderboard, summary_plot, cross_track_plot)``.
        On any failure the message describes the error and the other three
        items are None.
    """
    global complete_test_set, current_leaderboard

    try:
        if predictions_df is None:
            return "β No valid predictions to evaluate", None, None, None

        # Normalise inputs up front: the report below is built after the
        # leaderboard has been updated, so a late failure on a None mapping
        # would report "failed" for a submission that was in fact recorded.
        validation_info = validation_info or {}
        safe_name = sanitize_model_name(model_name)

        # Get complete test set with targets
        if complete_test_set is None:
            complete_test_set = get_complete_test_set_scientific()

        # Run scientific evaluation across all tracks
        print(f"π¬ Starting scientific evaluation for {model_name}...")
        evaluation_results = evaluate_predictions_scientific(
            predictions_df, complete_test_set, detected_category
        )
        tracks = evaluation_results.get("tracks", {})

        # Any per-track error aborts the submission before it is recorded.
        errors = [
            str(track_data["error"])
            for track_data in tracks.values()
            if track_data.get("error")
        ]
        if errors:
            return f"β Evaluation errors: {'; '.join(errors)}", None, None, None

        # Add to scientific leaderboard
        print("π Adding to scientific leaderboard...")
        updated_leaderboard = add_model_to_scientific_leaderboard(
            model_name=safe_name,
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or "",
        )
        # Update global leaderboard
        current_leaderboard = updated_leaderboard

        # Generate scientific report
        report = generate_scientific_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
        cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)

        # Prepare display leaderboard (Google-comparable track by default)
        google_leaderboard = get_track_leaderboard(
            updated_leaderboard, "google_comparable"
        )
        display_leaderboard = prepare_track_leaderboard_display(
            google_leaderboard, "google_comparable"
        )

        # Format success message with track-specific results
        success_msg = f"""
## π Scientific Evaluation Complete!
### π Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}
### π Track Performance Summary:
"""
        for track_name, track_data in tracks.items():
            if track_data.get("error"):
                continue

            # Fall back to the raw key so an unconfigured track cannot fail
            # the report after the model has already been recorded.
            track_label = EVALUATION_TRACKS.get(track_name, {}).get(
                "name", track_name
            )
            track_averages = track_data.get("track_averages", {})
            summary = track_data.get("summary", {})

            # Get rank in this track. The rank is the row position of the
            # model, so it does not depend on the frame's index labels.
            # NOTE(review): assumes get_track_leaderboard returns rows in
            # ranking order - confirm against that helper.
            track_leaderboard = get_track_leaderboard(
                updated_leaderboard, track_name
            )
            rank = "N/A"
            total_models = 0
            if not track_leaderboard.empty:
                is_this_model = (
                    track_leaderboard["model_name"] == safe_name
                ).to_numpy()
                positions = is_this_model.nonzero()[0]
                if positions.size:
                    rank = int(positions[0]) + 1
                total_models = len(track_leaderboard)

            quality_score = track_averages.get("quality_score", 0)
            bleu_score = track_averages.get("bleu", 0)
            samples = summary.get("total_samples", 0)
            success_msg += f"""
**π {track_label}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
"""

        success_msg += f"""
### π¬ Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
{report}
"""
        return success_msg, display_leaderboard, summary_plot, cross_track_plot
    except Exception as e:
        error_msg = f"β Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None
def refresh_track_leaderboard(
    track: str,
    search_query: str = "",
    category_filter: str = "all",
    min_adequacy: float = 0.0,
    show_ci: bool = True,
) -> Tuple[pd.DataFrame, object, object, str]:
    """Rebuild the table, plots and statistics text for one track.

    Args:
        track: Key into ``EVALUATION_TRACKS``.
        search_query: Case-insensitive literal substring matched against the
            ``model_name`` and ``author`` columns; blank disables the filter.
        category_filter: Forwarded to ``get_track_leaderboard``.
        min_adequacy: Forwarded to ``get_track_leaderboard``.
        show_ci: Accepted for interface compatibility; not read here.

    Returns:
        ``(display_df, ranking_plot, comparison_plot, stats_markdown)``. On
        any error: an empty DataFrame, two Nones and the error text.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            current_leaderboard = load_scientific_leaderboard()

        # Get track-specific leaderboard
        track_leaderboard = get_track_leaderboard(
            current_leaderboard,
            track,
            category_filter=category_filter,
            min_adequacy=min_adequacy,
        )

        # Apply search filter. The text comes straight from a user textbox,
        # so it is matched literally (regex=False): characters such as "("
        # or "+" would otherwise be parsed as a regular expression and could
        # raise. Empty frames are skipped because there is nothing to filter.
        query_lower = (search_query or "").strip().lower()
        if query_lower and not track_leaderboard.empty:
            name_hit = track_leaderboard["model_name"].str.lower().str.contains(
                query_lower, na=False, regex=False
            )
            author_hit = track_leaderboard["author"].str.lower().str.contains(
                query_lower, na=False, regex=False
            )
            track_leaderboard = track_leaderboard[name_hit | author_hit]

        # Prepare for display
        display_df = prepare_track_leaderboard_display(track_leaderboard, track)

        # Create plots
        ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
        comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)

        # Get track statistics
        track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
        track_config = EVALUATION_TRACKS[track]
        stats_text = f"""
### π {track_config['name']} Statistics
- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
### π¬ Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""
        return display_df, ranking_plot, comparison_plot, stats_text
    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        empty_df = pd.DataFrame()
        return empty_df, None, None, error_msg
def get_scientific_model_details(
    model_name: str, track: str
) -> Tuple[str, object, object]:
    """Build the detail text and plots for one model on one track.

    Args:
        model_name: Exact value of the leaderboard's ``model_name`` column.
        track: Key into ``EVALUATION_TRACKS``; also the prefix of the
            per-track columns (``{track}_quality``, ``{track}_bleu``, ...).

    Returns:
        ``(details_markdown, detail_plot, heatmap_plot)``. When the
        leaderboard is not loaded, the model is unknown, or anything raises,
        the first item is a short message and both plots are None.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            return "Leaderboard not loaded", None, None

        # Find model
        model_row = current_leaderboard[current_leaderboard["model_name"] == model_name]
        if model_row.empty:
            return f"Model '{model_name}' not found", None, None
        model_info = model_row.iloc[0]

        # Parse detailed metrics for the requested track. A missing column
        # (KeyError), a non-string cell (TypeError) or malformed JSON
        # (ValueError) all mean "no details"; anything else is left to the
        # outer handler instead of being silently swallowed.
        try:
            detailed_results = json.loads(model_info[f"detailed_{track}"])
        except (KeyError, TypeError, ValueError):
            detailed_results = {}

        # Create detailed plots
        detail_plot = create_scientific_model_detail_plot(
            detailed_results, model_name, track
        )
        # Create language pair heatmap
        heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)

        # Format model details with scientific information
        track_config = EVALUATION_TRACKS[track]
        category_info = MODEL_CATEGORIES.get(model_info["model_category"], {})

        # Extract track-specific metrics
        quality_col = f"{track}_quality"
        bleu_col = f"{track}_bleu"
        chrf_col = f"{track}_chrf"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"
        samples_col = f"{track}_samples"
        pairs_col = f"{track}_pairs"
        adequate_col = f"{track}_adequate"

        # Coerce to text before slicing so a non-string date value does not
        # abort the whole page; string values render exactly as before.
        submission_day = str(model_info["submission_date"])[:10]

        details_text = f"""
## π¬ Scientific Model Analysis: {model_name}
### π Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {submission_day}
- **Description**: {model_info['description'] or 'No description provided'}
### π {track_config['name']} Performance:
- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}
### π Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'β Yes' if model_info.get(adequate_col, False) else 'β No'}
### π¬ Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
### π Cross-Track Performance:
"""
        # Add other track performances for comparison
        for other_track in EVALUATION_TRACKS.keys():
            if other_track == track:
                continue
            other_quality_col = f"{other_track}_quality"
            other_adequate_col = f"{other_track}_adequate"
            if model_info.get(other_adequate_col, False):
                other_quality = model_info.get(other_quality_col, 0)
                details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
            else:
                details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"

        details_text += """
### π‘ Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
"""
        return details_text, detail_plot, heatmap_plot
    except Exception as e:
        error_msg = f"Error getting model details: {str(e)}"
        return error_msg, None, None
def perform_model_comparison(
    model_names: List[str], track: str, comparison_type: str = "statistical"
) -> Tuple[str, object]:
    """Compare the selected models on one track.

    Args:
        model_names: Values of the leaderboard's ``model_name`` column; at
            least two are required.
        track: Key into ``EVALUATION_TRACKS`` and prefix of the per-track
            columns.
        comparison_type: ``"statistical"`` selects the statistical comparison
            plot; any other value selects the category comparison plot.

    Returns:
        ``(comparison_markdown, comparison_plot)``. When a precondition is
        not met or anything raises, a short message and None.
    """
    global current_leaderboard

    try:
        # Guard clauses: every early exit returns a message and no plot.
        if current_leaderboard is None:
            return "Leaderboard not loaded", None
        if len(model_names) < 2:
            return "Please select at least 2 models for comparison", None

        selected = current_leaderboard["model_name"].isin(model_names)
        models = current_leaderboard[selected]
        if len(models) < 2:
            return "Selected models not found in leaderboard", None

        comparison_result = perform_fair_comparison(current_leaderboard, model_names)
        if comparison_result.get("error"):
            return f"Comparison error: {comparison_result['error']}", None

        # Pick the plot builder that matches the requested comparison type.
        build_plot = (
            create_statistical_comparison_plot
            if comparison_type == "statistical"
            else create_category_comparison_plot
        )
        comparison_plot = build_plot(models, track)

        track_config = EVALUATION_TRACKS[track]
        parts = [
            f"""
## π¬ Scientific Model Comparison - {track_config['name']}
### π Models Compared:
"""
        ]

        quality_col = f"{track}_quality"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        # Best quality first; the list position doubles as the rank.
        ranked = models.sort_values(quality_col, ascending=False)
        for position, (_, model) in enumerate(ranked.iterrows(), start=1):
            category_info = MODEL_CATEGORIES.get(model["model_category"], {})
            parts.append(
                f"""
**#{position}. {model['model_name']}**
- Category: {category_info.get('name', 'Unknown')}
- Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
- Author: {model['author']}
"""
            )

        # Statistical section, only when the comparison covers this track.
        track_comparison = comparison_result.get("track_comparisons", {}).get(track, {})
        if track_comparison:
            parts.append(
                f"""
### π¬ Statistical Analysis:
- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
- **Confidence intervals available**: Yes (95% level)
- **Fair comparison possible**: {'β Yes' if comparison_result.get('fair_comparison_possible', False) else 'β οΈ Limited'}
"""
            )
            # Simplified significance check: spread between best and worst
            # score compared against a fixed 0.05 threshold.
            quality_scores = list(track_comparison.get("quality_scores", {}).values())
            if len(quality_scores) >= 2:
                spread = max(quality_scores) - min(quality_scores)
                if spread > 0.05:
                    parts.append(
                        "- **Performance differences**: Potentially significant\n"
                    )
                else:
                    parts.append("- **Performance differences**: Minimal\n")

        recommendations = comparison_result.get("recommendations", [])
        if recommendations:
            parts.append("\n### π‘ Recommendations:\n")
            parts.extend(f"- {rec}\n" for rec in recommendations)

        return "".join(parts), comparison_plot
    except Exception as e:
        return f"Error performing comparison: {str(e)}", None
# Fill the module-level caches once at import time. The result is kept so the
# interface below can show whether initialisation succeeded.
print("π Starting SALT Translation Leaderboard - Scientific Edition...")
initialization_success = initialize_scientific_data()
# Create Gradio interface with scientific design | |
with gr.Blocks( | |
title=UI_CONFIG["title"], | |
theme=gr.themes.Soft(), | |
css=""" | |
.gradio-container { | |
max-width: 1600px !important; | |
margin: 0 auto; | |
} | |
.scientific-header { | |
text-align: center; | |
margin-bottom: 2rem; | |
padding: 2rem; | |
background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%); | |
color: white; | |
border-radius: 10px; | |
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); | |
} | |
.track-tab { | |
border-radius: 8px; | |
margin: 0.5rem; | |
padding: 1rem; | |
border: 2px solid transparent; | |
} | |
.track-tab.google-comparable { | |
border-color: #1f77b4; | |
background: linear-gradient(45deg, #f0f9ff, #e0f2fe); | |
} | |
.track-tab.ug40-complete { | |
border-color: #ff7f0e; | |
background: linear-gradient(45deg, #fff7ed, #fed7aa); | |
} | |
.track-tab.language-pair-matrix { | |
border-color: #2ca02c; | |
background: linear-gradient(45deg, #f0fdf4, #dcfce7); | |
} | |
.metric-box { | |
background: #f8fafc; | |
padding: 1rem; | |
border-radius: 8px; | |
margin: 0.5rem 0; | |
border-left: 4px solid #3b82f6; | |
} | |
.scientific-note { | |
background: #fef3c7; | |
border: 1px solid #f59e0b; | |
border-radius: 8px; | |
padding: 1rem; | |
margin: 1rem 0; | |
} | |
.adequacy-excellent { border-left-color: #22c55e; } | |
.adequacy-good { border-left-color: #eab308; } | |
.adequacy-fair { border-left-color: #f97316; } | |
.adequacy-insufficient { border-left-color: #ef4444; } | |
""", | |
) as demo: | |
# Scientific Header | |
gr.HTML( | |
f""" | |
<div class="scientific-header"> | |
<h1>π SALT Translation Leaderboard - Scientific Edition</h1> | |
<p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p> | |
<p>Three-tier evaluation tracks β’ 95% Confidence intervals β’ Research-grade analysis</p> | |
<p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p> | |
</div> | |
""" | |
) | |
# Status indicator | |
if initialization_success: | |
status_msg = "β Scientific system initialized successfully" | |
adequacy_info = test_set_stats.get("scientific_adequacy", {}).get( | |
"overall_adequacy", "unknown" | |
) | |
status_msg += f" | Test set adequacy: {adequacy_info.title()}" | |
else: | |
status_msg = "β System initialization failed - some features may not work" | |
gr.Markdown(f"**System Status**: {status_msg}") | |
# Add scientific overview | |
gr.Markdown( | |
""" | |
## π¬ Scientific Evaluation Framework | |
This leaderboard implements rigorous scientific methodology for translation model evaluation: | |
- **Three Evaluation Tracks**: Fair comparison across different model capabilities | |
- **Statistical Significance**: 95% confidence intervals and effect size analysis | |
- **Category-Based Analysis**: Commercial, Research, Baseline, and Community models | |
- **Cross-Track Consistency**: Validate model performance across language sets | |
""" | |
) | |
with gr.Tabs(): | |
# Tab 1: Download Test Set | |
with gr.Tab("π₯ Download Test Set", id="download"): | |
gr.Markdown( | |
""" | |
## π Get the SALT Scientific Test Set | |
Download our scientifically designed test set with stratified sampling and statistical weighting. | |
""" | |
) | |
with gr.Row(): | |
download_btn = gr.Button( | |
"π₯ Download Scientific Test Set", variant="primary", size="lg" | |
) | |
with gr.Row(): | |
with gr.Column(): | |
download_file = gr.File(label="π Test Set File", interactive=False) | |
with gr.Column(): | |
download_info = gr.Markdown(label="βΉοΈ Test Set Information") | |
# Tab 2: Submit Predictions | |
with gr.Tab("π Submit Predictions", id="submit"): | |
gr.Markdown( | |
""" | |
## π― Submit Your Model's Predictions for Scientific Evaluation | |
Upload predictions for comprehensive evaluation across all three tracks with statistical analysis. | |
""" | |
) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
gr.Markdown("### π Model Information") | |
model_name_input = gr.Textbox( | |
label="π€ Model Name", | |
placeholder="e.g., MyTranslator-v2.0", | |
info="Unique name for your model", | |
) | |
author_input = gr.Textbox( | |
label="π€ Author/Organization", | |
placeholder="Your name or organization", | |
value="Anonymous", | |
) | |
description_input = gr.Textbox( | |
label="π Model Description", | |
placeholder="Architecture, training data, special features...", | |
lines=4, | |
info="Detailed description helps with proper categorization", | |
) | |
gr.Markdown("### π€ Upload Predictions") | |
predictions_file = gr.File( | |
label="π Predictions File", | |
file_types=[".csv", ".tsv", ".json"], | |
) | |
validate_btn = gr.Button( | |
"β Validate Submission", variant="secondary" | |
) | |
submit_btn = gr.Button( | |
"π Submit for Scientific Evaluation", | |
variant="primary", | |
interactive=False, | |
) | |
with gr.Column(scale=1): | |
gr.Markdown("### π Validation Results") | |
validation_output = gr.Markdown() | |
# Results section | |
gr.Markdown("### π Scientific Evaluation Results") | |
with gr.Row(): | |
evaluation_output = gr.Markdown() | |
with gr.Row(): | |
with gr.Column(): | |
submission_plot = gr.Plot(label="π Submission Analysis") | |
with gr.Column(): | |
cross_track_plot = gr.Plot(label="π Cross-Track Analysis") | |
with gr.Row(): | |
results_table = gr.Dataframe( | |
label="π Updated Leaderboard (Google-Comparable Track)", | |
interactive=False, | |
) | |
# Tab 3: Google-Comparable Track | |
with gr.Tab( | |
"π€ Google-Comparable Track", | |
id="google_track", | |
elem_classes=["track-tab", "google-comparable"], | |
): | |
gr.Markdown( | |
f""" | |
## {UI_CONFIG['tracks']['google_comparable']['tab_name']} | |
**Fair comparison with commercial translation systems** | |
This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate, | |
enabling direct comparison with commercial baselines. | |
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])} | |
- **Purpose**: Commercial system comparison and baseline establishment | |
- **Statistical Power**: High (optimized sample sizes) | |
""" | |
) | |
with gr.Row(): | |
with gr.Column(scale=2): | |
google_search = gr.Textbox( | |
label="π Search Models", | |
placeholder="Search by model name, author...", | |
) | |
with gr.Column(scale=1): | |
google_category = gr.Dropdown( | |
label="π·οΈ Category Filter", | |
choices=["all"] + list(MODEL_CATEGORIES.keys()), | |
value="all", | |
) | |
with gr.Column(scale=1): | |
google_adequacy = gr.Slider( | |
label="π Min Adequacy", | |
minimum=0.0, | |
maximum=1.0, | |
value=0.0, | |
step=0.1, | |
) | |
with gr.Column(scale=1): | |
google_refresh = gr.Button("π Refresh", variant="secondary") | |
with gr.Row(): | |
google_stats = gr.Markdown() | |
with gr.Row(): | |
with gr.Column(): | |
google_ranking_plot = gr.Plot(label="π Google-Comparable Rankings") | |
with gr.Column(): | |
google_comparison_plot = gr.Plot(label="π Statistical Comparison") | |
with gr.Row(): | |
google_leaderboard = gr.Dataframe( | |
label="π Google-Comparable Leaderboard", interactive=False | |
) | |
# Tab 4: UG40-Complete Track
# Declares the filter controls, stats panel, plots and table for the
# "ug40_complete" track. The `ug40_refresh.click(...)` registration further
# down in this file feeds the search box, category dropdown and adequacy
# slider to `refresh_ug40_track` and writes the table, both plots and the
# stats panel, so the variable names assigned here must stay as they are.
# NOTE(review): nesting was rebuilt from the `with` statements because the
# captured text had lost its indentation — confirm against the real file.
with gr.Tab(
    "π UG40-Complete Track",
    id="ug40_track",
    elem_classes=["track-tab", "ug40-complete"],
):
    # Intro text; the counts are computed when the UI is built.
    gr.Markdown(
        f"""
## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
**Comprehensive evaluation across all Ugandan languages**
This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
providing the most comprehensive assessment of Ugandan language translation capabilities.
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Coverage**: Complete linguistic landscape of Uganda
"""
    )
    # Filter row: free-text search, category filter, adequacy floor, refresh.
    with gr.Row():
        with gr.Column(scale=2):
            ug40_search = gr.Textbox(
                label="π Search Models",
                placeholder="Search by model name, author...",
            )
        with gr.Column(scale=1):
            ug40_category = gr.Dropdown(
                label="π·οΈ Category Filter",
                # "all" is the no-filter choice, followed by every category key.
                choices=["all"] + list(MODEL_CATEGORIES.keys()),
                value="all",
            )
        with gr.Column(scale=1):
            ug40_adequacy = gr.Slider(
                label="π Min Adequacy",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.1,
            )
        with gr.Column(scale=1):
            ug40_refresh = gr.Button("π Refresh", variant="secondary")
    # Summary text for the track (filled by the refresh handler).
    with gr.Row():
        ug40_stats = gr.Markdown()
    # Two plots side by side.
    with gr.Row():
        with gr.Column():
            ug40_ranking_plot = gr.Plot(label="π UG40-Complete Rankings")
        with gr.Column():
            ug40_comparison_plot = gr.Plot(label="π Statistical Comparison")
    # Read-only leaderboard table.
    with gr.Row():
        ug40_leaderboard = gr.Dataframe(
            label="π UG40-Complete Leaderboard", interactive=False
        )
# Tab 5: Language-Pair Matrix
# Declares the filter controls, stats panel, plots and table for the
# "language_pair_matrix" track. The `matrix_refresh.click(...)` registration
# further down in this file feeds the three filters to `refresh_matrix_track`
# and writes the table, both plots and the stats panel, so the variable names
# assigned here must stay as they are.
# NOTE(review): nesting was rebuilt from the `with` statements because the
# captured text had lost its indentation — confirm against the real file.
with gr.Tab(
    "π Language-Pair Matrix",
    id="matrix_track",
    elem_classes=["track-tab", "language-pair-matrix"],
):
    gr.Markdown(
        f"""
## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
**Detailed language pair analysis with statistical significance**
This view provides granular analysis of model performance across individual language pairs
with statistical significance testing and effect size analysis.
- **Resolution**: Individual language pair performance
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing available
"""
    )
    # Filter row: free-text search, category filter, adequacy floor, refresh.
    with gr.Row():
        with gr.Column(scale=2):
            matrix_search = gr.Textbox(
                label="π Search Models",
                placeholder="Search by model name, author...",
            )
        with gr.Column(scale=1):
            matrix_category = gr.Dropdown(
                label="π·οΈ Category Filter",
                # "all" is the no-filter choice, followed by every category key.
                choices=["all"] + list(MODEL_CATEGORIES.keys()),
                value="all",
            )
        with gr.Column(scale=1):
            matrix_adequacy = gr.Slider(
                label="π Min Adequacy",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.1,
            )
        with gr.Column(scale=1):
            matrix_refresh = gr.Button("π Refresh", variant="secondary")
    # Summary text for the track (filled by the refresh handler).
    with gr.Row():
        matrix_stats = gr.Markdown()
    # Two plots side by side.
    with gr.Row():
        with gr.Column():
            matrix_ranking_plot = gr.Plot(
                label="π Language-Pair Matrix Rankings"
            )
        with gr.Column():
            matrix_comparison_plot = gr.Plot(label="π Statistical Comparison")
    # Read-only leaderboard table.
    with gr.Row():
        matrix_leaderboard = gr.Dataframe(
            label="π Language-Pair Matrix Leaderboard", interactive=False
        )
# Tab 6: Model Analysis
# Single-model drill-down. `analyze_btn.click(...)` (registered further down
# in this file) passes `model_select` and `track_select` to
# `get_scientific_model_details` and writes `model_details`,
# `model_analysis_plot` and `model_heatmap_plot`.
# NOTE(review): nesting was rebuilt from the `with` statements because the
# captured text had lost its indentation — confirm against the real file.
with gr.Tab("π Scientific Model Analysis", id="analysis"):
    gr.Markdown(
        """
## π¬ Detailed Scientific Model Analysis
Comprehensive analysis of individual models with statistical confidence intervals,
cross-track performance, and detailed language pair breakdowns.
"""
    )
    with gr.Row():
        with gr.Column(scale=2):
            # Starts empty; `load_initial_data` supplies the choices when the
            # page loads (it is listed as an output of `demo.load`).
            model_select = gr.Dropdown(
                label="π€ Select Model",
                choices=[],
                value=None,
                info="Choose a model for detailed scientific analysis",
            )
        with gr.Column(scale=1):
            track_select = gr.Dropdown(
                label="π Analysis Track",
                choices=list(EVALUATION_TRACKS.keys()),
                value="google_comparable",
                info="Track for detailed analysis",
            )
        with gr.Column(scale=1):
            analyze_btn = gr.Button("π Analyze", variant="primary")
    # Textual report for the selected model.
    with gr.Row():
        model_details = gr.Markdown()
    # Two plots side by side.
    with gr.Row():
        with gr.Column():
            model_analysis_plot = gr.Plot(
                label="π Detailed Performance Analysis"
            )
        with gr.Column():
            model_heatmap_plot = gr.Plot(label="πΊοΈ Language Pair Heatmap")
# Tab 7: Model Comparison
# Multi-model comparison. `compare_btn.click(...)` (registered further down
# in this file) passes `comparison_models`, `comparison_track` and
# `comparison_type` to `perform_model_comparison` and writes
# `comparison_output` and `comparison_plot`.
# NOTE(review): nesting was rebuilt from the `with` statements because the
# captured text had lost its indentation. The radio and the button are
# assumed to share the track dropdown's column (no other `with` separates
# them) — confirm against the real file.
with gr.Tab("βοΈ Scientific Model Comparison", id="comparison"):
    gr.Markdown(
        """
## π¬ Scientific Model Comparison
Compare multiple models with statistical significance testing and fair comparison analysis.
Only models evaluated on the same language pairs are compared for scientific validity.
"""
    )
    with gr.Row():
        with gr.Column(scale=2):
            # Starts empty; `load_initial_data` supplies the choices when the
            # page loads. The "2-6 models" hint is informational only here.
            comparison_models = gr.CheckboxGroup(
                label="π€ Select Models to Compare",
                choices=[],
                value=[],
                info="Select 2-6 models for comparison",
            )
        with gr.Column(scale=1):
            comparison_track = gr.Dropdown(
                label="π Comparison Track",
                choices=list(EVALUATION_TRACKS.keys()),
                value="google_comparable",
            )
            comparison_type = gr.Radio(
                label="π Comparison Type",
                choices=["statistical", "category"],
                value="statistical",
            )
            compare_btn = gr.Button("βοΈ Compare Models", variant="primary")
    # Textual comparison report.
    with gr.Row():
        comparison_output = gr.Markdown()
    # Comparison figure.
    with gr.Row():
        comparison_plot = gr.Plot(label="π Model Comparison Analysis")
# Tab 8: Documentation
# Static help page rendered from one f-string. Values in single braces
# (language names, pair counts, STATISTICAL_CONFIG settings) are interpolated
# once when the UI is built; doubled braces in the BibTeX entry are literal.
# NOTE(review): the contact address in the last line of the text reads
# "[email protected]", which looks like e-mail-obfuscation residue from the
# page this file was captured from — restore the real address.
# NOTE(review): blank lines between Markdown sections appear to have been
# lost in capture; the text is kept exactly as found.
with gr.Tab("π Scientific Documentation", id="docs"):
    gr.Markdown(
        f"""
# π SALT Translation Leaderboard - Scientific Edition Documentation
## π― Overview
The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
for translation models on Ugandan languages, designed for research publication and scientific analysis.
## π¬ Scientific Methodology
### Three-Tier Evaluation System
**1. π€ Google-Comparable Track**
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Pairs**: {len(get_google_comparable_pairs())} language pairs
- **Purpose**: Fair comparison with commercial translation systems
- **Statistical Power**: High (β₯200 samples per pair recommended)
**2. π UG40-Complete Track**
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Pairs**: {len(get_all_language_pairs())} language pairs
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Statistical Power**: Moderate (β₯100 samples per pair recommended)
**3. π Language-Pair Matrix**
- **Resolution**: Individual language pair analysis
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing with multiple comparison correction
### Statistical Rigor
- **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
- **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
- **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
- **Statistical Power**: Estimated based on sample sizes and effect sizes
### Model Categories
Models are automatically categorized for fair comparison:
- **π’ Commercial**: Production translation systems (Google Translate, Azure, etc.)
- **π¬ Research**: Academic and research institution models (NLLB, M2M-100, etc.)
- **π Baseline**: Simple baseline and reference models
- **π₯ Community**: User-submitted models and fine-tuned variants
## π Evaluation Metrics
### Primary Metrics
- **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
- **BLEU**: Bilingual Evaluation Understudy (0-100)
- **ChrF**: Character-level F-score (0-1)
### Secondary Metrics
- **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
- **CER/WER**: Character/Word Error Rate (lower is better)
- **Length Ratio**: Prediction/reference length ratio
All metrics include 95% confidence intervals for statistical reliability.
## π Submission Process
### Step 1: Download Scientific Test Set
1. Click "Download Scientific Test Set" in the first tab
2. Review test set adequacy and track breakdown
3. Save the enhanced test set with statistical weights
### Step 2: Generate Predictions
1. Load the test set in your evaluation pipeline
2. For each row, translate `source_text` from `source_language` to `target_language`
3. Save results as CSV with columns: `sample_id`, `prediction`
4. Optional: Add `category` column for automatic classification
### Step 3: Submit & Evaluate
1. Fill in detailed model information (improves categorization)
2. Upload your predictions file
3. Review validation report with track-specific adequacy assessment
4. Submit for scientific evaluation across all tracks
## π Enhanced File Formats
### Scientific Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
```
### Predictions Format
```csv
sample_id,prediction,category
salt_000001,"Amakuru ensi","community"
salt_000002,"Ibino nining?","community"
salt_000003,"Ejok nanu","community"
```
## π Scientific Leaderboard Features
### Fair Comparison
- Models only compared within the same category and track
- Statistical significance testing prevents misleading rankings
- Confidence intervals show measurement uncertainty
### Cross-Track Analysis
- Consistency analysis across evaluation tracks
- Identification of model strengths and weaknesses
- Language-specific performance patterns
### Publication Quality
- All visualizations include error bars and statistical annotations
- Comprehensive methodology documentation
- Reproducible evaluation pipeline
## π¬ Statistical Interpretation Guide
### Confidence Intervals
- **Non-overlapping CIs**: Likely significant difference
- **Overlapping CIs**: May or may not be significant (requires formal testing)
- **Wide CIs**: High uncertainty (need more data)
### Effect Sizes
- **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
- **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
- **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
- **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
### Statistical Adequacy
- **Excellent**: High statistical power (>0.8) for all comparisons
- **Good**: Adequate power for most comparisons
- **Fair**: Limited power, interpret with caution
- **Insufficient**: Results not reliable for scientific conclusions
## π€ Contributing to Science
This leaderboard is designed for the research community. When using results:
1. **Always report confidence intervals** along with point estimates
2. **Acknowledge statistical adequacy** when interpreting results
3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
4. **Consider effect sizes** not just statistical significance
## π Citation
If you use this leaderboard in your research, please cite:
```bibtex
@misc{{salt_leaderboard_scientific_2024,
title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
author={{Sunbird AI}},
year={{2024}},
url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
note={{Three-tier evaluation system with statistical significance testing}}
}}
```
## π Related Resources
- **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
- **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
- **Statistical Methodology**: See our technical paper on rigorous MT evaluation
- **Open Source Code**: Available on GitHub for reproducibility
---
*For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
"""
    )
# Event handlers with enhanced scientific functionality
# Per-session state carried from the "validate" step to the "submit" step:
#   predictions_validated   - the `predictions` value returned by validation
#   validation_info_state   - dict with "category" and "validation_passed"
#   detected_category_state - category string, "community" until validated
predictions_validated = gr.State(value=None)
validation_info_state = gr.State(value=None)
detected_category_state = gr.State(value="community")
# Download test set: the handler takes no inputs and fills the file widget
# and its accompanying info panel.
download_btn.click(
    fn=download_scientific_test_set, outputs=[download_file, download_info]
)
# Validate predictions
def handle_scientific_validation(file, model_name, author, description):
    """Validate an uploaded predictions file and decide whether it may be submitted.

    Delegates to `validate_scientific_submission`, then derives a single
    go/no-go flag that is stored in state and used to enable the submit button.

    Args:
        file: Uploaded predictions file, passed through to the validator.
        model_name: Model name entered by the user.
        author: Author entered by the user.
        description: Model description entered by the user.

    Returns:
        A 5-tuple matching the outputs of `validate_btn.click`:
        (report, predictions, info dict with "category" and
        "validation_passed", category, update toggling the submit button).
    """
    report, predictions, category = validate_scientific_submission(
        file, model_name, author, description
    )
    # Submission is allowed whenever predictions were produced — this keeps
    # "can be evaluated with limitations" cases open — unless the report
    # carries the explicit failing final verdict. The marker is matched
    # verbatim against the report text, so it must stay in sync with the
    # validator's wording. The report is inspected only when predictions
    # exist (short-circuit).
    can_evaluate = (
        predictions is not None
        and "β **Final Verdict**: Please address issues" not in report
    )
    return (
        report,
        predictions,
        {"category": category, "validation_passed": can_evaluate},
        category,
        gr.update(interactive=can_evaluate),
    )
# Wire the validate button. The output order must match the 5-tuple returned
# by `handle_scientific_validation`: report, predictions, info dict,
# category, submit-button update.
validate_btn.click(
    fn=handle_scientific_validation,
    inputs=[predictions_file, model_name_input, author_input, description_input],
    outputs=[
        validation_output,
        predictions_validated,
        validation_info_state,
        detected_category_state,
        submit_btn,
    ],
)
# Submit for evaluation
def handle_scientific_submission(
    predictions, model_name, author, description, category, validation_info
):
    """Run the evaluation for a previously validated submission.

    Args:
        predictions: Validated predictions held in state, or None when the
            validation step has not produced any.
        model_name: Model name entered by the user.
        author: Author entered by the user.
        description: Model description entered by the user.
        category: Category stored by the validation step.
        validation_info: Info dict stored by the validation step.

    Returns:
        Four values matching the outputs of `submit_btn.click` (message,
        results table, submission plot, cross-track plot). When there are no
        predictions, only the message is set and the other three are None;
        otherwise the result of `evaluate_scientific_submission` is returned
        unchanged.
    """
    # Guard: nothing to evaluate until validation has stored predictions.
    if predictions is None:
        return "β Please validate your submission first", None, None, None
    return evaluate_scientific_submission(
        predictions, model_name, author, description, category, validation_info
    )
# Wire the submit button. The input order must match the parameter order of
# `handle_scientific_submission`; the first, fifth and sixth inputs come from
# the state holders filled by the validation step.
submit_btn.click(
    fn=handle_scientific_submission,
    inputs=[
        predictions_validated,
        model_name_input,
        author_input,
        description_input,
        detected_category_state,
        validation_info_state,
    ],
    outputs=[evaluation_output, results_table, submission_plot, cross_track_plot],
)
# Track leaderboard refresh functions
def refresh_google_track(*args):
    """Refresh the "google_comparable" track.

    Forwards all positional arguments unchanged to
    `refresh_track_leaderboard` after the track key and returns its result.
    """
    return refresh_track_leaderboard("google_comparable", *args)
def refresh_ug40_track(*args):
    """Refresh the "ug40_complete" track.

    Forwards all positional arguments unchanged to
    `refresh_track_leaderboard` after the track key and returns its result.
    """
    return refresh_track_leaderboard("ug40_complete", *args)
def refresh_matrix_track(*args):
    """Refresh the "language_pair_matrix" track.

    Forwards all positional arguments unchanged to
    `refresh_track_leaderboard` after the track key and returns its result.
    """
    return refresh_track_leaderboard("language_pair_matrix", *args)
# Each track's refresh button sends (search, category, adequacy) to its
# wrapper and writes (leaderboard, ranking plot, comparison plot, stats),
# in that order.

# Google-Comparable Track
google_refresh.click(
    fn=refresh_google_track,
    inputs=[google_search, google_category, google_adequacy],
    outputs=[
        google_leaderboard,
        google_ranking_plot,
        google_comparison_plot,
        google_stats,
    ],
)
# UG40-Complete Track
ug40_refresh.click(
    fn=refresh_ug40_track,
    inputs=[ug40_search, ug40_category, ug40_adequacy],
    outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats],
)
# Language-Pair Matrix Track
matrix_refresh.click(
    fn=refresh_matrix_track,
    inputs=[matrix_search, matrix_category, matrix_adequacy],
    outputs=[
        matrix_leaderboard,
        matrix_ranking_plot,
        matrix_comparison_plot,
        matrix_stats,
    ],
)
# Model analysis: selected model + track -> report text and two plots.
analyze_btn.click(
    fn=get_scientific_model_details,
    inputs=[model_select, track_select],
    outputs=[model_details, model_analysis_plot, model_heatmap_plot],
)
# Model comparison: selected models + track + mode -> report text and plot.
compare_btn.click(
    fn=perform_model_comparison,
    inputs=[comparison_models, comparison_track, comparison_type],
    outputs=[comparison_output, comparison_plot],
)
# Load initial data and update dropdowns
def load_initial_data():
    """Build the values shown when the page first loads.

    Refreshes the Google-Comparable track with neutral filters (empty search,
    category "all", adequacy 0.0) and collects the model names offered in the
    analysis and comparison selectors.

    Returns:
        A 6-tuple matching the outputs of `demo.load`: the four Google-track
        results (leaderboard, ranking plot, comparison plot, stats) followed
        by a Dropdown and a CheckboxGroup carrying the model choices.
    """
    # Load initial Google track data.
    # NOTE(review): this call runs before `current_leaderboard` is read
    # below; presumably the refresh may update that module-level value, so
    # keep this ordering — confirm against `refresh_track_leaderboard`.
    google_data = refresh_google_track("", "all", 0.0)
    # Update dropdown choices; fall back to no choices when the leaderboard
    # is missing or has no rows.
    if current_leaderboard is not None and not current_leaderboard.empty:
        model_choices = current_leaderboard["model_name"].tolist()
    else:
        model_choices = []
    return (
        google_data[0],  # google_leaderboard
        google_data[1],  # google_ranking_plot
        google_data[2],  # google_comparison_plot
        google_data[3],  # google_stats
        gr.Dropdown(choices=model_choices),  # model_select
        gr.CheckboxGroup(choices=model_choices),  # comparison_models
    )
# Populate the Google-Comparable tab and both model selectors on page load.
# The output order must match the 6-tuple returned by `load_initial_data`.
demo.load(
    fn=load_initial_data,
    outputs=[
        google_leaderboard,
        google_ranking_plot,
        google_comparison_plot,
        google_stats,
        model_select,
        comparison_models,
    ],
)
# Launch the scientific application
if __name__ == "__main__":
    # Listen on every interface at port 7860, without a public share link,
    # and with show_error enabled.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)