# app.py
import subprocess
import sys
import os
from pathlib import Path

def setup_salt():
    """Clone and set up the SALT library, as in the Colab workflow."""
    try:
        # Check if salt is already available
        import salt.dataset
        print("✅ SALT library already available")
        return True
    except ImportError:
        pass
    print("📥 Setting up SALT library...")
    try:
        # Clone the SALT repo if it does not exist yet
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("🔄 Cloning SALT repository...")
            subprocess.check_call(
                ["git", "clone", "https://github.com/sunbirdai/salt.git"]
            )
        else:
            print("📁 SALT repository already exists")
        # Install SALT requirements
        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("📦 Installing SALT requirements...")
            subprocess.check_call(
                [
                    sys.executable,
                    "-m",
                    "pip",
                    "install",
                    "-q",
                    "-r",
                    str(salt_requirements),
                ]
            )
        # Add the SALT directory to the Python path
        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"🔗 Added {salt_path} to Python path")
        # Test the import
        import salt.dataset
        print("✅ SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"❌ Failed to set up SALT: {e}")
        return False


# Set up SALT on startup
print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
if not setup_salt():
    print("❌ Cannot continue without SALT library")
    print("💡 Please check that git is available and GitHub is accessible")
    sys.exit(1)
import gradio as gr
import pandas as pd
import json
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List
# Import our enhanced modules
from src.test_set import (
get_public_test_set_scientific,
get_complete_test_set_scientific,
create_test_set_download_scientific,
validate_test_set_integrity_scientific,
get_track_test_set,
)
from src.validation import validate_submission_scientific
from src.evaluation import (
evaluate_predictions_scientific,
generate_scientific_report,
compare_models_statistically,
)
from src.leaderboard import (
load_scientific_leaderboard,
add_model_to_scientific_leaderboard,
get_scientific_leaderboard_stats,
get_track_leaderboard,
prepare_track_leaderboard_display,
perform_fair_comparison,
export_scientific_leaderboard,
)
from src.plotting import (
create_scientific_leaderboard_plot,
create_language_pair_heatmap_scientific,
create_statistical_comparison_plot,
create_category_comparison_plot,
create_adequacy_analysis_plot,
create_cross_track_analysis_plot,
create_scientific_model_detail_plot,
)
from src.utils import (
sanitize_model_name,
get_all_language_pairs,
get_google_comparable_pairs,
get_track_language_pairs,
format_metric_value,
)
from config import *
# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None
test_set_stats = None
def initialize_scientific_data():
"""Initialize scientific test sets and leaderboard data."""
global public_test_set, complete_test_set, current_leaderboard, test_set_stats
try:
print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
# Load scientific test sets
print("πŸ“₯ Loading scientific test sets...")
public_test_set = get_public_test_set_scientific()
complete_test_set = get_complete_test_set_scientific()
# Load scientific leaderboard
print("πŸ† Loading scientific leaderboard...")
current_leaderboard = load_scientific_leaderboard()
# Validate test set integrity
print("πŸ” Validating test set integrity...")
test_set_stats = validate_test_set_integrity_scientific()
print(f"βœ… Scientific initialization complete!")
print(f" - Test set: {len(public_test_set):,} samples")
print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
print(
f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}"
)
print(f" - Current models: {len(current_leaderboard)}")
return True
except Exception as e:
print(f"❌ Scientific initialization failed: {e}")
traceback.print_exc()
return False
def download_scientific_test_set() -> Tuple[str, str]:
"""Create downloadable scientific test set and return file path and info."""
try:
global public_test_set
if public_test_set is None:
public_test_set = get_public_test_set_scientific()
# Create download file
download_path, stats = create_test_set_download_scientific()
# Create comprehensive info message
adequacy = stats.get("adequacy_assessment", "unknown")
        adequacy_emoji = {
            "excellent": "🟢",
            "good": "🟡",
            "fair": "🟠",
            "insufficient": "🔴",
            "unknown": "⚪",
        }.get(adequacy, "⚪")
        info_msg = f"""
## 📥 SALT Scientific Test Set Downloaded Successfully!
### 🔬 Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
### 📊 Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
- **Domains**: {', '.join(stats.get('domains', ['general']))}
### 🏁 Track Breakdown:
"""
        track_breakdown = stats.get("track_breakdown", {})
        for track_name, track_info in track_breakdown.items():
            status_emoji = (
                "✅" if track_info.get("statistical_adequacy", False) else "⚠️"
            )
            info_msg += f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
"""
        info_msg += f"""
### 📋 Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)
### 🔬 Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction` (see the sketch below)
3. **Optional**: Add a `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals
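For example, if `outputs` is a list of your model's translations aligned row-by-row with the downloaded file, a minimal pandas sketch (file names here are placeholders) is:
```python
import pandas as pd

test = pd.read_csv("salt_test_set.csv")  # the file downloaded above
test.assign(prediction=outputs)[["sample_id", "prediction"]].to_csv(
    "predictions.csv", index=False
)
```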
### 💡 Tips for Best Results:
- Ensure coverage of all language pairs for your chosen track
- Include confidence scores if available
- Provide a detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
"""
return download_path, info_msg
except Exception as e:
error_msg = f"❌ Error creating scientific test set download: {str(e)}"
return None, error_msg
def validate_scientific_submission(
file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
"""Validate uploaded prediction file with scientific rigor."""
try:
if file is None:
return "❌ Please upload a predictions file", None, "community"
if not model_name.strip():
return "❌ Please provide a model name", None, "community"
# Handle different file input types
if isinstance(file, bytes):
file_content = file
elif isinstance(file, str):
if os.path.exists(file):
with open(file, "rb") as f:
file_content = f.read()
else:
file_content = file.encode("utf-8")
elif hasattr(file, "name") and os.path.exists(file.name):
with open(file.name, "rb") as f:
file_content = f.read()
else:
return "❌ Could not read uploaded file", None, "community"
# Determine filename
filename = (
getattr(file, "name", None)
or getattr(file, "filename", None)
or "predictions.csv"
)
# Load test set if needed
global complete_test_set
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run enhanced scientific validation
validation_result = validate_submission_scientific(
file_content, filename, complete_test_set, model_name, author, description
)
detected_category = validation_result.get("category", "community")
if validation_result["valid"]:
return (
validation_result["report"],
validation_result["predictions"],
detected_category,
)
else:
return validation_result["report"], None, detected_category
except Exception as e:
return (
f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
None,
"community",
)
def evaluate_scientific_submission(
predictions_df: pd.DataFrame,
model_name: str,
author: str,
description: str,
detected_category: str,
validation_info: Dict,
) -> Tuple[str, pd.DataFrame, object, object]:
"""Evaluate validated predictions using scientific methodology."""
try:
if predictions_df is None:
return "❌ No valid predictions to evaluate", None, None, None
# Get complete test set with targets
global complete_test_set, current_leaderboard
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run scientific evaluation across all tracks
print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
evaluation_results = evaluate_predictions_scientific(
predictions_df, complete_test_set, detected_category
)
if any(
track_data.get("error")
for track_data in evaluation_results.get("tracks", {}).values()
):
errors = [
track_data["error"]
for track_data in evaluation_results["tracks"].values()
if track_data.get("error")
]
return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
# Add to scientific leaderboard
print("πŸ† Adding to scientific leaderboard...")
updated_leaderboard = add_model_to_scientific_leaderboard(
model_name=sanitize_model_name(model_name),
author=author or "Anonymous",
evaluation_results=evaluation_results,
model_category=detected_category,
description=description or "",
)
# Update global leaderboard
current_leaderboard = updated_leaderboard
# Generate scientific report
report = generate_scientific_report(evaluation_results, model_name)
# Create visualizations
summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
# Prepare display leaderboard (Google-comparable track by default)
google_leaderboard = get_track_leaderboard(
updated_leaderboard, "google_comparable"
)
display_leaderboard = prepare_track_leaderboard_display(
google_leaderboard, "google_comparable"
)
# Format success message with track-specific results
success_msg = f"""
## πŸŽ‰ Scientific Evaluation Complete!
### πŸ“Š Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}
### πŸ† Track Performance Summary:
"""
tracks = evaluation_results.get("tracks", {})
for track_name, track_data in tracks.items():
if not track_data.get("error"):
track_config = EVALUATION_TRACKS[track_name]
track_averages = track_data.get("track_averages", {})
summary = track_data.get("summary", {})
# Get rank in this track
track_leaderboard = get_track_leaderboard(
updated_leaderboard, track_name
)
if not track_leaderboard.empty:
model_row = track_leaderboard[
track_leaderboard["model_name"]
== sanitize_model_name(model_name)
]
rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
total_models = len(track_leaderboard)
else:
rank = "N/A"
total_models = 0
quality_score = track_averages.get("quality_score", 0)
bleu_score = track_averages.get("bleu", 0)
samples = summary.get("total_samples", 0)
success_msg += f"""
**🏁 {track_config['name']}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
"""
success_msg += f"""
### 🔬 Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
{report}
"""
return success_msg, display_leaderboard, summary_plot, cross_track_plot
except Exception as e:
error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
return error_msg, None, None, None
def refresh_track_leaderboard(
track: str,
search_query: str = "",
category_filter: str = "all",
min_adequacy: float = 0.0,
show_ci: bool = True,
) -> Tuple[pd.DataFrame, object, object, str]:
"""Refresh leaderboard for a specific track with filters."""
try:
global current_leaderboard
if current_leaderboard is None:
current_leaderboard = load_scientific_leaderboard()
# Get track-specific leaderboard
track_leaderboard = get_track_leaderboard(
current_leaderboard,
track,
category_filter=category_filter,
min_adequacy=min_adequacy,
)
# Apply search filter
if search_query:
query_lower = search_query.lower()
mask = track_leaderboard["model_name"].str.lower().str.contains(
query_lower, na=False
) | track_leaderboard["author"].str.lower().str.contains(
query_lower, na=False
)
track_leaderboard = track_leaderboard[mask]
# Prepare for display
display_df = prepare_track_leaderboard_display(track_leaderboard, track)
# Create plots
ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
# Get track statistics
track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
track_config = EVALUATION_TRACKS[track]
stats_text = f"""
### 📊 {track_config['name']} Statistics
- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
### 🔬 Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""
return display_df, ranking_plot, comparison_plot, stats_text
except Exception as e:
error_msg = f"Error loading {track} leaderboard: {str(e)}"
empty_df = pd.DataFrame()
return empty_df, None, None, error_msg
def get_scientific_model_details(
model_name: str, track: str
) -> Tuple[str, object, object]:
"""Get detailed scientific analysis for a specific model."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None, None
# Find model
model_row = current_leaderboard[current_leaderboard["model_name"] == model_name]
if model_row.empty:
return f"Model '{model_name}' not found", None, None
model_info = model_row.iloc[0]
# Parse detailed metrics for the requested track
try:
detailed_results = json.loads(model_info[f"detailed_{track}"])
        except (KeyError, TypeError, json.JSONDecodeError):
            detailed_results = {}
# Create detailed plots
detail_plot = create_scientific_model_detail_plot(
detailed_results, model_name, track
)
# Create language pair heatmap
heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
# Format model details with scientific information
track_config = EVALUATION_TRACKS[track]
category_info = MODEL_CATEGORIES.get(model_info["model_category"], {})
# Extract track-specific metrics
quality_col = f"{track}_quality"
bleu_col = f"{track}_bleu"
chrf_col = f"{track}_chrf"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
samples_col = f"{track}_samples"
pairs_col = f"{track}_pairs"
adequate_col = f"{track}_adequate"
details_text = f"""
## πŸ”¬ Scientific Model Analysis: {model_name}
### πŸ“‹ Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {model_info['submission_date'][:10]}
- **Description**: {model_info['description'] or 'No description provided'}
### 🏁 {track_config['name']} Performance:
- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}
### πŸ“Š Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'βœ… Yes' if model_info.get(adequate_col, False) else '❌ No'}
### πŸ”¬ Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
### πŸ“ˆ Cross-Track Performance:
"""
# Add other track performances for comparison
for other_track in EVALUATION_TRACKS.keys():
if other_track != track:
other_quality_col = f"{other_track}_quality"
other_adequate_col = f"{other_track}_adequate"
if model_info.get(other_adequate_col, False):
other_quality = model_info.get(other_quality_col, 0)
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
else:
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
details_text += f"""
### 💡 Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
"""
return details_text, detail_plot, heatmap_plot
except Exception as e:
error_msg = f"Error getting model details: {str(e)}"
return error_msg, None, None
def perform_model_comparison(
model_names: List[str], track: str, comparison_type: str = "statistical"
) -> Tuple[str, object]:
"""Perform scientific comparison between selected models."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None
if len(model_names) < 2:
return "Please select at least 2 models for comparison", None
# Get models
models = current_leaderboard[
current_leaderboard["model_name"].isin(model_names)
]
if len(models) < 2:
return "Selected models not found in leaderboard", None
# Perform fair comparison
comparison_result = perform_fair_comparison(current_leaderboard, model_names)
if comparison_result.get("error"):
return f"Comparison error: {comparison_result['error']}", None
# Create comparison visualization
if comparison_type == "statistical":
comparison_plot = create_statistical_comparison_plot(models, track)
else:
comparison_plot = create_category_comparison_plot(models, track)
# Format comparison report
track_config = EVALUATION_TRACKS[track]
comparison_text = f"""
## πŸ”¬ Scientific Model Comparison - {track_config['name']}
### πŸ“Š Models Compared:
"""
quality_col = f"{track}_quality"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
# Sort models by performance
models_sorted = models.sort_values(quality_col, ascending=False)
for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
category_info = MODEL_CATEGORIES.get(model["model_category"], {})
comparison_text += f"""
**#{i}. {model['model_name']}**
- Category: {category_info.get('name', 'Unknown')}
- Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
- Author: {model['author']}
"""
# Add statistical analysis
track_comparison = comparison_result.get("track_comparisons", {}).get(track, {})
if track_comparison:
comparison_text += f"""
### πŸ”¬ Statistical Analysis:
- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
- **Confidence intervals available**: Yes (95% level)
- **Fair comparison possible**: {'βœ… Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
"""
# Check for statistical significance (simplified)
quality_scores = list(track_comparison.get("quality_scores", {}).values())
if len(quality_scores) >= 2:
score_range = max(quality_scores) - min(quality_scores)
if score_range > 0.05: # 5% difference threshold
comparison_text += (
"- **Performance differences**: Potentially significant\n"
)
else:
comparison_text += "- **Performance differences**: Minimal\n"
# Add recommendations
recommendations = comparison_result.get("recommendations", [])
if recommendations:
            comparison_text += "\n### 💡 Recommendations:\n"
for rec in recommendations:
comparison_text += f"- {rec}\n"
return comparison_text, comparison_plot
except Exception as e:
error_msg = f"Error performing comparison: {str(e)}"
return error_msg, None

# Initialize data on startup (the startup banner was already printed above)
initialization_success = initialize_scientific_data()
# Create Gradio interface with scientific design
with gr.Blocks(
title=UI_CONFIG["title"],
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1600px !important;
margin: 0 auto;
}
.scientific-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.track-tab {
border-radius: 8px;
margin: 0.5rem;
padding: 1rem;
border: 2px solid transparent;
}
.track-tab.google-comparable {
border-color: #1f77b4;
background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
}
.track-tab.ug40-complete {
border-color: #ff7f0e;
background: linear-gradient(45deg, #fff7ed, #fed7aa);
}
.track-tab.language-pair-matrix {
border-color: #2ca02c;
background: linear-gradient(45deg, #f0fdf4, #dcfce7);
}
.metric-box {
background: #f8fafc;
padding: 1rem;
border-radius: 8px;
margin: 0.5rem 0;
border-left: 4px solid #3b82f6;
}
.scientific-note {
background: #fef3c7;
border: 1px solid #f59e0b;
border-radius: 8px;
padding: 1rem;
margin: 1rem 0;
}
.adequacy-excellent { border-left-color: #22c55e; }
.adequacy-good { border-left-color: #eab308; }
.adequacy-fair { border-left-color: #f97316; }
.adequacy-insufficient { border-left-color: #ef4444; }
""",
) as demo:
# Scientific Header
gr.HTML(
f"""
<div class="scientific-header">
<h1>🏆 SALT Translation Leaderboard - Scientific Edition</h1>
<p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
<p>Three-tier evaluation tracks • 95% confidence intervals • Research-grade analysis</p>
<p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
</div>
"""
)
# Status indicator
if initialization_success:
        status_msg = "✅ Scientific system initialized successfully"
adequacy_info = test_set_stats.get("scientific_adequacy", {}).get(
"overall_adequacy", "unknown"
)
status_msg += f" | Test set adequacy: {adequacy_info.title()}"
else:
status_msg = "❌ System initialization failed - some features may not work"
gr.Markdown(f"**System Status**: {status_msg}")
# Add scientific overview
gr.Markdown(
"""
## 🔬 Scientific Evaluation Framework
This leaderboard implements rigorous scientific methodology for translation model evaluation:
- **Three Evaluation Tracks**: Fair comparison across different model capabilities
- **Statistical Significance**: 95% confidence intervals and effect size analysis
- **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
- **Cross-Track Consistency**: Validate model performance across language sets
"""
)
with gr.Tabs():
# Tab 1: Download Test Set
        with gr.Tab("📥 Download Test Set", id="download"):
gr.Markdown(
"""
## 📋 Get the SALT Scientific Test Set
Download our scientifically designed test set with stratified sampling and statistical weighting.
"""
)
with gr.Row():
download_btn = gr.Button(
"πŸ“₯ Download Scientific Test Set", variant="primary", size="lg"
)
with gr.Row():
with gr.Column():
                    download_file = gr.File(label="📂 Test Set File", interactive=False)
with gr.Column():
download_info = gr.Markdown(label="ℹ️ Test Set Information")
# Tab 2: Submit Predictions
        with gr.Tab("🚀 Submit Predictions", id="submit"):
gr.Markdown(
"""
## 🎯 Submit Your Model's Predictions for Scientific Evaluation
Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
"""
)
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📝 Model Information")
                    model_name_input = gr.Textbox(
                        label="🤖 Model Name",
                        placeholder="e.g., MyTranslator-v2.0",
                        info="Unique name for your model",
                    )
                    author_input = gr.Textbox(
                        label="👤 Author/Organization",
                        placeholder="Your name or organization",
                        value="Anonymous",
                    )
                    description_input = gr.Textbox(
                        label="📄 Model Description",
                        placeholder="Architecture, training data, special features...",
                        lines=4,
                        info="Detailed description helps with proper categorization",
                    )
                    gr.Markdown("### 📤 Upload Predictions")
                    predictions_file = gr.File(
                        label="📂 Predictions File",
                        file_types=[".csv", ".tsv", ".json"],
                    )
                    validate_btn = gr.Button(
                        "✅ Validate Submission", variant="secondary"
                    )
                    submit_btn = gr.Button(
                        "🚀 Submit for Scientific Evaluation",
                        variant="primary",
                        interactive=False,
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Validation Results")
                    validation_output = gr.Markdown()
            # Results section
            gr.Markdown("### 🏆 Scientific Evaluation Results")
            with gr.Row():
                evaluation_output = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    submission_plot = gr.Plot(label="📈 Submission Analysis")
                with gr.Column():
                    cross_track_plot = gr.Plot(label="🔄 Cross-Track Analysis")
            with gr.Row():
                results_table = gr.Dataframe(
                    label="📊 Updated Leaderboard (Google-Comparable Track)",
                    interactive=False,
                )
# Tab 3: Google-Comparable Track
with gr.Tab(
"πŸ€– Google-Comparable Track",
id="google_track",
elem_classes=["track-tab", "google-comparable"],
):
gr.Markdown(
f"""
## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
**Fair comparison with commercial translation systems**
This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
enabling direct comparison with commercial baselines.
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Purpose**: Commercial system comparison and baseline establishment
- **Statistical Power**: High (optimized sample sizes)
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    google_search = gr.Textbox(
                        label="🔍 Search Models",
                        placeholder="Search by model name, author...",
                    )
                with gr.Column(scale=1):
                    google_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all",
                    )
                with gr.Column(scale=1):
                    google_adequacy = gr.Slider(
                        label="📊 Min Adequacy",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                    )
                with gr.Column(scale=1):
                    google_refresh = gr.Button("🔄 Refresh", variant="secondary")
            with gr.Row():
                google_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    google_ranking_plot = gr.Plot(label="🏆 Google-Comparable Rankings")
                with gr.Column():
                    google_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
            with gr.Row():
                google_leaderboard = gr.Dataframe(
                    label="📈 Google-Comparable Leaderboard", interactive=False
                )
# Tab 4: UG40-Complete Track
with gr.Tab(
"🌍 UG40-Complete Track",
id="ug40_track",
elem_classes=["track-tab", "ug40-complete"],
):
gr.Markdown(
f"""
## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
**Comprehensive evaluation across all Ugandan languages**
This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
providing the most comprehensive assessment of Ugandan language translation capabilities.
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Coverage**: Complete linguistic landscape of Uganda
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    ug40_search = gr.Textbox(
                        label="🔍 Search Models",
                        placeholder="Search by model name, author...",
                    )
                with gr.Column(scale=1):
                    ug40_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all",
                    )
                with gr.Column(scale=1):
                    ug40_adequacy = gr.Slider(
                        label="📊 Min Adequacy",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                    )
                with gr.Column(scale=1):
                    ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")
            with gr.Row():
                ug40_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    ug40_ranking_plot = gr.Plot(label="🏆 UG40-Complete Rankings")
                with gr.Column():
                    ug40_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
            with gr.Row():
                ug40_leaderboard = gr.Dataframe(
                    label="📈 UG40-Complete Leaderboard", interactive=False
                )
# Tab 5: Language-Pair Matrix
with gr.Tab(
"πŸ“Š Language-Pair Matrix",
id="matrix_track",
elem_classes=["track-tab", "language-pair-matrix"],
):
gr.Markdown(
f"""
## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
**Detailed language pair analysis with statistical significance**
This view provides granular analysis of model performance across individual language pairs
with statistical significance testing and effect size analysis.
- **Resolution**: Individual language pair performance
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing available
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    matrix_search = gr.Textbox(
                        label="🔍 Search Models",
                        placeholder="Search by model name, author...",
                    )
                with gr.Column(scale=1):
                    matrix_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all",
                    )
                with gr.Column(scale=1):
                    matrix_adequacy = gr.Slider(
                        label="📊 Min Adequacy",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                    )
                with gr.Column(scale=1):
                    matrix_refresh = gr.Button("🔄 Refresh", variant="secondary")
            with gr.Row():
                matrix_stats = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    matrix_ranking_plot = gr.Plot(
                        label="🏆 Language-Pair Matrix Rankings"
                    )
                with gr.Column():
                    matrix_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
            with gr.Row():
                matrix_leaderboard = gr.Dataframe(
                    label="📈 Language-Pair Matrix Leaderboard", interactive=False
                )
# Tab 6: Model Analysis
        with gr.Tab("🔍 Scientific Model Analysis", id="analysis"):
gr.Markdown(
"""
## 🔬 Detailed Scientific Model Analysis
Comprehensive analysis of individual models with statistical confidence intervals,
cross-track performance, and detailed language pair breakdowns.
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    model_select = gr.Dropdown(
                        label="🤖 Select Model",
                        choices=[],
                        value=None,
                        info="Choose a model for detailed scientific analysis",
                    )
                with gr.Column(scale=1):
                    track_select = gr.Dropdown(
                        label="🏁 Analysis Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable",
                        info="Track for detailed analysis",
                    )
                with gr.Column(scale=1):
                    analyze_btn = gr.Button("🔍 Analyze", variant="primary")
            with gr.Row():
                model_details = gr.Markdown()
            with gr.Row():
                with gr.Column():
                    model_analysis_plot = gr.Plot(
                        label="📊 Detailed Performance Analysis"
                    )
                with gr.Column():
                    model_heatmap_plot = gr.Plot(label="🗺️ Language Pair Heatmap")
# Tab 7: Model Comparison
        with gr.Tab("⚖️ Scientific Model Comparison", id="comparison"):
gr.Markdown(
"""
## 🔬 Scientific Model Comparison
Compare multiple models with statistical significance testing and fair comparison analysis.
Only models evaluated on the same language pairs are compared for scientific validity.
"""
)
            with gr.Row():
                with gr.Column(scale=2):
                    comparison_models = gr.CheckboxGroup(
                        label="🤖 Select Models to Compare",
                        choices=[],
                        value=[],
                        info="Select 2-6 models for comparison",
                    )
                with gr.Column(scale=1):
                    comparison_track = gr.Dropdown(
                        label="🏁 Comparison Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable",
                    )
                    comparison_type = gr.Radio(
                        label="📊 Comparison Type",
                        choices=["statistical", "category"],
                        value="statistical",
                    )
                    compare_btn = gr.Button("⚖️ Compare Models", variant="primary")
            with gr.Row():
                comparison_output = gr.Markdown()
            with gr.Row():
                comparison_plot = gr.Plot(label="📊 Model Comparison Analysis")
# Tab 8: Documentation
        with gr.Tab("📚 Scientific Documentation", id="docs"):
gr.Markdown(
f"""
# 📖 SALT Translation Leaderboard - Scientific Edition Documentation
## 🎯 Overview
The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
for translation models on Ugandan languages, designed for research publication and scientific analysis.
## 🔬 Scientific Methodology
### Three-Tier Evaluation System
**1. 🤖 Google-Comparable Track**
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Pairs**: {len(get_google_comparable_pairs())} language pairs
- **Purpose**: Fair comparison with commercial translation systems
- **Statistical Power**: High (≥200 samples per pair recommended)
**2. 🌍 UG40-Complete Track**
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Pairs**: {len(get_all_language_pairs())} language pairs
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Statistical Power**: Moderate (≥100 samples per pair recommended)
**3. 📊 Language-Pair Matrix**
- **Resolution**: Individual language pair analysis
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing with multiple comparison correction
### Statistical Rigor
- **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples; see the sketch below)
- **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
- **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
- **Statistical Power**: Estimated based on sample sizes and effect sizes
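For intuition, the percentile bootstrap behind these intervals can be sketched as follows; the resample count and seed here are illustrative assumptions, not the leaderboard's exact implementation:
```python
import numpy as np

def bootstrap_ci(scores, n_resamples=10_000, seed=0):
    # Resample per-sample metric values with replacement and take the
    # 2.5th/97.5th percentiles of the resampled means as a 95% CI.
    scores = np.asarray(scores, dtype=float)
    rng = np.random.default_rng(seed)
    idx = rng.integers(0, len(scores), size=(n_resamples, len(scores)))
    boot_means = scores[idx].mean(axis=1)
    return np.percentile(boot_means, [2.5, 97.5])
```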
### Model Categories
Models are automatically categorized for fair comparison:
- **🏢 Commercial**: Production translation systems (Google Translate, Azure, etc.)
- **🔬 Research**: Academic and research institution models (NLLB, M2M-100, etc.)
- **📊 Baseline**: Simple baseline and reference models
- **👥 Community**: User-submitted models and fine-tuned variants
## 📊 Evaluation Metrics
### Primary Metrics
- **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
- **BLEU**: Bilingual Evaluation Understudy (0-100)
- **ChrF**: Character-level F-score (0-1)
### Secondary Metrics
- **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
- **CER/WER**: Character/Word Error Rate (lower is better)
- **Length Ratio**: Prediction/reference length ratio
All metrics include 95% confidence intervals for statistical reliability.
## 🔄 Submission Process
### Step 1: Download Scientific Test Set
1. Click "Download Scientific Test Set" in the first tab
2. Review test set adequacy and track breakdown
3. Save the enhanced test set with statistical weights
### Step 2: Generate Predictions
1. Load the test set in your evaluation pipeline
2. For each row, translate `source_text` from `source_language` to `target_language`
3. Save results as CSV with columns: `sample_id`, `prediction` (a minimal sketch follows this list)
4. Optional: Add `category` column for automatic classification
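A minimal sketch of this loop, assuming pandas and a `translate()` function of your own (both are placeholders, not part of the leaderboard API):
```python
import pandas as pd

test = pd.read_csv("salt_test_set.csv")
# translate() stands in for your model's inference call
test["prediction"] = [
    translate(row.source_text, row.source_language, row.target_language)
    for row in test.itertuples()
]
test[["sample_id", "prediction"]].to_csv("predictions.csv", index=False)
```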
### Step 3: Submit & Evaluate
1. Fill in detailed model information (improves categorization)
2. Upload your predictions file
3. Review validation report with track-specific adequacy assessment
4. Submit for scientific evaluation across all tracks
## 📋 Enhanced File Formats
### Scientific Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
```
### Predictions Format
```csv
sample_id,prediction,category
salt_000001,"Amakuru ensi","community"
salt_000002,"Ibino nining?","community"
salt_000003,"Ejok nanu","community"
```
## 🏆 Scientific Leaderboard Features
### Fair Comparison
- Models only compared within the same category and track
- Statistical significance testing prevents misleading rankings
- Confidence intervals show measurement uncertainty
### Cross-Track Analysis
- Consistency analysis across evaluation tracks
- Identification of model strengths and weaknesses
- Language-specific performance patterns
### Publication Quality
- All visualizations include error bars and statistical annotations
- Comprehensive methodology documentation
- Reproducible evaluation pipeline
## 🔬 Statistical Interpretation Guide
### Confidence Intervals
- **Non-overlapping CIs**: Likely significant difference
- **Overlapping CIs**: May or may not be significant (requires formal testing)
- **Wide CIs**: High uncertainty (need more data)
### Effect Sizes
- **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
- **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
- **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
- **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
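These thresholds refer to the standard pooled-standard-deviation form of Cohen's d, sketched here (the leaderboard's exact implementation may differ):
```python
import numpy as np

def cohens_d(a, b):
    # Effect size between two arrays of per-sample scores.
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    pooled_var = ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) / (
        len(a) + len(b) - 2
    )
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)
```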
### Statistical Adequacy
- **Excellent**: High statistical power (>0.8) for all comparisons
- **Good**: Adequate power for most comparisons
- **Fair**: Limited power, interpret with caution
- **Insufficient**: Results not reliable for scientific conclusions
## 🤝 Contributing to Science
This leaderboard is designed for the research community. When using results:
1. **Always report confidence intervals** along with point estimates
2. **Acknowledge statistical adequacy** when interpreting results
3. **Use the appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
4. **Consider effect sizes**, not just statistical significance
## 📄 Citation
If you use this leaderboard in your research, please cite:
```bibtex
@misc{{salt_leaderboard_scientific_2024,
  title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
  author={{Sunbird AI}},
  year={{2024}},
  url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
  note={{Three-tier evaluation system with statistical significance testing}}
}}
```
## 🔗 Related Resources
- **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
- **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
- **Statistical Methodology**: See our technical paper on rigorous MT evaluation
- **Open Source Code**: Available on GitHub for reproducibility
---
*For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
"""
)
# Event handlers with enhanced scientific functionality
predictions_validated = gr.State(value=None)
validation_info_state = gr.State(value=None)
detected_category_state = gr.State(value="community")
# Download test set
download_btn.click(
fn=download_scientific_test_set, outputs=[download_file, download_info]
)
# Validate predictions
def handle_scientific_validation(file, model_name, author, description):
report, predictions, category = validate_scientific_submission(
file, model_name, author, description
)
# Enable button if predictions are available and format is valid
# This allows "can be evaluated with limitations" cases
can_evaluate = predictions is not None
# Additional check: ensure we have some basic validity
if can_evaluate and "❌ **Final Verdict**: Please address issues" in report:
can_evaluate = False
return (
report,
predictions,
{"category": category, "validation_passed": can_evaluate},
category,
gr.update(interactive=can_evaluate),
)
validate_btn.click(
fn=handle_scientific_validation,
inputs=[predictions_file, model_name_input, author_input, description_input],
outputs=[
validation_output,
predictions_validated,
validation_info_state,
detected_category_state,
submit_btn,
],
)
# Submit for evaluation
def handle_scientific_submission(
predictions, model_name, author, description, category, validation_info
):
if predictions is None:
return "❌ Please validate your submission first", None, None, None
return evaluate_scientific_submission(
predictions, model_name, author, description, category, validation_info
)
submit_btn.click(
fn=handle_scientific_submission,
inputs=[
predictions_validated,
model_name_input,
author_input,
description_input,
detected_category_state,
validation_info_state,
],
outputs=[evaluation_output, results_table, submission_plot, cross_track_plot],
)
# Track leaderboard refresh functions
def refresh_google_track(*args):
return refresh_track_leaderboard("google_comparable", *args)
def refresh_ug40_track(*args):
return refresh_track_leaderboard("ug40_complete", *args)
def refresh_matrix_track(*args):
return refresh_track_leaderboard("language_pair_matrix", *args)
# Google-Comparable Track
google_refresh.click(
fn=refresh_google_track,
inputs=[google_search, google_category, google_adequacy],
outputs=[
google_leaderboard,
google_ranking_plot,
google_comparison_plot,
google_stats,
],
)
# UG40-Complete Track
ug40_refresh.click(
fn=refresh_ug40_track,
inputs=[ug40_search, ug40_category, ug40_adequacy],
outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats],
)
# Language-Pair Matrix Track
matrix_refresh.click(
fn=refresh_matrix_track,
inputs=[matrix_search, matrix_category, matrix_adequacy],
outputs=[
matrix_leaderboard,
matrix_ranking_plot,
matrix_comparison_plot,
matrix_stats,
],
)
# Model analysis
analyze_btn.click(
fn=get_scientific_model_details,
inputs=[model_select, track_select],
outputs=[model_details, model_analysis_plot, model_heatmap_plot],
)
# Model comparison
compare_btn.click(
fn=perform_model_comparison,
inputs=[comparison_models, comparison_track, comparison_type],
outputs=[comparison_output, comparison_plot],
)
# Load initial data and update dropdowns
def load_initial_data():
# Load initial Google track data
google_data = refresh_google_track("", "all", 0.0)
# Update dropdown choices
if current_leaderboard is not None and not current_leaderboard.empty:
model_choices = current_leaderboard["model_name"].tolist()
else:
model_choices = []
return (
google_data[0], # google_leaderboard
google_data[1], # google_ranking_plot
google_data[2], # google_comparison_plot
google_data[3], # google_stats
gr.Dropdown(choices=model_choices), # model_select
gr.CheckboxGroup(choices=model_choices), # comparison_models
)
demo.load(
fn=load_initial_data,
outputs=[
google_leaderboard,
google_ranking_plot,
google_comparison_plot,
google_stats,
model_select,
comparison_models,
],
)
# Launch the scientific application
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)