# app.py
import subprocess
import sys
import os
from pathlib import Path
def setup_salt():
"""Clone and setup SALT library like in Colab."""
try:
# Check if salt is already available
import salt.dataset
print("β
SALT library already available")
return True
except ImportError:
pass
print("π₯ Setting up SALT library...")
try:
# Clone SALT repo if not exists
salt_dir = Path("salt")
if not salt_dir.exists():
print("π Cloning SALT repository...")
subprocess.check_call([
"git", "clone", "https://github.com/sunbirdai/salt.git"
])
else:
print("π SALT repository already exists")
# Install SALT requirements
salt_requirements = salt_dir / "requirements.txt"
if salt_requirements.exists():
print("π¦ Installing SALT requirements...")
subprocess.check_call([
sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
])
# Add SALT directory to Python path
salt_path = str(salt_dir.absolute())
if salt_path not in sys.path:
sys.path.insert(0, salt_path)
print(f"π Added {salt_path} to Python path")
# Test import
import salt.dataset
print("β
SALT library setup completed successfully")
return True
except Exception as e:
print(f"β Failed to setup SALT: {e}")
return False
# Setup SALT on startup
print("π Starting SALT Translation Leaderboard - Scientific Edition...")
if not setup_salt():
print("β Cannot continue without SALT library")
print("π‘ Please check that git is available and GitHub is accessible")
sys.exit(1)
import gradio as gr
import pandas as pd
import json
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List
# Import our enhanced modules
from src.test_set import (
get_public_test_set_scientific,
get_complete_test_set_scientific,
create_test_set_download_scientific,
validate_test_set_integrity_scientific,
get_track_test_set
)
from src.validation import validate_submission_scientific
from src.evaluation import (
evaluate_predictions_scientific,
generate_scientific_report,
compare_models_statistically
)
from src.leaderboard import (
load_scientific_leaderboard,
add_model_to_scientific_leaderboard,
get_scientific_leaderboard_stats,
get_track_leaderboard,
prepare_track_leaderboard_display,
perform_fair_comparison,
export_scientific_leaderboard
)
from src.plotting import (
create_scientific_leaderboard_plot,
create_language_pair_heatmap_scientific,
create_statistical_comparison_plot,
create_category_comparison_plot,
create_adequacy_analysis_plot,
create_cross_track_analysis_plot,
create_scientific_model_detail_plot
)
from src.utils import (
sanitize_model_name,
get_all_language_pairs,
get_google_comparable_pairs,
get_track_language_pairs,
format_metric_value
)
from config import *
# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None
test_set_stats = None
def initialize_scientific_data():
"""Initialize scientific test sets and leaderboard data."""
global public_test_set, complete_test_set, current_leaderboard, test_set_stats
try:
print("π¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
# Load scientific test sets
print("π₯ Loading scientific test sets...")
public_test_set = get_public_test_set_scientific()
complete_test_set = get_complete_test_set_scientific()
# Load scientific leaderboard
print("π Loading scientific leaderboard...")
current_leaderboard = load_scientific_leaderboard()
# Validate test set integrity
print("π Validating test set integrity...")
test_set_stats = validate_test_set_integrity_scientific()
print(f"β
Scientific initialization complete!")
print(f" - Test set: {len(public_test_set):,} samples")
print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
print(f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
print(f" - Current models: {len(current_leaderboard)}")
return True
except Exception as e:
print(f"β Scientific initialization failed: {e}")
traceback.print_exc()
return False
def download_scientific_test_set() -> Tuple[str, str]:
"""Create downloadable scientific test set and return file path and info."""
try:
global public_test_set
if public_test_set is None:
public_test_set = get_public_test_set_scientific()
# Create download file
download_path, stats = create_test_set_download_scientific()
# Create comprehensive info message
adequacy = stats.get('adequacy_assessment', 'unknown')
adequacy_emoji = {
'excellent': '🟢',
'good': '🟡',
'fair': '🟠',
'insufficient': '🔴',
'unknown': '⚪'
}.get(adequacy, '⚪')
info_msg = f"""
## π₯ SALT Scientific Test Set Downloaded Successfully!
### π¬ Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
### π Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
- **Domains**: {', '.join(stats.get('domains', ['general']))}
### π Track Breakdown:
"""
track_breakdown = stats.get('track_breakdown', {})
for track_name, track_info in track_breakdown.items():
status_emoji = '✅' if track_info.get('statistical_adequacy', False) else '⚠️'
info_msg += f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
"""
info_msg += f"""
### π Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)
### π¬ Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Optional**: Add `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals
### π‘ Tips for Best Results:
- Ensure coverage of all language pairs for chosen track
- Include confidence scores if available
- Provide detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
"""
return download_path, info_msg
except Exception as e:
error_msg = f"β Error creating scientific test set download: {str(e)}"
return None, error_msg
def validate_scientific_submission(
file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
"""Validate uploaded prediction file with scientific rigor."""
try:
if file is None:
return "β Please upload a predictions file", None, "community"
if not model_name.strip():
return "β Please provide a model name", None, "community"
# Handle different file input types
if isinstance(file, bytes):
file_content = file
elif isinstance(file, str):
if os.path.exists(file):
with open(file, "rb") as f:
file_content = f.read()
else:
file_content = file.encode("utf-8")
elif hasattr(file, "name") and os.path.exists(file.name):
with open(file.name, "rb") as f:
file_content = f.read()
else:
return "β Could not read uploaded file", None, "community"
# Determine filename
filename = (
getattr(file, "name", None)
or getattr(file, "filename", None)
or "predictions.csv"
)
# Load test set if needed
global complete_test_set
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run enhanced scientific validation
validation_result = validate_submission_scientific(
file_content, filename, complete_test_set, model_name, author, description
)
detected_category = validation_result.get("category", "community")
# Return predictions if evaluation is possible (even with limitations)
if validation_result.get("can_evaluate", False):
return validation_result["report"], validation_result["predictions"], detected_category
else:
return validation_result["report"], None, detected_category
except Exception as e:
return (
f"β Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
None,
"community"
)
def evaluate_scientific_submission(
predictions_df: pd.DataFrame,
model_name: str,
author: str,
description: str,
detected_category: str,
validation_info: Dict,
) -> Tuple[str, pd.DataFrame, object, object]:
"""Evaluate validated predictions using scientific methodology."""
try:
if predictions_df is None:
return "β No valid predictions to evaluate", None, None, None
# Get complete test set with targets
global complete_test_set, current_leaderboard
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run scientific evaluation across all tracks
print(f"π¬ Starting scientific evaluation for {model_name}...")
evaluation_results = evaluate_predictions_scientific(
predictions_df, complete_test_set, detected_category
)
if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
return f"β Evaluation errors: {'; '.join(errors)}", None, None, None
# Add to scientific leaderboard
print("π Adding to scientific leaderboard...")
updated_leaderboard = add_model_to_scientific_leaderboard(
model_name=sanitize_model_name(model_name),
author=author or "Anonymous",
evaluation_results=evaluation_results,
model_category=detected_category,
description=description or ""
)
# Update global leaderboard
current_leaderboard = updated_leaderboard
# Generate scientific report
report = generate_scientific_report(evaluation_results, model_name)
# Create visualizations
summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
# Prepare display leaderboard (Google-comparable track by default)
google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
# Format success message with track-specific results
success_msg = f"""
## π Scientific Evaluation Complete!
### π Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}
### π Track Performance Summary:
"""
tracks = evaluation_results.get('tracks', {})
for track_name, track_data in tracks.items():
if not track_data.get('error'):
track_config = EVALUATION_TRACKS[track_name]
track_averages = track_data.get('track_averages', {})
summary = track_data.get('summary', {})
# Get rank in this track
track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
if not track_leaderboard.empty:
model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
total_models = len(track_leaderboard)
else:
rank = "N/A"
total_models = 0
quality_score = track_averages.get('quality_score', 0)
bleu_score = track_averages.get('bleu', 0)
samples = summary.get('total_samples', 0)
success_msg += f"""
**π {track_config['name']}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
"""
success_msg += f"""
### π¬ Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
{report}
"""
return success_msg, display_leaderboard, summary_plot, cross_track_plot
except Exception as e:
error_msg = f"β Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
return error_msg, None, None, None
def refresh_track_leaderboard(
track: str,
search_query: str = "",
category_filter: str = "all",
min_adequacy: float = 0.0,
show_ci: bool = True
) -> Tuple[pd.DataFrame, object, object, str]:
"""Refresh leaderboard for a specific track with filters."""
try:
global current_leaderboard
if current_leaderboard is None:
current_leaderboard = load_scientific_leaderboard()
# Get track-specific leaderboard
track_leaderboard = get_track_leaderboard(
current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
)
# Apply search filter
if search_query:
query_lower = search_query.lower()
mask = (
track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
)
track_leaderboard = track_leaderboard[mask]
# Prepare for display
display_df = prepare_track_leaderboard_display(track_leaderboard, track)
# Create plots
ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
# Get track statistics
track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
track_config = EVALUATION_TRACKS[track]
stats_text = f"""
### π {track_config['name']} Statistics
- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
### π¬ Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""
return display_df, ranking_plot, comparison_plot, stats_text
except Exception as e:
error_msg = f"Error loading {track} leaderboard: {str(e)}"
empty_df = pd.DataFrame()
return empty_df, None, None, error_msg
def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
"""Get detailed scientific analysis for a specific model."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None, None
# Find model
model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
if model_row.empty:
return f"Model '{model_name}' not found", None, None
model_info = model_row.iloc[0]
# Parse detailed metrics for the requested track
try:
detailed_results = json.loads(model_info[f'detailed_{track}'])
except Exception:
detailed_results = {}
# Create detailed plots
detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
# Create language pair heatmap
heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
# Format model details with scientific information
track_config = EVALUATION_TRACKS[track]
category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
# Extract track-specific metrics
quality_col = f"{track}_quality"
bleu_col = f"{track}_bleu"
chrf_col = f"{track}_chrf"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
samples_col = f"{track}_samples"
pairs_col = f"{track}_pairs"
adequate_col = f"{track}_adequate"
details_text = f"""
## π¬ Scientific Model Analysis: {model_name}
### π Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {model_info['submission_date'][:10]}
- **Description**: {model_info['description'] or 'No description provided'}
### π {track_config['name']} Performance:
- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}
### π Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'✅ Yes' if model_info.get(adequate_col, False) else '❌ No'}
### π¬ Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
### π Cross-Track Performance:
"""
# Add other track performances for comparison
for other_track in EVALUATION_TRACKS.keys():
if other_track != track:
other_quality_col = f"{other_track}_quality"
other_adequate_col = f"{other_track}_adequate"
if model_info.get(other_adequate_col, False):
other_quality = model_info.get(other_quality_col, 0)
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
else:
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
details_text += f"""
### π‘ Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
"""
return details_text, detail_plot, heatmap_plot
except Exception as e:
error_msg = f"Error getting model details: {str(e)}"
return error_msg, None, None
def perform_model_comparison(
model_names: List[str], track: str, comparison_type: str = "statistical"
) -> Tuple[str, object]:
"""Perform scientific comparison between selected models."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None
if len(model_names) < 2:
return "Please select at least 2 models for comparison", None
# Get models
models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
if len(models) < 2:
return "Selected models not found in leaderboard", None
# Perform fair comparison
comparison_result = perform_fair_comparison(current_leaderboard, model_names)
if comparison_result.get('error'):
return f"Comparison error: {comparison_result['error']}", None
# Create comparison visualization
if comparison_type == "statistical":
comparison_plot = create_statistical_comparison_plot(models, track)
else:
comparison_plot = create_category_comparison_plot(models, track)
# Format comparison report
track_config = EVALUATION_TRACKS[track]
comparison_text = f"""
## π¬ Scientific Model Comparison - {track_config['name']}
### π Models Compared:
"""
quality_col = f"{track}_quality"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
# Sort models by performance
models_sorted = models.sort_values(quality_col, ascending=False)
for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
category_info = MODEL_CATEGORIES.get(model['model_category'], {})
comparison_text += f"""
**#{i}. {model['model_name']}**
- Category: {category_info.get('name', 'Unknown')}
- Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
- Author: {model['author']}
"""
# Add statistical analysis
track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
if track_comparison:
comparison_text += f"""
### π¬ Statistical Analysis:
- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
- **Confidence intervals available**: Yes (95% level)
- **Fair comparison possible**: {'✅ Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
"""
# Check for statistical significance (simplified)
quality_scores = list(track_comparison.get('quality_scores', {}).values())
if len(quality_scores) >= 2:
score_range = max(quality_scores) - min(quality_scores)
if score_range > 0.05: # 5% difference threshold
comparison_text += "- **Performance differences**: Potentially significant\n"
else:
comparison_text += "- **Performance differences**: Minimal\n"
# Add recommendations
recommendations = comparison_result.get('recommendations', [])
if recommendations:
comparison_text += "\n### π‘ Recommendations:\n"
for rec in recommendations:
comparison_text += f"- {rec}\n"
return comparison_text, comparison_plot
except Exception as e:
error_msg = f"Error performing comparison: {str(e)}"
return error_msg, None
# Initialize data on startup
print("π Starting SALT Translation Leaderboard - Scientific Edition...")
initialization_success = initialize_scientific_data()
# Create Gradio interface with scientific design
with gr.Blocks(
title=UI_CONFIG["title"],
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1600px !important;
margin: 0 auto;
}
.scientific-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.track-tab {
border-radius: 8px;
margin: 0.5rem;
padding: 1rem;
border: 2px solid transparent;
}
.track-tab.google-comparable {
border-color: #1f77b4;
background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
}
.track-tab.ug40-complete {
border-color: #ff7f0e;
background: linear-gradient(45deg, #fff7ed, #fed7aa);
}
.track-tab.language-pair-matrix {
border-color: #2ca02c;
background: linear-gradient(45deg, #f0fdf4, #dcfce7);
}
.metric-box {
background: #f8fafc;
padding: 1rem;
border-radius: 8px;
margin: 0.5rem 0;
border-left: 4px solid #3b82f6;
}
.scientific-note {
background: #fef3c7;
border: 1px solid #f59e0b;
border-radius: 8px;
padding: 1rem;
margin: 1rem 0;
}
.adequacy-excellent { border-left-color: #22c55e; }
.adequacy-good { border-left-color: #eab308; }
.adequacy-fair { border-left-color: #f97316; }
.adequacy-insufficient { border-left-color: #ef4444; }
"""
) as demo:
# Scientific Header
gr.HTML(f"""
<div class="scientific-header">
<h1>π SALT Translation Leaderboard - Scientific Edition</h1>
<p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
<p>Three-tier evaluation tracks β’ 95% Confidence intervals β’ Research-grade analysis</p>
<p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
</div>
""")
# Status indicator
if initialization_success:
status_msg = "β
Scientific system initialized successfully"
adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
status_msg += f" | Test set adequacy: {adequacy_info.title()}"
else:
status_msg = "β System initialization failed - some features may not work"
gr.Markdown(f"**System Status**: {status_msg}")
# Add scientific overview
gr.Markdown("""
## π¬ Scientific Evaluation Framework
This leaderboard implements rigorous scientific methodology for translation model evaluation:
- **Three Evaluation Tracks**: Fair comparison across different model capabilities
- **Statistical Significance**: 95% confidence intervals and effect size analysis
- **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
- **Cross-Track Consistency**: Validate model performance across language sets
""")
with gr.Tabs():
# Tab 1: Download Test Set
with gr.Tab("π₯ Download Test Set", id="download"):
gr.Markdown("""
## π Get the SALT Scientific Test Set
Download our scientifically designed test set with stratified sampling and statistical weighting.
""")
with gr.Row():
download_btn = gr.Button("π₯ Download Scientific Test Set", variant="primary", size="lg")
with gr.Row():
with gr.Column():
download_file = gr.File(label="π Test Set File", interactive=False)
with gr.Column():
download_info = gr.Markdown(label="βΉοΈ Test Set Information")
# Tab 2: Submit Predictions
with gr.Tab("π Submit Predictions", id="submit"):
gr.Markdown("""
## π― Submit Your Model's Predictions for Scientific Evaluation
Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### π Model Information")
model_name_input = gr.Textbox(
label="π€ Model Name",
placeholder="e.g., MyTranslator-v2.0",
info="Unique name for your model"
)
author_input = gr.Textbox(
label="π€ Author/Organization",
placeholder="Your name or organization",
value="Anonymous"
)
description_input = gr.Textbox(
label="π Model Description",
placeholder="Architecture, training data, special features...",
lines=4,
info="Detailed description helps with proper categorization"
)
gr.Markdown("### π€ Upload Predictions")
predictions_file = gr.File(
label="π Predictions File",
file_types=[".csv", ".tsv", ".json"]
)
validate_btn = gr.Button("β
Validate Submission", variant="secondary")
submit_btn = gr.Button("π Submit for Scientific Evaluation", variant="primary", interactive=False)
with gr.Column(scale=1):
gr.Markdown("### π Validation Results")
validation_output = gr.Markdown()
# Results section
gr.Markdown("### π Scientific Evaluation Results")
with gr.Row():
evaluation_output = gr.Markdown()
with gr.Row():
with gr.Column():
submission_plot = gr.Plot(label="π Submission Analysis")
with gr.Column():
cross_track_plot = gr.Plot(label="π Cross-Track Analysis")
with gr.Row():
results_table = gr.Dataframe(label="π Updated Leaderboard (Google-Comparable Track)", interactive=False)
# Tab 3: Google-Comparable Track
with gr.Tab("π€ Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
gr.Markdown(f"""
## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
**Fair comparison with commercial translation systems**
This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
enabling direct comparison with commercial baselines.
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Purpose**: Commercial system comparison and baseline establishment
- **Statistical Power**: High (optimized sample sizes)
""")
with gr.Row():
with gr.Column(scale=2):
google_search = gr.Textbox(label="π Search Models", placeholder="Search by model name, author...")
with gr.Column(scale=1):
google_category = gr.Dropdown(
label="π·οΈ Category Filter",
choices=["all"] + list(MODEL_CATEGORIES.keys()),
value="all"
)
with gr.Column(scale=1):
google_adequacy = gr.Slider(
label="π Min Adequacy",
minimum=0.0, maximum=1.0, value=0.0, step=0.1
)
with gr.Column(scale=1):
google_refresh = gr.Button("π Refresh", variant="secondary")
with gr.Row():
google_stats = gr.Markdown()
with gr.Row():
with gr.Column():
google_ranking_plot = gr.Plot(label="π Google-Comparable Rankings")
with gr.Column():
google_comparison_plot = gr.Plot(label="π Statistical Comparison")
with gr.Row():
google_leaderboard = gr.Dataframe(label="π Google-Comparable Leaderboard", interactive=False)
# Tab 4: UG40-Complete Track
with gr.Tab("π UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
gr.Markdown(f"""
## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
**Comprehensive evaluation across all Ugandan languages**
This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
providing the most comprehensive assessment of Ugandan language translation capabilities.
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Coverage**: Complete linguistic landscape of Uganda
""")
with gr.Row():
with gr.Column(scale=2):
ug40_search = gr.Textbox(label="π Search Models", placeholder="Search by model name, author...")
with gr.Column(scale=1):
ug40_category = gr.Dropdown(
label="π·οΈ Category Filter",
choices=["all"] + list(MODEL_CATEGORIES.keys()),
value="all"
)
with gr.Column(scale=1):
ug40_adequacy = gr.Slider(
label="π Min Adequacy",
minimum=0.0, maximum=1.0, value=0.0, step=0.1
)
with gr.Column(scale=1):
ug40_refresh = gr.Button("π Refresh", variant="secondary")
with gr.Row():
ug40_stats = gr.Markdown()
with gr.Row():
with gr.Column():
ug40_ranking_plot = gr.Plot(label="π UG40-Complete Rankings")
with gr.Column():
ug40_comparison_plot = gr.Plot(label="π Statistical Comparison")
with gr.Row():
ug40_leaderboard = gr.Dataframe(label="π UG40-Complete Leaderboard", interactive=False)
# Tab 5: Language-Pair Matrix
with gr.Tab("π Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
gr.Markdown(f"""
## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
**Detailed language pair analysis with statistical significance**
This view provides granular analysis of model performance across individual language pairs
with statistical significance testing and effect size analysis.
- **Resolution**: Individual language pair performance
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing available
""")
with gr.Row():
with gr.Column(scale=2):
matrix_search = gr.Textbox(label="π Search Models", placeholder="Search by model name, author...")
with gr.Column(scale=1):
matrix_category = gr.Dropdown(
label="π·οΈ Category Filter",
choices=["all"] + list(MODEL_CATEGORIES.keys()),
value="all"
)
with gr.Column(scale=1):
matrix_adequacy = gr.Slider(
label="π Min Adequacy",
minimum=0.0, maximum=1.0, value=0.0, step=0.1
)
with gr.Column(scale=1):
matrix_refresh = gr.Button("π Refresh", variant="secondary")
with gr.Row():
matrix_stats = gr.Markdown()
with gr.Row():
with gr.Column():
matrix_ranking_plot = gr.Plot(label="π Language-Pair Matrix Rankings")
with gr.Column():
matrix_comparison_plot = gr.Plot(label="π Statistical Comparison")
with gr.Row():
matrix_leaderboard = gr.Dataframe(label="π Language-Pair Matrix Leaderboard", interactive=False)
# Tab 6: Model Analysis
with gr.Tab("π Scientific Model Analysis", id="analysis"):
gr.Markdown("""
## π¬ Detailed Scientific Model Analysis
Comprehensive analysis of individual models with statistical confidence intervals,
cross-track performance, and detailed language pair breakdowns.
""")
with gr.Row():
with gr.Column(scale=2):
model_select = gr.Dropdown(
label="π€ Select Model",
choices=[],
value=None,
info="Choose a model for detailed scientific analysis"
)
with gr.Column(scale=1):
track_select = gr.Dropdown(
label="π Analysis Track",
choices=list(EVALUATION_TRACKS.keys()),
value="google_comparable",
info="Track for detailed analysis"
)
with gr.Column(scale=1):
analyze_btn = gr.Button("π Analyze", variant="primary")
with gr.Row():
model_details = gr.Markdown()
with gr.Row():
with gr.Column():
model_analysis_plot = gr.Plot(label="π Detailed Performance Analysis")
with gr.Column():
model_heatmap_plot = gr.Plot(label="πΊοΈ Language Pair Heatmap")
# Tab 7: Model Comparison
with gr.Tab("βοΈ Scientific Model Comparison", id="comparison"):
gr.Markdown("""
## π¬ Scientific Model Comparison
Compare multiple models with statistical significance testing and fair comparison analysis.
Only models evaluated on the same language pairs are compared for scientific validity.
""")
with gr.Row():
with gr.Column(scale=2):
comparison_models = gr.CheckboxGroup(
label="π€ Select Models to Compare",
choices=[],
value=[],
info="Select 2-6 models for comparison"
)
with gr.Column(scale=1):
comparison_track = gr.Dropdown(
label="π Comparison Track",
choices=list(EVALUATION_TRACKS.keys()),
value="google_comparable"
)
comparison_type = gr.Radio(
label="π Comparison Type",
choices=["statistical", "category"],
value="statistical"
)
compare_btn = gr.Button("βοΈ Compare Models", variant="primary")
with gr.Row():
comparison_output = gr.Markdown()
with gr.Row():
comparison_plot = gr.Plot(label="π Model Comparison Analysis")
# Tab 8: Documentation
with gr.Tab("π Scientific Documentation", id="docs"):
gr.Markdown(f"""
# π SALT Translation Leaderboard - Scientific Edition Documentation
## π― Overview
The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
for translation models on Ugandan languages, designed for research publication and scientific analysis.
## π¬ Scientific Methodology
### Three-Tier Evaluation System
**1. π€ Google-Comparable Track**
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Pairs**: {len(get_google_comparable_pairs())} language pairs
- **Purpose**: Fair comparison with commercial translation systems
- **Statistical Power**: High (β₯200 samples per pair recommended)
**2. π UG40-Complete Track**
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Pairs**: {len(get_all_language_pairs())} language pairs
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Statistical Power**: Moderate (β₯100 samples per pair recommended)
**3. π Language-Pair Matrix**
- **Resolution**: Individual language pair analysis
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing with multiple comparison correction
### Statistical Rigor
- **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
- **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
- **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
- **Statistical Power**: Estimated based on sample sizes and effect sizes
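For illustration, a bootstrap confidence interval of the kind described above can be sketched as follows. This is a minimal example assuming per-sample scores are available as a NumPy array; the leaderboard's internal implementation may differ.
```python
import numpy as np

def bootstrap_ci(scores, n_resamples=1000, confidence=0.95, seed=0):
    # Resample the per-sample scores with replacement and collect the means.
    rng = np.random.default_rng(seed)
    means = [rng.choice(scores, size=len(scores), replace=True).mean()
             for _ in range(n_resamples)]
    # The interval bounds are percentiles of the resampled means.
    lower = np.percentile(means, (1 - confidence) / 2 * 100)
    upper = np.percentile(means, (1 + confidence) / 2 * 100)
    return float(lower), float(upper)
```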
### Model Categories
Models are automatically categorized for fair comparison:
- **π’ Commercial**: Production translation systems (Google Translate, Azure, etc.)
- **π¬ Research**: Academic and research institution models (NLLB, M2M-100, etc.)
- **π Baseline**: Simple baseline and reference models
- **π₯ Community**: User-submitted models and fine-tuned variants
## π Evaluation Metrics
### Primary Metrics
- **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
- **BLEU**: Bilingual Evaluation Understudy (0-100)
- **ChrF**: Character-level F-score (0-1)
### Secondary Metrics
- **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
- **CER/WER**: Character/Word Error Rate (lower is better)
- **Length Ratio**: Prediction/reference length ratio
All metrics include 95% confidence intervals for statistical reliability.
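As a rough sketch, and not necessarily the leaderboard's exact pipeline, corpus-level BLEU and ChrF can be computed with the `sacrebleu` package (assumed to be installed); the sentences below are placeholders.
```python
import sacrebleu

# Hypotheses are your model outputs; references is a list of reference streams,
# one inner list aligned with the hypotheses.
hypotheses = ["Webale nnyo", "Oli otya"]
references = [["Webale nnyo", "Oli otya?"]]

bleu = sacrebleu.corpus_bleu(hypotheses, references)   # reported on a 0-100 scale
chrf = sacrebleu.corpus_chrf(hypotheses, references)   # sacrebleu reports 0-100; divide by 100 for the 0-1 convention above
print(bleu.score, chrf.score)
```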
## π Submission Process
### Step 1: Download Scientific Test Set
1. Click "Download Scientific Test Set" in the first tab
2. Review test set adequacy and track breakdown
3. Save the enhanced test set with statistical weights
### Step 2: Generate Predictions
1. Load the test set in your evaluation pipeline
2. For each row, translate `source_text` from `source_language` to `target_language`
3. Save results as CSV with columns: `sample_id`, `prediction`
4. Optional: Add `category` column for automatic classification
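A minimal sketch of this loop is shown below, assuming the downloaded file is saved as `salt_test_set.csv` and that `translate()` is your own model's inference function (both names are placeholders):
```python
import pandas as pd

test_set = pd.read_csv("salt_test_set.csv")

rows = []
for _, row in test_set.iterrows():
    # translate() is a placeholder for your own inference function.
    output = translate(row["source_text"], row["source_language"], row["target_language"])
    rows.append((row["sample_id"], output))

pd.DataFrame(rows, columns=["sample_id", "prediction"]).to_csv("predictions.csv", index=False)
```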
### Step 3: Submit & Evaluate
1. Fill in detailed model information (improves categorization)
2. Upload your predictions file
3. Review validation report with track-specific adequacy assessment
4. Submit for scientific evaluation across all tracks
## π Enhanced File Formats
### Scientific Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
```
### Predictions Format
```csv
sample_id,prediction,category
salt_000001,"Amakuru ensi","community"
salt_000002,"Ibino nining?","community"
salt_000003,"Ejok nanu","community"
```
## π Scientific Leaderboard Features
### Fair Comparison
- Models only compared within the same category and track
- Statistical significance testing prevents misleading rankings
- Confidence intervals show measurement uncertainty
### Cross-Track Analysis
- Consistency analysis across evaluation tracks
- Identification of model strengths and weaknesses
- Language-specific performance patterns
### Publication Quality
- All visualizations include error bars and statistical annotations
- Comprehensive methodology documentation
- Reproducible evaluation pipeline
## π¬ Statistical Interpretation Guide
### Confidence Intervals
- **Non-overlapping CIs**: Likely significant difference
- **Overlapping CIs**: May or may not be significant (requires formal testing)
- **Wide CIs**: High uncertainty (need more data)
### Effect Sizes
- **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
- **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
- **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
- **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
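For reference, Cohen's d between two models can be sketched as follows (pooled-standard-deviation form, assuming per-sample quality scores for each model; shown for interpretation only, not as the leaderboard's exact implementation):
```python
import numpy as np

def cohens_d(scores_a, scores_b):
    a, b = np.asarray(scores_a, dtype=float), np.asarray(scores_b, dtype=float)
    # Pooled variance across the two samples.
    pooled_var = ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) / (len(a) + len(b) - 2)
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)
```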
### Statistical Adequacy
- **Excellent**: High statistical power (>0.8) for all comparisons
- **Good**: Adequate power for most comparisons
- **Fair**: Limited power, interpret with caution
- **Insufficient**: Results not reliable for scientific conclusions
## π€ Contributing to Science
This leaderboard is designed for the research community. When using results:
1. **Always report confidence intervals** along with point estimates
2. **Acknowledge statistical adequacy** when interpreting results
3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
4. **Consider effect sizes** not just statistical significance
## π Citation
If you use this leaderboard in your research, please cite:
```bibtex
@misc{{salt_leaderboard_scientific_2024,
title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
author={{Sunbird AI}},
year={{2024}},
url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
note={{Three-tier evaluation system with statistical significance testing}}
}}
```
## π Related Resources
- **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
- **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
- **Statistical Methodology**: See our technical paper on rigorous MT evaluation
- **Open Source Code**: Available on GitHub for reproducibility
---
*For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
""")
# Event handlers with enhanced scientific functionality
predictions_validated = gr.State(value=None)
validation_info_state = gr.State(value=None)
detected_category_state = gr.State(value="community")
# Download test set
download_btn.click(
fn=download_scientific_test_set,
outputs=[download_file, download_info]
)
# Validate predictions
def handle_scientific_validation(file, model_name, author, description):
report, predictions, category = validate_scientific_submission(file, model_name, author, description)
# Enable button if predictions are available (allows evaluation with limitations)
can_evaluate = predictions is not None
# Add user-friendly button status message to report
if can_evaluate:
if "π **Final Verdict**: Ready for scientific evaluation!" in report:
button_status = "\n\nβ
**Button Status**: Ready to submit for evaluation!"
elif "β οΈ **Final Verdict**: Can be evaluated with limitations" in report:
button_status = "\n\nβ οΈ **Button Status**: Can submit for evaluation (results will include limitations note)"
else:
button_status = "\n\nβ
**Button Status**: Evaluation possible"
else:
button_status = "\n\nβ **Button Status**: Please fix issues above before evaluation"
enhanced_report = report + button_status
return (
enhanced_report,
predictions,
{"category": category, "validation_passed": can_evaluate},
category,
gr.update(interactive=can_evaluate)
)
validate_btn.click(
fn=handle_scientific_validation,
inputs=[predictions_file, model_name_input, author_input, description_input],
outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
)
# Submit for evaluation
def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
if predictions is None:
return "β Please validate your submission first", None, None, None
return evaluate_scientific_submission(
predictions, model_name, author, description, category, validation_info
)
submit_btn.click(
fn=handle_scientific_submission,
inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
outputs=[evaluation_output, results_table, submission_plot, cross_track_plot]
)
# Track leaderboard refresh functions
def refresh_google_track(*args):
return refresh_track_leaderboard("google_comparable", *args)
def refresh_ug40_track(*args):
return refresh_track_leaderboard("ug40_complete", *args)
def refresh_matrix_track(*args):
return refresh_track_leaderboard("language_pair_matrix", *args)
# Google-Comparable Track
google_refresh.click(
fn=refresh_google_track,
inputs=[google_search, google_category, google_adequacy],
outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
)
# UG40-Complete Track
ug40_refresh.click(
fn=refresh_ug40_track,
inputs=[ug40_search, ug40_category, ug40_adequacy],
outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
)
# Language-Pair Matrix Track
matrix_refresh.click(
fn=refresh_matrix_track,
inputs=[matrix_search, matrix_category, matrix_adequacy],
outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
)
# Model analysis
analyze_btn.click(
fn=get_scientific_model_details,
inputs=[model_select, track_select],
outputs=[model_details, model_analysis_plot, model_heatmap_plot]
)
# Model comparison
compare_btn.click(
fn=perform_model_comparison,
inputs=[comparison_models, comparison_track, comparison_type],
outputs=[comparison_output, comparison_plot]
)
# Load initial data and update dropdowns
def load_initial_data():
# Load initial Google track data
google_data = refresh_google_track("", "all", 0.0)
# Update dropdown choices
if current_leaderboard is not None and not current_leaderboard.empty:
model_choices = current_leaderboard['model_name'].tolist()
else:
model_choices = []
return (
google_data[0], # google_leaderboard
google_data[1], # google_ranking_plot
google_data[2], # google_comparison_plot
google_data[3], # google_stats
gr.Dropdown(choices=model_choices), # model_select
gr.CheckboxGroup(choices=model_choices) # comparison_models
)
demo.load(
fn=load_initial_data,
outputs=[
google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats,
model_select, comparison_models
]
)
# Launch the scientific application
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
show_error=True
)