Spaces:

akera
/

leaderboard

Running

App Files Community

akera committed on Jun 16

Commit

aed11c8

verified ·

1 Parent(s): 988dfa3

Update app.py

Browse files

Files changed (1) hide show

app.py +447 -286

app.py CHANGED Viewed

@@ -4,52 +4,64 @@ import sys
 import os
 from pathlib import Path
 def setup_salt():
     """Clone and setup SALT library like in Colab."""
     try:
         # Check if salt is already available
         import salt.dataset
         print("✅ SALT library already available")
         return True
     except ImportError:
         pass
     print("📥 Setting up SALT library...")
     try:
         # Clone SALT repo if not exists
         salt_dir = Path("salt")
         if not salt_dir.exists():
             print("🔄 Cloning SALT repository...")
-            subprocess.check_call([
-                "git", "clone", "https://github.com/sunbirdai/salt.git"
-            ])
         else:
             print("📁 SALT repository already exists")
         # Install SALT requirements
         salt_requirements = salt_dir / "requirements.txt"
         if salt_requirements.exists():
             print("📦 Installing SALT requirements...")
-            subprocess.check_call([
-                sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
-            ])
         # Add SALT directory to Python path
         salt_path = str(salt_dir.absolute())
         if salt_path not in sys.path:
             sys.path.insert(0, salt_path)
             print(f"🔗 Added {salt_path} to Python path")
         # Test import
         import salt.dataset
         print("✅ SALT library setup completed successfully")
         return True
     except Exception as e:
         print(f"❌ Failed to setup SALT: {e}")
         return False
 # Setup SALT on startup
 print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
 if not setup_salt():
@@ -66,42 +78,42 @@ from typing import Optional, Dict, Tuple, List
 # Import our enhanced modules
 from src.test_set import (
-    get_public_test_set_scientific,
     get_complete_test_set_scientific,
-    create_test_set_download_scientific,
     validate_test_set_integrity_scientific,
-    get_track_test_set
 )
 from src.validation import validate_submission_scientific
 from src.evaluation import (
-    evaluate_predictions_scientific,
     generate_scientific_report,
-    compare_models_statistically
 )
 from src.leaderboard import (
-    load_scientific_leaderboard,
     add_model_to_scientific_leaderboard,
-    get_scientific_leaderboard_stats,
     get_track_leaderboard,
     prepare_track_leaderboard_display,
     perform_fair_comparison,
-    export_scientific_leaderboard
 )
 from src.plotting import (
-    create_scientific_leaderboard_plot,
     create_language_pair_heatmap_scientific,
     create_statistical_comparison_plot,
     create_category_comparison_plot,
     create_adequacy_analysis_plot,
     create_cross_track_analysis_plot,
-    create_scientific_model_detail_plot
 )
 from src.utils import (
-    sanitize_model_name,
-    get_all_language_pairs,
     get_google_comparable_pairs,
     get_track_language_pairs,
-    format_metric_value
 )
 from config import *
@@ -111,60 +123,64 @@ public_test_set = None
 complete_test_set = None
 test_set_stats = None
 def initialize_scientific_data():
     """Initialize scientific test sets and leaderboard data."""
     global public_test_set, complete_test_set, current_leaderboard, test_set_stats
     try:
         print("🔬 Initializing SALT Translation Leaderboard - Scientific Edition...")
         # Load scientific test sets
         print("📥 Loading scientific test sets...")
         public_test_set = get_public_test_set_scientific()
         complete_test_set = get_complete_test_set_scientific()
         # Load scientific leaderboard
         print("🏆 Loading scientific leaderboard...")
         current_leaderboard = load_scientific_leaderboard()
         # Validate test set integrity
         print("🔍 Validating test set integrity...")
         test_set_stats = validate_test_set_integrity_scientific()
         print(f"✅ Scientific initialization complete!")
         print(f"   - Test set: {len(public_test_set):,} samples")
         print(f"   - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
-        print(f"   - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
         print(f"   - Current models: {len(current_leaderboard)}")
         return True
     except Exception as e:
         print(f"❌ Scientific initialization failed: {e}")
         traceback.print_exc()
         return False
 def download_scientific_test_set() -> Tuple[str, str]:
     """Create downloadable scientific test set and return file path and info."""
     try:
         global public_test_set
         if public_test_set is None:
             public_test_set = get_public_test_set_scientific()
         # Create download file
         download_path, stats = create_test_set_download_scientific()
         # Create comprehensive info message
-        adequacy = stats.get('adequacy_assessment', 'unknown')
         adequacy_emoji = {
-            'excellent': '🟢',
-            'good': '🟡',
-            'fair': '🟠',
-            'insufficient': '🔴',
-            'unknown': '⚪'
-        }.get(adequacy, '⚪')
         info_msg = f"""
 ## 📥 SALT Scientific Test Set Downloaded Successfully!
@@ -182,10 +198,12 @@ def download_scientific_test_set() -> Tuple[str, str]:
 ### 🏁 Track Breakdown:
 """
-        track_breakdown = stats.get('track_breakdown', {})
         for track_name, track_info in track_breakdown.items():
-            status_emoji = '✅' if track_info.get('statistical_adequacy', False) else '⚠️'
             info_msg += f"""
 **{status_emoji} {track_info.get('name', track_name)}**:
 - Samples: {track_info.get('total_samples', 0):,}
@@ -193,7 +211,7 @@ def download_scientific_test_set() -> Tuple[str, str]:
 - Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
 - Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
 """
         info_msg += f"""
 ### 📋 Enhanced File Format:
@@ -219,18 +237,19 @@ def download_scientific_test_set() -> Tuple[str, str]:
 - Provide detailed model description for proper categorization
 - Consider submitting to multiple tracks for comprehensive evaluation
         """
         return download_path, info_msg
     except Exception as e:
         error_msg = f"❌ Error creating scientific test set download: {str(e)}"
         return None, error_msg
 def validate_scientific_submission(
     file, model_name: str, author: str, description: str
 ) -> Tuple[str, Optional[pd.DataFrame], str]:
     """Validate uploaded prediction file with scientific rigor."""
     try:
         if file is None:
             return "❌ Please upload a predictions file", None, "community"
@@ -270,9 +289,13 @@ def validate_scientific_submission(
         )
         detected_category = validation_result.get("category", "community")
         if validation_result["valid"]:
-            return validation_result["report"], validation_result["predictions"], detected_category
         else:
             return validation_result["report"], None, detected_category
@@ -280,9 +303,10 @@ def validate_scientific_submission(
         return (
             f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
             None,
-            "community"
         )
 def evaluate_scientific_submission(
     predictions_df: pd.DataFrame,
     model_name: str,
@@ -292,26 +316,33 @@ def evaluate_scientific_submission(
     validation_info: Dict,
 ) -> Tuple[str, pd.DataFrame, object, object]:
     """Evaluate validated predictions using scientific methodology."""
     try:
         if predictions_df is None:
             return "❌ No valid predictions to evaluate", None, None, None
         # Get complete test set with targets
         global complete_test_set, current_leaderboard
         if complete_test_set is None:
             complete_test_set = get_complete_test_set_scientific()
         # Run scientific evaluation across all tracks
         print(f"🔬 Starting scientific evaluation for {model_name}...")
         evaluation_results = evaluate_predictions_scientific(
             predictions_df, complete_test_set, detected_category
         )
-        if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
-            errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
             return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
         # Add to scientific leaderboard
         print("🏆 Adding to scientific leaderboard...")
         updated_leaderboard = add_model_to_scientific_leaderboard(
@@ -319,23 +350,27 @@ def evaluate_scientific_submission(
             author=author or "Anonymous",
             evaluation_results=evaluation_results,
             model_category=detected_category,
-            description=description or ""
         )
         # Update global leaderboard
         current_leaderboard = updated_leaderboard
         # Generate scientific report
         report = generate_scientific_report(evaluation_results, model_name)
         # Create visualizations
         summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
         cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
         # Prepare display leaderboard (Google-comparable track by default)
-        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
-        display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
         # Format success message with track-specific results
         success_msg = f"""
 ## 🎉 Scientific Evaluation Complete!
@@ -347,28 +382,33 @@ def evaluate_scientific_submission(
 ### 🏆 Track Performance Summary:
 """
-        tracks = evaluation_results.get('tracks', {})
         for track_name, track_data in tracks.items():
-            if not track_data.get('error'):
                 track_config = EVALUATION_TRACKS[track_name]
-                track_averages = track_data.get('track_averages', {})
-                summary = track_data.get('summary', {})
                 # Get rank in this track
-                track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
                 if not track_leaderboard.empty:
-                    model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
                     rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
                     total_models = len(track_leaderboard)
                 else:
                     rank = "N/A"
                     total_models = 0
-                quality_score = track_averages.get('quality_score', 0)
-                bleu_score = track_averages.get('bleu', 0)
-                samples = summary.get('total_samples', 0)
                 success_msg += f"""
 **🏁 {track_config['name']}**:
 - Rank: #{rank} out of {total_models} models
@@ -376,7 +416,7 @@ def evaluate_scientific_submission(
 - BLEU: {bleu_score:.2f}
 - Samples: {samples:,}
 """
         success_msg += f"""
 ### 🔬 Scientific Adequacy:
@@ -386,52 +426,57 @@ def evaluate_scientific_submission(
 {report}
         """
         return success_msg, display_leaderboard, summary_plot, cross_track_plot
     except Exception as e:
         error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
         return error_msg, None, None, None
 def refresh_track_leaderboard(
     track: str,
     search_query: str = "",
     category_filter: str = "all",
     min_adequacy: float = 0.0,
-    show_ci: bool = True
 ) -> Tuple[pd.DataFrame, object, object, str]:
     """Refresh leaderboard for a specific track with filters."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
             current_leaderboard = load_scientific_leaderboard()
         # Get track-specific leaderboard
         track_leaderboard = get_track_leaderboard(
-            current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
         )
         # Apply search filter
         if search_query:
             query_lower = search_query.lower()
-            mask = (
-                track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
-                track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
             )
             track_leaderboard = track_leaderboard[mask]
         # Prepare for display
         display_df = prepare_track_leaderboard_display(track_leaderboard, track)
         # Create plots
         ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
         comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
         # Get track statistics
         track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
         track_config = EVALUATION_TRACKS[track]
         stats_text = f"""
 ### 📊 {track_config['name']} Statistics
@@ -447,46 +492,51 @@ def refresh_track_leaderboard(
 - Statistical adequacy verified for reliable comparisons
 - {track_config['description']}
         """
         return display_df, ranking_plot, comparison_plot, stats_text
     except Exception as e:
         error_msg = f"Error loading {track} leaderboard: {str(e)}"
         empty_df = pd.DataFrame()
         return empty_df, None, None, error_msg
-def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
     """Get detailed scientific analysis for a specific model."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
             return "Leaderboard not loaded", None, None
         # Find model
-        model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
         if model_row.empty:
             return f"Model '{model_name}' not found", None, None
         model_info = model_row.iloc[0]
         # Parse detailed metrics for the requested track
         try:
-            detailed_results = json.loads(model_info[f'detailed_{track}'])
         except:
             detailed_results = {}
         # Create detailed plots
-        detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
         # Create language pair heatmap
         heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
         # Format model details with scientific information
         track_config = EVALUATION_TRACKS[track]
-        category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
         # Extract track-specific metrics
         quality_col = f"{track}_quality"
         bleu_col = f"{track}_bleu"
@@ -496,7 +546,7 @@ def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, obje
         samples_col = f"{track}_samples"
         pairs_col = f"{track}_pairs"
         adequate_col = f"{track}_adequate"
         details_text = f"""
 ## 🔬 Scientific Model Analysis: {model_name}
@@ -523,19 +573,19 @@ def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, obje
 ### 📈 Cross-Track Performance:
 """
         # Add other track performances for comparison
         for other_track in EVALUATION_TRACKS.keys():
             if other_track != track:
                 other_quality_col = f"{other_track}_quality"
                 other_adequate_col = f"{other_track}_adequate"
                 if model_info.get(other_adequate_col, False):
                     other_quality = model_info.get(other_quality_col, 0)
                     details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
                 else:
                     details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
         details_text += f"""
 ### 💡 Scientific Interpretation:
@@ -544,44 +594,47 @@ def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, obje
 - Cross-track analysis reveals model strengths across different language sets
 - Category classification helps contextualize performance expectations
         """
         return details_text, detail_plot, heatmap_plot
     except Exception as e:
         error_msg = f"Error getting model details: {str(e)}"
         return error_msg, None, None
 def perform_model_comparison(
     model_names: List[str], track: str, comparison_type: str = "statistical"
 ) -> Tuple[str, object]:
     """Perform scientific comparison between selected models."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
             return "Leaderboard not loaded", None
         if len(model_names) < 2:
             return "Please select at least 2 models for comparison", None
         # Get models
-        models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
         if len(models) < 2:
             return "Selected models not found in leaderboard", None
         # Perform fair comparison
         comparison_result = perform_fair_comparison(current_leaderboard, model_names)
-        if comparison_result.get('error'):
             return f"Comparison error: {comparison_result['error']}", None
         # Create comparison visualization
         if comparison_type == "statistical":
             comparison_plot = create_statistical_comparison_plot(models, track)
         else:
             comparison_plot = create_category_comparison_plot(models, track)
         # Format comparison report
         track_config = EVALUATION_TRACKS[track]
         comparison_text = f"""
@@ -589,26 +642,26 @@ def perform_model_comparison(
 ### 📊 Models Compared:
 """
         quality_col = f"{track}_quality"
         ci_lower_col = f"{track}_ci_lower"
         ci_upper_col = f"{track}_ci_upper"
         # Sort models by performance
         models_sorted = models.sort_values(quality_col, ascending=False)
         for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
-            category_info = MODEL_CATEGORIES.get(model['model_category'], {})
             comparison_text += f"""
 **#{i}. {model['model_name']}**
 - Category: {category_info.get('name', 'Unknown')}
 - Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
 - Author: {model['author']}
 """
         # Add statistical analysis
-        track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
         if track_comparison:
             comparison_text += f"""
@@ -617,29 +670,32 @@ def perform_model_comparison(
 - **Confidence intervals available**: Yes (95% level)
 - **Fair comparison possible**: {'✅ Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
 """
             # Check for statistical significance (simplified)
-            quality_scores = list(track_comparison.get('quality_scores', {}).values())
             if len(quality_scores) >= 2:
                 score_range = max(quality_scores) - min(quality_scores)
                 if score_range > 0.05:  # 5% difference threshold
-                    comparison_text += "- **Performance differences**: Potentially significant\n"
                 else:
                     comparison_text += "- **Performance differences**: Minimal\n"
         # Add recommendations
-        recommendations = comparison_result.get('recommendations', [])
         if recommendations:
             comparison_text += "\n### 💡 Recommendations:\n"
             for rec in recommendations:
                 comparison_text += f"- {rec}\n"
         return comparison_text, comparison_plot
     except Exception as e:
         error_msg = f"Error performing comparison: {str(e)}"
         return error_msg, None
 # Initialize data on startup
 print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
 initialization_success = initialize_scientific_data()
@@ -698,31 +754,36 @@ with gr.Blocks(
     .adequacy-good { border-left-color: #eab308; }
     .adequacy-fair { border-left-color: #f97316; }
     .adequacy-insufficient { border-left-color: #ef4444; }
-    """
 ) as demo:
     # Scientific Header
-    gr.HTML(f"""
     <div class="scientific-header">
     <h1>🏆 SALT Translation Leaderboard - Scientific Edition</h1>
     <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
     <p>Three-tier evaluation tracks • 95% Confidence intervals • Research-grade analysis</p>
     <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
     </div>
-    """)
     # Status indicator
     if initialization_success:
         status_msg = "✅ Scientific system initialized successfully"
-        adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
         status_msg += f" | Test set adequacy: {adequacy_info.title()}"
     else:
         status_msg = "❌ System initialization failed - some features may not work"
     gr.Markdown(f"**System Status**: {status_msg}")
     # Add scientific overview
-    gr.Markdown("""
     ## 🔬 Scientific Evaluation Framework
     This leaderboard implements rigorous scientific methodology for translation model evaluation:
@@ -731,89 +792,110 @@ with gr.Blocks(
     - **Statistical Significance**: 95% confidence intervals and effect size analysis
     - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
     - **Cross-Track Consistency**: Validate model performance across language sets
-    """)
     with gr.Tabs():
         # Tab 1: Download Test Set
         with gr.Tab("📥 Download Test Set", id="download"):
-            gr.Markdown("""
             ## 📋 Get the SALT Scientific Test Set
             Download our scientifically designed test set with stratified sampling and statistical weighting.
-            """)
             with gr.Row():
-                download_btn = gr.Button("📥 Download Scientific Test Set", variant="primary", size="lg")
             with gr.Row():
                 with gr.Column():
                     download_file = gr.File(label="📂 Test Set File", interactive=False)
                 with gr.Column():
                     download_info = gr.Markdown(label="ℹ️ Test Set Information")
-        # Tab 2: Submit Predictions
         with gr.Tab("🚀 Submit Predictions", id="submit"):
-            gr.Markdown("""
             ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
             Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
-            """)
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.Markdown("### 📝 Model Information")
                     model_name_input = gr.Textbox(
                         label="🤖 Model Name",
                         placeholder="e.g., MyTranslator-v2.0",
-                        info="Unique name for your model"
                     )
                     author_input = gr.Textbox(
-                        label="👤 Author/Organization",
                         placeholder="Your name or organization",
-                        value="Anonymous"
                     )
                     description_input = gr.Textbox(
                         label="📄 Model Description",
                         placeholder="Architecture, training data, special features...",
                         lines=4,
-                        info="Detailed description helps with proper categorization"
                     )
                     gr.Markdown("### 📤 Upload Predictions")
                     predictions_file = gr.File(
                         label="📂 Predictions File",
-                        file_types=[".csv", ".tsv", ".json"]
                     )
-                    validate_btn = gr.Button("✅ Validate Submission", variant="secondary")
-                    submit_btn = gr.Button("🚀 Submit for Scientific Evaluation", variant="primary", interactive=False)
                 with gr.Column(scale=1):
                     gr.Markdown("### 📊 Validation Results")
                     validation_output = gr.Markdown()
             # Results section
             gr.Markdown("### 🏆 Scientific Evaluation Results")
             with gr.Row():
                 evaluation_output = gr.Markdown()
             with gr.Row():
                 with gr.Column():
                     submission_plot = gr.Plot(label="📈 Submission Analysis")
                 with gr.Column():
                     cross_track_plot = gr.Plot(label="🔄 Cross-Track Analysis")
             with gr.Row():
-                results_table = gr.Dataframe(label="📊 Updated Leaderboard (Google-Comparable Track)", interactive=False)
         # Tab 3: Google-Comparable Track
-        with gr.Tab("🤖 Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
-            gr.Markdown(f"""
             ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
             **Fair comparison with commercial translation systems**
@@ -824,40 +906,54 @@ with gr.Blocks(
             - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
             - **Purpose**: Commercial system comparison and baseline establishment
             - **Statistical Power**: High (optimized sample sizes)
-            """)
             with gr.Row():
                 with gr.Column(scale=2):
-                    google_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                 with gr.Column(scale=1):
                     google_category = gr.Dropdown(
                         label="🏷️ Category Filter",
                         choices=["all"] + list(MODEL_CATEGORIES.keys()),
-                        value="all"
                     )
                 with gr.Column(scale=1):
                     google_adequacy = gr.Slider(
                         label="📊 Min Adequacy",
-                        minimum=0.0, maximum=1.0, value=0.0, step=0.1
                     )
                 with gr.Column(scale=1):
                     google_refresh = gr.Button("🔄 Refresh", variant="secondary")
             with gr.Row():
                 google_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
                     google_ranking_plot = gr.Plot(label="🏆 Google-Comparable Rankings")
                 with gr.Column():
                     google_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
-                google_leaderboard = gr.Dataframe(label="📈 Google-Comparable Leaderboard", interactive=False)
         # Tab 4: UG40-Complete Track
-        with gr.Tab("🌍 UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
-            gr.Markdown(f"""
             ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
             **Comprehensive evaluation across all Ugandan languages**
@@ -868,40 +964,54 @@ with gr.Blocks(
             - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
             - **Purpose**: Comprehensive Ugandan language capability assessment
             - **Coverage**: Complete linguistic landscape of Uganda
-            """)
             with gr.Row():
                 with gr.Column(scale=2):
-                    ug40_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                 with gr.Column(scale=1):
                     ug40_category = gr.Dropdown(
                         label="🏷️ Category Filter",
                         choices=["all"] + list(MODEL_CATEGORIES.keys()),
-                        value="all"
                     )
                 with gr.Column(scale=1):
                     ug40_adequacy = gr.Slider(
                         label="📊 Min Adequacy",
-                        minimum=0.0, maximum=1.0, value=0.0, step=0.1
                     )
                 with gr.Column(scale=1):
                     ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")
             with gr.Row():
                 ug40_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
                     ug40_ranking_plot = gr.Plot(label="🏆 UG40-Complete Rankings")
                 with gr.Column():
                     ug40_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
-                ug40_leaderboard = gr.Dataframe(label="📈 UG40-Complete Leaderboard", interactive=False)
         # Tab 5: Language-Pair Matrix
-        with gr.Tab("📊 Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
-            gr.Markdown(f"""
             ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
             **Detailed language pair analysis with statistical significance**
@@ -912,112 +1022,130 @@ with gr.Blocks(
             - **Resolution**: Individual language pair performance
             - **Purpose**: Detailed linguistic analysis and model diagnostics
             - **Statistics**: Pairwise significance testing available
-            """)
             with gr.Row():
                 with gr.Column(scale=2):
-                    matrix_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                 with gr.Column(scale=1):
                     matrix_category = gr.Dropdown(
                         label="🏷️ Category Filter",
                         choices=["all"] + list(MODEL_CATEGORIES.keys()),
-                        value="all"
                     )
                 with gr.Column(scale=1):
                     matrix_adequacy = gr.Slider(
                         label="📊 Min Adequacy",
-                        minimum=0.0, maximum=1.0, value=0.0, step=0.1
                     )
                 with gr.Column(scale=1):
                     matrix_refresh = gr.Button("🔄 Refresh", variant="secondary")
             with gr.Row():
                 matrix_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
-                    matrix_ranking_plot = gr.Plot(label="🏆 Language-Pair Matrix Rankings")
                 with gr.Column():
                     matrix_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
-                matrix_leaderboard = gr.Dataframe(label="📈 Language-Pair Matrix Leaderboard", interactive=False)
         # Tab 6: Model Analysis
         with gr.Tab("🔍 Scientific Model Analysis", id="analysis"):
-            gr.Markdown("""
             ## 🔬 Detailed Scientific Model Analysis
             Comprehensive analysis of individual models with statistical confidence intervals,
             cross-track performance, and detailed language pair breakdowns.
-            """)
             with gr.Row():
                 with gr.Column(scale=2):
                     model_select = gr.Dropdown(
                         label="🤖 Select Model",
                         choices=[],
                         value=None,
-                        info="Choose a model for detailed scientific analysis"
                     )
                 with gr.Column(scale=1):
                     track_select = gr.Dropdown(
                         label="🏁 Analysis Track",
                         choices=list(EVALUATION_TRACKS.keys()),
                         value="google_comparable",
-                        info="Track for detailed analysis"
                     )
                 with gr.Column(scale=1):
                     analyze_btn = gr.Button("🔍 Analyze", variant="primary")
             with gr.Row():
                 model_details = gr.Markdown()
             with gr.Row():
                 with gr.Column():
-                    model_analysis_plot = gr.Plot(label="📊 Detailed Performance Analysis")
                 with gr.Column():
                     model_heatmap_plot = gr.Plot(label="🗺️ Language Pair Heatmap")
         # Tab 7: Model Comparison
         with gr.Tab("⚖️ Scientific Model Comparison", id="comparison"):
-            gr.Markdown("""
             ## 🔬 Scientific Model Comparison
             Compare multiple models with statistical significance testing and fair comparison analysis.
             Only models evaluated on the same language pairs are compared for scientific validity.
-            """)
             with gr.Row():
                 with gr.Column(scale=2):
                     comparison_models = gr.CheckboxGroup(
                         label="🤖 Select Models to Compare",
                         choices=[],
                         value=[],
-                        info="Select 2-6 models for comparison"
                     )
                 with gr.Column(scale=1):
                     comparison_track = gr.Dropdown(
                         label="🏁 Comparison Track",
                         choices=list(EVALUATION_TRACKS.keys()),
-                        value="google_comparable"
                     )
                     comparison_type = gr.Radio(
                         label="📊 Comparison Type",
                         choices=["statistical", "category"],
-                        value="statistical"
                     )
                     compare_btn = gr.Button("⚖️ Compare Models", variant="primary")
             with gr.Row():
                 comparison_output = gr.Markdown()
             with gr.Row():
                 comparison_plot = gr.Plot(label="📊 Model Comparison Analysis")
         # Tab 8: Documentation
         with gr.Tab("📚 Scientific Documentation", id="docs"):
-            gr.Markdown(f"""
             # 📖 SALT Translation Leaderboard - Scientific Edition Documentation
             ## 🎯 Overview
@@ -1182,131 +1310,164 @@ with gr.Blocks(
             ---
             *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
-            """)
     # Event handlers with enhanced scientific functionality
     predictions_validated = gr.State(value=None)
     validation_info_state = gr.State(value=None)
     detected_category_state = gr.State(value="community")
     # Download test set
     download_btn.click(
-        fn=download_scientific_test_set,
-        outputs=[download_file, download_info]
     )
     # Validate predictions
     def handle_scientific_validation(file, model_name, author, description):
-        report, predictions, category = validate_scientific_submission(file, model_name, author, description)
-        valid = predictions is not None
         return (
             report,
             predictions,
-            {"category": category, "validation_passed": valid},
             category,
-            gr.update(interactive=valid)
         )
     validate_btn.click(
         fn=handle_scientific_validation,
         inputs=[predictions_file, model_name_input, author_input, description_input],
-        outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
     )
     # Submit for evaluation
-    def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
         if predictions is None:
             return "❌ Please validate your submission first", None, None, None
         return evaluate_scientific_submission(
             predictions, model_name, author, description, category, validation_info
         )
     submit_btn.click(
         fn=handle_scientific_submission,
-        inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
-        outputs=[evaluation_output, results_table, submission_plot, cross_track_plot]
     )
     # Track leaderboard refresh functions
     def refresh_google_track(*args):
         return refresh_track_leaderboard("google_comparable", *args)
     def refresh_ug40_track(*args):
         return refresh_track_leaderboard("ug40_complete", *args)
     def refresh_matrix_track(*args):
         return refresh_track_leaderboard("language_pair_matrix", *args)
     # Google-Comparable Track
     google_refresh.click(
         fn=refresh_google_track,
         inputs=[google_search, google_category, google_adequacy],
-        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
     )
     # UG40-Complete Track
     ug40_refresh.click(
         fn=refresh_ug40_track,
         inputs=[ug40_search, ug40_category, ug40_adequacy],
-        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
     )
     # Language-Pair Matrix Track
     matrix_refresh.click(
         fn=refresh_matrix_track,
         inputs=[matrix_search, matrix_category, matrix_adequacy],
-        outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
     )
     # Model analysis
     analyze_btn.click(
         fn=get_scientific_model_details,
         inputs=[model_select, track_select],
-        outputs=[model_details, model_analysis_plot, model_heatmap_plot]
     )
     # Model comparison
     compare_btn.click(
         fn=perform_model_comparison,
         inputs=[comparison_models, comparison_track, comparison_type],
-        outputs=[comparison_output, comparison_plot]
     )
     # Load initial data and update dropdowns
     def load_initial_data():
         # Load initial Google track data
         google_data = refresh_google_track("", "all", 0.0)
         # Update dropdown choices
         if current_leaderboard is not None and not current_leaderboard.empty:
-            model_choices = current_leaderboard['model_name'].tolist()
         else:
             model_choices = []
         return (
             google_data[0],  # google_leaderboard
-            google_data[1],  # google_ranking_plot
             google_data[2],  # google_comparison_plot
             google_data[3],  # google_stats
             gr.Dropdown(choices=model_choices),  # model_select
-            gr.CheckboxGroup(choices=model_choices)  # comparison_models
         )
     demo.load(
         fn=load_initial_data,
         outputs=[
-            google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats,
-            model_select, comparison_models
-        ]
     )
 # Launch the scientific application
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        show_error=True
-    )

 import os
 from pathlib import Path
 def setup_salt():
     """Make the SALT library importable, cloning and installing it on demand.

     If ``salt.dataset`` already imports, nothing else is done. Otherwise the
     SALT repository is cloned into ``./salt`` (unless that directory already
     exists), its ``requirements.txt`` is installed with pip when present, the
     checkout is placed at the front of ``sys.path`` and the import is retried.

     Returns:
         bool: True if ``salt.dataset`` is importable afterwards; False if any
         step (clone, pip install, import) raised.
     """
     import importlib

     try:
         # Check if salt is already available
         import salt.dataset  # noqa: F401
         print("✅ SALT library already available")
         return True
     except ImportError:
         pass
     print("📥 Setting up SALT library...")
     try:
         # Clone SALT repo if not exists
         salt_dir = Path("salt")
         if not salt_dir.exists():
             print("🔄 Cloning SALT repository...")
             subprocess.check_call(
                 ["git", "clone", "https://github.com/sunbirdai/salt.git"]
             )
         else:
             print("📁 SALT repository already exists")
         # Install SALT requirements
         salt_requirements = salt_dir / "requirements.txt"
         if salt_requirements.exists():
             print("📦 Installing SALT requirements...")
             subprocess.check_call(
                 [
                     sys.executable,
                     "-m",
                     "pip",
                     "install",
                     "-q",
                     "-r",
                     str(salt_requirements),
                 ]
             )
         # Add SALT directory to Python path
         salt_path = str(salt_dir.absolute())
         if salt_path not in sys.path:
             sys.path.insert(0, salt_path)
             print(f"🔗 Added {salt_path} to Python path")
         # The failed probe above may have left a ``salt`` entry in sys.modules
         # that was resolved before sys.path was updated; drop it so the retry
         # is resolved from scratch.
         stale = [
             name
             for name in sys.modules
             if name == "salt" or name.startswith("salt.")
         ]
         for name in stale:
             del sys.modules[name]
         # Modules installed while the interpreter is running are only
         # guaranteed to be seen by the import finders after this call.
         importlib.invalidate_caches()
         # Test import
         import salt.dataset  # noqa: F401
         print("✅ SALT library setup completed successfully")
         return True
     except Exception as e:
         # Startup boundary: report the failure and let the caller decide.
         print(f"❌ Failed to setup SALT: {e}")
         return False
 # Setup SALT on startup
 print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
 if not setup_salt():
 # Import our enhanced modules
 from src.test_set import (
+    get_public_test_set_scientific,
     get_complete_test_set_scientific,
+    create_test_set_download_scientific,
     validate_test_set_integrity_scientific,
+    get_track_test_set,
 )
 from src.validation import validate_submission_scientific
 from src.evaluation import (
+    evaluate_predictions_scientific,
     generate_scientific_report,
+    compare_models_statistically,
 )
 from src.leaderboard import (
+    load_scientific_leaderboard,
     add_model_to_scientific_leaderboard,
+    get_scientific_leaderboard_stats,
     get_track_leaderboard,
     prepare_track_leaderboard_display,
     perform_fair_comparison,
+    export_scientific_leaderboard,
 )
 from src.plotting import (
+    create_scientific_leaderboard_plot,
     create_language_pair_heatmap_scientific,
     create_statistical_comparison_plot,
     create_category_comparison_plot,
     create_adequacy_analysis_plot,
     create_cross_track_analysis_plot,
+    create_scientific_model_detail_plot,
 )
 from src.utils import (
+    sanitize_model_name,
+    get_all_language_pairs,
     get_google_comparable_pairs,
     get_track_language_pairs,
+    format_metric_value,
 )
 from config import *
 complete_test_set = None
 test_set_stats = None
 def initialize_scientific_data():
     """Load the test sets, the leaderboard and the integrity stats into globals.

     Populates the module-level ``public_test_set``, ``complete_test_set``,
     ``current_leaderboard`` and ``test_set_stats`` and prints a short summary.

     Returns:
         bool: True when every step succeeded, False if anything raised (the
         traceback is printed in that case).
     """
     global public_test_set, complete_test_set, current_leaderboard, test_set_stats
     try:
         print("🔬 Initializing SALT Translation Leaderboard - Scientific Edition...")

         # Test sets
         print("📥 Loading scientific test sets...")
         public_test_set = get_public_test_set_scientific()
         complete_test_set = get_complete_test_set_scientific()

         # Leaderboard
         print("🏆 Loading scientific leaderboard...")
         current_leaderboard = load_scientific_leaderboard()

         # Integrity check
         print("🔍 Validating test set integrity...")
         test_set_stats = validate_test_set_integrity_scientific()

         # Summary: each value is computed right before its line is printed so
         # the output order matches the order in which a failure could occur.
         print("✅ Scientific initialization complete!")
         sample_count = len(public_test_set)
         print(f"   - Test set: {sample_count:,} samples")
         integrity = test_set_stats.get("integrity_score", 0)
         print(f"   - Integrity score: {integrity:.2f}")
         adequacy_section = test_set_stats.get("scientific_adequacy", {})
         overall_adequacy = adequacy_section.get("overall_adequacy", "unknown")
         print(f"   - Scientific adequacy: {overall_adequacy}")
         model_count = len(current_leaderboard)
         print(f"   - Current models: {model_count}")
     except Exception as exc:
         print(f"❌ Scientific initialization failed: {exc}")
         traceback.print_exc()
         return False
     return True
 def download_scientific_test_set() -> Tuple[str, str]:
     """Create the downloadable test set and describe it in markdown.

     Returns:
         A ``(download_path, info_markdown)`` pair. If anything raises, the pair
         is ``(None, error_message)`` instead.
     """
     global public_test_set
     try:
         if public_test_set is None:
             public_test_set = get_public_test_set_scientific()

         # Build the file; ``stats`` feeds the message below.
         download_path, stats = create_test_set_download_scientific()

         emoji_by_adequacy = {
             "excellent": "🟢",
             "good": "🟡",
             "fair": "🟠",
             "insufficient": "🔴",
             "unknown": "⚪",
         }
         adequacy = stats.get("adequacy_assessment", "unknown")
         # NOTE(review): not referenced by the message assembled below —
         # confirm whether it is still needed.
         adequacy_emoji = emoji_by_adequacy.get(adequacy, "⚪")

         sections = [
             """
 ## 📥 SALT Scientific Test Set Downloaded Successfully!
 ### 🏁 Track Breakdown:
 """
         ]

         # One entry per track reported in the stats.
         for track_name, track_info in stats.get("track_breakdown", {}).items():
             if track_info.get("statistical_adequacy", False):
                 status_emoji = "✅"
             else:
                 status_emoji = "⚠️"
             sections.append(
                 f"""
 **{status_emoji} {track_info.get('name', track_name)}**:
 - Samples: {track_info.get('total_samples', 0):,}
 - Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
 - Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
 """
             )

         sections.append(
             """
 ### 📋 Enhanced File Format:
 - Provide detailed model description for proper categorization
 - Consider submitting to multiple tracks for comprehensive evaluation
         """
         )
         return download_path, "".join(sections)
     except Exception as exc:
         return None, f"❌ Error creating scientific test set download: {str(exc)}"
 def validate_scientific_submission(
     file, model_name: str, author: str, description: str
 ) -> Tuple[str, Optional[pd.DataFrame], str]:
     """Validate uploaded prediction file with scientific rigor."""
     try:
         if file is None:
             return "❌ Please upload a predictions file", None, "community"
         )
         detected_category = validation_result.get("category", "community")
         if validation_result["valid"]:
+            return (
+                validation_result["report"],
+                validation_result["predictions"],
+                detected_category,
+            )
         else:
             return validation_result["report"], None, detected_category
         return (
             f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
             None,
+            "community",
         )
 def evaluate_scientific_submission(
     predictions_df: pd.DataFrame,
     model_name: str,
     validation_info: Dict,
 ) -> Tuple[str, pd.DataFrame, object, object]:
     """Evaluate validated predictions using scientific methodology."""
     try:
         if predictions_df is None:
             return "❌ No valid predictions to evaluate", None, None, None
         # Get complete test set with targets
         global complete_test_set, current_leaderboard
         if complete_test_set is None:
             complete_test_set = get_complete_test_set_scientific()
         # Run scientific evaluation across all tracks
         print(f"🔬 Starting scientific evaluation for {model_name}...")
         evaluation_results = evaluate_predictions_scientific(
             predictions_df, complete_test_set, detected_category
         )
+        if any(
+            track_data.get("error")
+            for track_data in evaluation_results.get("tracks", {}).values()
+        ):
+            errors = [
+                track_data["error"]
+                for track_data in evaluation_results["tracks"].values()
+                if track_data.get("error")
+            ]
             return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
         # Add to scientific leaderboard
         print("🏆 Adding to scientific leaderboard...")
         updated_leaderboard = add_model_to_scientific_leaderboard(
             author=author or "Anonymous",
             evaluation_results=evaluation_results,
             model_category=detected_category,
+            description=description or "",
         )
         # Update global leaderboard
         current_leaderboard = updated_leaderboard
         # Generate scientific report
         report = generate_scientific_report(evaluation_results, model_name)
         # Create visualizations
         summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
         cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
         # Prepare display leaderboard (Google-comparable track by default)
+        google_leaderboard = get_track_leaderboard(
+            updated_leaderboard, "google_comparable"
+        )
+        display_leaderboard = prepare_track_leaderboard_display(
+            google_leaderboard, "google_comparable"
+        )
         # Format success message with track-specific results
         success_msg = f"""
 ## 🎉 Scientific Evaluation Complete!
 ### 🏆 Track Performance Summary:
 """
+        tracks = evaluation_results.get("tracks", {})
         for track_name, track_data in tracks.items():
+            if not track_data.get("error"):
                 track_config = EVALUATION_TRACKS[track_name]
+                track_averages = track_data.get("track_averages", {})
+                summary = track_data.get("summary", {})
                 # Get rank in this track
+                track_leaderboard = get_track_leaderboard(
+                    updated_leaderboard, track_name
+                )
                 if not track_leaderboard.empty:
+                    model_row = track_leaderboard[
+                        track_leaderboard["model_name"]
+                        == sanitize_model_name(model_name)
+                    ]
                     rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
                     total_models = len(track_leaderboard)
                 else:
                     rank = "N/A"
                     total_models = 0
+                quality_score = track_averages.get("quality_score", 0)
+                bleu_score = track_averages.get("bleu", 0)
+                samples = summary.get("total_samples", 0)
                 success_msg += f"""
 **🏁 {track_config['name']}**:
 - Rank: #{rank} out of {total_models} models
 - BLEU: {bleu_score:.2f}
 - Samples: {samples:,}
 """
         success_msg += f"""
 ### 🔬 Scientific Adequacy:
 {report}
         """
         return success_msg, display_leaderboard, summary_plot, cross_track_plot
     except Exception as e:
         error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
         return error_msg, None, None, None
 def refresh_track_leaderboard(
     track: str,
     search_query: str = "",
     category_filter: str = "all",
     min_adequacy: float = 0.0,
     show_ci: bool = True,
 ) -> Tuple[pd.DataFrame, object, object, str]:
     """Build the table, plots and statistics text for one evaluation track.

     Args:
         track: Key into ``EVALUATION_TRACKS``.
         search_query: Free text matched case-insensitively, as a literal
             substring, against the ``model_name`` and ``author`` columns.
         category_filter: Forwarded to ``get_track_leaderboard``.
         min_adequacy: Forwarded to ``get_track_leaderboard``.
         show_ci: Accepted for interface compatibility; not read by this
             function.

     Returns:
         ``(display_df, ranking_plot, comparison_plot, stats_markdown)``. On
         failure: an empty DataFrame, two ``None`` plots and an error message.
     """
     try:
         global current_leaderboard
         if current_leaderboard is None:
             current_leaderboard = load_scientific_leaderboard()
         # Get track-specific leaderboard
         track_leaderboard = get_track_leaderboard(
             current_leaderboard,
             track,
             category_filter=category_filter,
             min_adequacy=min_adequacy,
         )
         # Apply search filter
         if search_query:
             query_lower = search_query.lower()
             # regex=False: the query is user-typed text, not a pattern. With the
             # default regex matching, input such as "c++" or "(" raises or
             # matches the wrong rows.
             name_hits = track_leaderboard["model_name"].str.lower().str.contains(
                 query_lower, na=False, regex=False
             )
             author_hits = track_leaderboard["author"].str.lower().str.contains(
                 query_lower, na=False, regex=False
             )
             track_leaderboard = track_leaderboard[name_hits | author_hits]
         # Prepare for display
         display_df = prepare_track_leaderboard_display(track_leaderboard, track)
         # Create plots
         ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
         comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
         # Get track statistics
         track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
         track_config = EVALUATION_TRACKS[track]
         stats_text = f"""
 ### 📊 {track_config['name']} Statistics
 - Statistical adequacy verified for reliable comparisons
 - {track_config['description']}
         """
         return display_df, ranking_plot, comparison_plot, stats_text
     except Exception as e:
         # UI boundary: surface the problem in the stats panel instead of raising.
         error_msg = f"Error loading {track} leaderboard: {str(e)}"
         empty_df = pd.DataFrame()
         return empty_df, None, None, error_msg
+def get_scientific_model_details(
+    model_name: str, track: str
+) -> Tuple[str, object, object]:
     """Get detailed scientific analysis for a specific model."""
     try:
         global current_leaderboard
         if current_leaderboard is None:
             return "Leaderboard not loaded", None, None
         # Find model
+        model_row = current_leaderboard[current_leaderboard["model_name"] == model_name]
         if model_row.empty:
             return f"Model '{model_name}' not found", None, None
         model_info = model_row.iloc[0]
         # Parse detailed metrics for the requested track
         try:
+            detailed_results = json.loads(model_info[f"detailed_{track}"])
         except:
             detailed_results = {}
         # Create detailed plots
+        detail_plot = create_scientific_model_detail_plot(
+            detailed_results, model_name, track
+        )
         # Create language pair heatmap
         heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
         # Format model details with scientific information
         track_config = EVALUATION_TRACKS[track]
+        category_info = MODEL_CATEGORIES.get(model_info["model_category"], {})
         # Extract track-specific metrics
         quality_col = f"{track}_quality"
         bleu_col = f"{track}_bleu"
         samples_col = f"{track}_samples"
         pairs_col = f"{track}_pairs"
         adequate_col = f"{track}_adequate"
         details_text = f"""
 ## 🔬 Scientific Model Analysis: {model_name}
 ### 📈 Cross-Track Performance:
 """
         # Add other track performances for comparison
         for other_track in EVALUATION_TRACKS.keys():
             if other_track != track:
                 other_quality_col = f"{other_track}_quality"
                 other_adequate_col = f"{other_track}_adequate"
                 if model_info.get(other_adequate_col, False):
                     other_quality = model_info.get(other_quality_col, 0)
                     details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
                 else:
                     details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
         details_text += f"""
 ### 💡 Scientific Interpretation:
 - Cross-track analysis reveals model strengths across different language sets
 - Category classification helps contextualize performance expectations
         """
         return details_text, detail_plot, heatmap_plot
     except Exception as e:
         error_msg = f"Error getting model details: {str(e)}"
         return error_msg, None, None
 def perform_model_comparison(
     model_names: List[str], track: str, comparison_type: str = "statistical"
 ) -> Tuple[str, object]:
     """Compare the selected leaderboard models on one track.

     Args:
         model_names: Names matched against the ``model_name`` column.
         track: Key into ``EVALUATION_TRACKS``; prefixes the metric columns.
         comparison_type: ``"statistical"`` selects the statistical plot; any
             other value selects the category plot.

     Returns:
         ``(markdown_report, plot)``. On an early exit or a failure the first
         element is the message and the plot is ``None``.
     """
     global current_leaderboard
     try:
         if current_leaderboard is None:
             return "Leaderboard not loaded", None
         if len(model_names) < 2:
             return "Please select at least 2 models for comparison", None

         # Rows for the requested models
         name_filter = current_leaderboard["model_name"].isin(model_names)
         selected = current_leaderboard[name_filter]
         if len(selected) < 2:
             return "Selected models not found in leaderboard", None

         # Fair comparison across the whole leaderboard
         comparison_result = perform_fair_comparison(current_leaderboard, model_names)
         if comparison_result.get("error"):
             return f"Comparison error: {comparison_result['error']}", None

         # Plot matching the requested comparison type
         plot_builder = (
             create_statistical_comparison_plot
             if comparison_type == "statistical"
             else create_category_comparison_plot
         )
         comparison_plot = plot_builder(selected, track)

         # Report header
         track_config = EVALUATION_TRACKS[track]
         report_parts = [
             """
 ### 📊 Models Compared:
 """
         ]
         quality_col = f"{track}_quality"
         ci_lower_col = f"{track}_ci_lower"
         ci_upper_col = f"{track}_ci_upper"

         # Best quality score first
         ranked = selected.sort_values(quality_col, ascending=False)
         for position, (_, row) in enumerate(ranked.iterrows(), 1):
             category_info = MODEL_CATEGORIES.get(row["model_category"], {})
             report_parts.append(
                 f"""
 **#{position}. {row['model_name']}**
 - Category: {category_info.get('name', 'Unknown')}
 - Quality Score: {format_metric_value(row[quality_col], 'quality_score', True, row[ci_lower_col], row[ci_upper_col])}
 - Author: {row['author']}
 """
             )

         # Statistical notes for this track, when the comparison produced any
         track_comparison = comparison_result.get("track_comparisons", {}).get(track, {})
         if track_comparison:
             report_parts.append(
                 f"""
 - **Confidence intervals available**: Yes (95% level)
 - **Fair comparison possible**: {'✅ Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
 """
             )
             # Simplified significance check: spread of the quality scores
             quality_scores = list(track_comparison.get("quality_scores", {}).values())
             if len(quality_scores) >= 2:
                 spread = max(quality_scores) - min(quality_scores)
                 if spread > 0.05:  # 5% difference threshold
                     report_parts.append(
                         "- **Performance differences**: Potentially significant\n"
                     )
                 else:
                     report_parts.append("- **Performance differences**: Minimal\n")

         # Recommendations
         recommendations = comparison_result.get("recommendations", [])
         if recommendations:
             report_parts.append("\n### 💡 Recommendations:\n")
             report_parts.extend(f"- {rec}\n" for rec in recommendations)

         return "".join(report_parts), comparison_plot
     except Exception as exc:
         return f"Error performing comparison: {str(exc)}", None
 # Initialize data on startup
 print("🚀 Starting SALT Translation Leaderboard - Scientific Edition...")
 initialization_success = initialize_scientific_data()
     .adequacy-good { border-left-color: #eab308; }
     .adequacy-fair { border-left-color: #f97316; }
     .adequacy-insufficient { border-left-color: #ef4444; }
+    """,
 ) as demo:
     # Scientific Header
+    gr.HTML(
+        f"""
     <div class="scientific-header">
     <h1>🏆 SALT Translation Leaderboard - Scientific Edition</h1>
     <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
     <p>Three-tier evaluation tracks • 95% Confidence intervals • Research-grade analysis</p>
     <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
     </div>
+    """
+    )
     # Status indicator
     if initialization_success:
         status_msg = "✅ Scientific system initialized successfully"
+        adequacy_info = test_set_stats.get("scientific_adequacy", {}).get(
+            "overall_adequacy", "unknown"
+        )
         status_msg += f" | Test set adequacy: {adequacy_info.title()}"
     else:
         status_msg = "❌ System initialization failed - some features may not work"
     gr.Markdown(f"**System Status**: {status_msg}")
     # Add scientific overview
+    gr.Markdown(
+        """
     ## 🔬 Scientific Evaluation Framework
     This leaderboard implements rigorous scientific methodology for translation model evaluation:
     - **Statistical Significance**: 95% confidence intervals and effect size analysis
     - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
     - **Cross-Track Consistency**: Validate model performance across language sets
+    """
+    )
     with gr.Tabs():
         # Tab 1: Download Test Set
         with gr.Tab("📥 Download Test Set", id="download"):
+            gr.Markdown(
+                """
             ## 📋 Get the SALT Scientific Test Set
             Download our scientifically designed test set with stratified sampling and statistical weighting.
+            """
+            )
             with gr.Row():
+                download_btn = gr.Button(
+                    "📥 Download Scientific Test Set", variant="primary", size="lg"
+                )
             with gr.Row():
                 with gr.Column():
                     download_file = gr.File(label="📂 Test Set File", interactive=False)
                 with gr.Column():
                     download_info = gr.Markdown(label="ℹ️ Test Set Information")
+        # Tab 2: Submit Predictions
         with gr.Tab("🚀 Submit Predictions", id="submit"):
+            gr.Markdown(
+                """
             ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
             Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
+            """
+            )
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.Markdown("### 📝 Model Information")
                     model_name_input = gr.Textbox(
                         label="🤖 Model Name",
                         placeholder="e.g., MyTranslator-v2.0",
+                        info="Unique name for your model",
                     )
                     author_input = gr.Textbox(
+                        label="👤 Author/Organization",
                         placeholder="Your name or organization",
+                        value="Anonymous",
                     )
                     description_input = gr.Textbox(
                         label="📄 Model Description",
                         placeholder="Architecture, training data, special features...",
                         lines=4,
+                        info="Detailed description helps with proper categorization",
                     )
                     gr.Markdown("### 📤 Upload Predictions")
                     predictions_file = gr.File(
                         label="📂 Predictions File",
+                        file_types=[".csv", ".tsv", ".json"],
                     )
+                    validate_btn = gr.Button(
+                        "✅ Validate Submission", variant="secondary"
+                    )
+                    submit_btn = gr.Button(
+                        "🚀 Submit for Scientific Evaluation",
+                        variant="primary",
+                        interactive=False,
+                    )
                 with gr.Column(scale=1):
                     gr.Markdown("### 📊 Validation Results")
                     validation_output = gr.Markdown()
             # Results section
             gr.Markdown("### 🏆 Scientific Evaluation Results")
             with gr.Row():
                 evaluation_output = gr.Markdown()
             with gr.Row():
                 with gr.Column():
                     submission_plot = gr.Plot(label="📈 Submission Analysis")
                 with gr.Column():
                     cross_track_plot = gr.Plot(label="🔄 Cross-Track Analysis")
             with gr.Row():
+                results_table = gr.Dataframe(
+                    label="📊 Updated Leaderboard (Google-Comparable Track)",
+                    interactive=False,
+                )
         # Tab 3: Google-Comparable Track
+        with gr.Tab(
+            "🤖 Google-Comparable Track",
+            id="google_track",
+            elem_classes=["track-tab", "google-comparable"],
+        ):
+            gr.Markdown(
+                f"""
             ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
             **Fair comparison with commercial translation systems**
             - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
             - **Purpose**: Commercial system comparison and baseline establishment
             - **Statistical Power**: High (optimized sample sizes)
+            """
+            )
             with gr.Row():
                 with gr.Column(scale=2):
+                    google_search = gr.Textbox(
+                        label="🔍 Search Models",
+                        placeholder="Search by model name, author...",
+                    )
                 with gr.Column(scale=1):
                     google_category = gr.Dropdown(
                         label="🏷️ Category Filter",
                         choices=["all"] + list(MODEL_CATEGORIES.keys()),
+                        value="all",
                     )
                 with gr.Column(scale=1):
                     google_adequacy = gr.Slider(
                         label="📊 Min Adequacy",
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.0,
+                        step=0.1,
                     )
                 with gr.Column(scale=1):
                     google_refresh = gr.Button("🔄 Refresh", variant="secondary")
             with gr.Row():
                 google_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
                     google_ranking_plot = gr.Plot(label="🏆 Google-Comparable Rankings")
                 with gr.Column():
                     google_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
+                google_leaderboard = gr.Dataframe(
+                    label="📈 Google-Comparable Leaderboard", interactive=False
+                )
         # Tab 4: UG40-Complete Track
         # Layout, top to bottom: intro Markdown, a filter row (search, category,
         # minimum adequacy, refresh button), a stats Markdown, two side-by-side
         # plots, and the leaderboard table.
+        with gr.Tab(
+            "🌍 UG40-Complete Track",
+            id="ug40_track",
+            elem_classes=["track-tab", "ug40-complete"],
+        ):
             # Intro text: the heading comes from UI_CONFIG and the language count
             # from len(ALL_UG40_LANGUAGES).
+            gr.Markdown(
+                f"""
             ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
             **Comprehensive evaluation across all Ugandan languages**
             - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
             - **Purpose**: Comprehensive Ugandan language capability assessment
             - **Coverage**: Complete linguistic landscape of Uganda
+            """
+            )
             # Filter controls; their values are the inputs of the ug40_refresh click handler.
             with gr.Row():
                 with gr.Column(scale=2):
+                    ug40_search = gr.Textbox(
+                        label="🔍 Search Models",
+                        placeholder="Search by model name, author...",
+                    )
                 with gr.Column(scale=1):
                     # Choices are "all" followed by every key of MODEL_CATEGORIES.
                     ug40_category = gr.Dropdown(
                         label="🏷️ Category Filter",
                         choices=["all"] + list(MODEL_CATEGORIES.keys()),
+                        value="all",
                     )
                 with gr.Column(scale=1):
                     ug40_adequacy = gr.Slider(
                         label="📊 Min Adequacy",
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.0,
+                        step=0.1,
                     )
                 with gr.Column(scale=1):
                     ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")
             # Output components; created without values and listed as the outputs
             # of the ug40_refresh click handler.
             with gr.Row():
                 ug40_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
                     ug40_ranking_plot = gr.Plot(label="🏆 UG40-Complete Rankings")
                 with gr.Column():
                     ug40_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
+                ug40_leaderboard = gr.Dataframe(
+                    label="📈 UG40-Complete Leaderboard", interactive=False
+                )
         # Tab 5: Language-Pair Matrix
         # Same layout as the other track tabs: intro Markdown, filter row, stats
         # Markdown, two side-by-side plots, and the leaderboard table.
+        with gr.Tab(
+            "📊 Language-Pair Matrix",
+            id="matrix_track",
+            elem_classes=["track-tab", "language-pair-matrix"],
+        ):
             # Intro text: only the heading is interpolated (from UI_CONFIG).
+            gr.Markdown(
+                f"""
             ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
             **Detailed language pair analysis with statistical significance**
             - **Resolution**: Individual language pair performance
             - **Purpose**: Detailed linguistic analysis and model diagnostics
             - **Statistics**: Pairwise significance testing available
+            """
+            )
             # Filter controls; their values are the inputs of the matrix_refresh click handler.
             with gr.Row():
                 with gr.Column(scale=2):
+                    matrix_search = gr.Textbox(
+                        label="🔍 Search Models",
+                        placeholder="Search by model name, author...",
+                    )
                 with gr.Column(scale=1):
                     # Choices are "all" followed by every key of MODEL_CATEGORIES.
                     matrix_category = gr.Dropdown(
                         label="🏷️ Category Filter",
                         choices=["all"] + list(MODEL_CATEGORIES.keys()),
+                        value="all",
                     )
                 with gr.Column(scale=1):
                     matrix_adequacy = gr.Slider(
                         label="📊 Min Adequacy",
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.0,
+                        step=0.1,
                     )
                 with gr.Column(scale=1):
                     matrix_refresh = gr.Button("🔄 Refresh", variant="secondary")
             # Output components; created without values and listed as the outputs
             # of the matrix_refresh click handler.
             with gr.Row():
                 matrix_stats = gr.Markdown()
             with gr.Row():
                 with gr.Column():
+                    matrix_ranking_plot = gr.Plot(
+                        label="🏆 Language-Pair Matrix Rankings"
+                    )
                 with gr.Column():
                     matrix_comparison_plot = gr.Plot(label="📊 Statistical Comparison")
             with gr.Row():
+                matrix_leaderboard = gr.Dataframe(
+                    label="📈 Language-Pair Matrix Leaderboard", interactive=False
+                )
         # Tab 6: Model Analysis
         # Layout: intro Markdown, a selector row (model, track, Analyze button),
         # a details Markdown, and two side-by-side plots.
         with gr.Tab("🔍 Scientific Model Analysis", id="analysis"):
+            gr.Markdown(
+                """
             ## 🔬 Detailed Scientific Model Analysis
             Comprehensive analysis of individual models with statistical confidence intervals,
             cross-track performance, and detailed language pair breakdowns.
+            """
+            )
             with gr.Row():
                 with gr.Column(scale=2):
                     # Starts with no choices; load_initial_data returns a Dropdown
                     # with model names for this component when the app loads.
                     model_select = gr.Dropdown(
                         label="🤖 Select Model",
                         choices=[],
                         value=None,
+                        info="Choose a model for detailed scientific analysis",
                     )
                 with gr.Column(scale=1):
                     # Choices are the keys of EVALUATION_TRACKS.
                     track_select = gr.Dropdown(
                         label="🏁 Analysis Track",
                         choices=list(EVALUATION_TRACKS.keys()),
                         value="google_comparable",
+                        info="Track for detailed analysis",
                     )
                 with gr.Column(scale=1):
                     analyze_btn = gr.Button("🔍 Analyze", variant="primary")
             # Output components; listed as the outputs of the analyze_btn click handler.
             with gr.Row():
                 model_details = gr.Markdown()
             with gr.Row():
                 with gr.Column():
+                    model_analysis_plot = gr.Plot(
+                        label="📊 Detailed Performance Analysis"
+                    )
                 with gr.Column():
                     model_heatmap_plot = gr.Plot(label="🗺️ Language Pair Heatmap")
         # Tab 7: Model Comparison
         # Layout: intro Markdown, a selector row (model checkboxes on the left;
         # track, comparison type and Compare button on the right), then a
         # Markdown result and a single plot.
         with gr.Tab("⚖️ Scientific Model Comparison", id="comparison"):
+            gr.Markdown(
+                """
             ## 🔬 Scientific Model Comparison
             Compare multiple models with statistical significance testing and fair comparison analysis.
             Only models evaluated on the same language pairs are compared for scientific validity.
+            """
+            )
             with gr.Row():
                 with gr.Column(scale=2):
                     # Starts with no choices; load_initial_data returns a CheckboxGroup
                     # with model names for this component when the app loads.
                     comparison_models = gr.CheckboxGroup(
                         label="🤖 Select Models to Compare",
                         choices=[],
                         value=[],
+                        info="Select 2-6 models for comparison",
                     )
                 with gr.Column(scale=1):
                     # Choices are the keys of EVALUATION_TRACKS.
                     comparison_track = gr.Dropdown(
                         label="🏁 Comparison Track",
                         choices=list(EVALUATION_TRACKS.keys()),
+                        value="google_comparable",
                     )
                     comparison_type = gr.Radio(
                         label="📊 Comparison Type",
                         choices=["statistical", "category"],
+                        value="statistical",
                     )
                     compare_btn = gr.Button("⚖️ Compare Models", variant="primary")
             # Output components; listed as the outputs of the compare_btn click handler.
             with gr.Row():
                 comparison_output = gr.Markdown()
             with gr.Row():
                 comparison_plot = gr.Plot(label="📊 Model Comparison Analysis")
         # Tab 8: Documentation
         # A single static Markdown page; this tab has no inputs or event handlers here.
         # NOTE(review): the contact address at the end of the text below reads as a
         # redacted placeholder rather than a real e-mail address — confirm the
         # intended address against the deployed file.
         with gr.Tab("📚 Scientific Documentation", id="docs"):
+            gr.Markdown(
+                f"""
             # 📖 SALT Translation Leaderboard - Scientific Edition Documentation
             ## 🎯 Overview
             ---
             *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
+            """
+            )
     # Event handlers with enhanced scientific functionality
     # State holders written by the validate handler and read by the submit
     # handler: the validated predictions, the validation-info dict, and the
     # detected category (initially "community").
     predictions_validated = gr.State(value=None)
     validation_info_state = gr.State(value=None)
     detected_category_state = gr.State(value="community")
     # Download test set
     # No inputs; the handler's results go to download_file and download_info.
     download_btn.click(
+        fn=download_scientific_test_set, outputs=[download_file, download_info]
     )
     # Validate predictions
     def handle_scientific_validation(file, model_name, author, description):
         """Validate an uploaded submission and decide whether it may be evaluated.

         Delegates to ``validate_scientific_submission`` and returns, in order:
         the validation report, the predictions it produced, a validation-info
         dict (``category`` and ``validation_passed``), the category, and a
         ``gr.update`` that sets the submit button's interactivity.
         """
         validation_report, validated_predictions, detected_category = (
             validate_scientific_submission(file, model_name, author, description)
         )

         # Evaluable when predictions are available (this keeps the
         # "can be evaluated with limitations" cases) and the report does not
         # carry the failing final verdict.
         failing_verdict = "❌ **Final Verdict**: Please address issues"
         submission_allowed = (
             validated_predictions is not None
             and failing_verdict not in validation_report
         )

         validation_info = {
             "category": detected_category,
             "validation_passed": submission_allowed,
         }
         return (
             validation_report,
             validated_predictions,
             validation_info,
             detected_category,
             gr.update(interactive=submission_allowed),
         )
     # Wire the validate button: the five outputs line up with the 5-tuple
     # returned by handle_scientific_validation (report, predictions state,
     # validation-info state, category state, submit button update).
     validate_btn.click(
         fn=handle_scientific_validation,
         inputs=[predictions_file, model_name_input, author_input, description_input],
+        outputs=[
+            validation_output,
+            predictions_validated,
+            validation_info_state,
+            detected_category_state,
+            submit_btn,
+        ],
     )
     # Submit for evaluation
     def handle_scientific_submission(
         predictions, model_name, author, description, category, validation_info
     ):
         """Evaluate a previously validated submission.

         Args:
             predictions: Predictions stored by the validation step, or ``None``
                 when nothing has been validated yet.
             model_name: Model name entered in the submission form.
             author: Author entered in the submission form.
             description: Description entered in the submission form.
             category: Category stored by the validation step.
             validation_info: Dict stored by the validation step (carries
                 ``validation_passed``), or ``None``.

         Returns:
             Four values, one per output of the submit button. When the
             submission is refused, the first value is an error message and the
             remaining three are ``None``; otherwise the result of
             ``evaluate_scientific_submission`` is returned unchanged.
         """
         if predictions is None:
             return "❌ Please validate your submission first", None, None, None

         # The submit button is only disabled in the UI, and predictions are kept
         # in state even when validation fails. Re-check the recorded verdict so
         # stale state or a direct API call cannot evaluate a rejected
         # submission. Missing verdict information is treated as a pass so
         # callers that do not record one keep working.
         if isinstance(validation_info, dict) and not validation_info.get(
             "validation_passed", True
         ):
             return (
                 "❌ Validation did not pass. Please address the reported issues "
                 "and validate again",
                 None,
                 None,
                 None,
             )

         return evaluate_scientific_submission(
             predictions, model_name, author, description, category, validation_info
         )
     # Wire the submit button: inputs follow the parameter order of
     # handle_scientific_submission, mixing form fields with the state values
     # saved by validation; the handler's four results feed the four outputs.
     submit_btn.click(
         fn=handle_scientific_submission,
+        inputs=[
+            predictions_validated,
+            model_name_input,
+            author_input,
+            description_input,
+            detected_category_state,
+            validation_info_state,
+        ],
+        outputs=[evaluation_output, results_table, submission_plot, cross_track_plot],
     )
     # Track leaderboard refresh functions
     def refresh_google_track(*filter_values):
         """Refresh the ``google_comparable`` track view.

         All positional arguments are forwarded unchanged to
         ``refresh_track_leaderboard`` after the track key.
         """
         track_key = "google_comparable"
         return refresh_track_leaderboard(track_key, *filter_values)
     def refresh_ug40_track(*filter_values):
         """Refresh the ``ug40_complete`` track view.

         All positional arguments are forwarded unchanged to
         ``refresh_track_leaderboard`` after the track key.
         """
         track_key = "ug40_complete"
         return refresh_track_leaderboard(track_key, *filter_values)
     def refresh_matrix_track(*filter_values):
         """Refresh the ``language_pair_matrix`` track view.

         All positional arguments are forwarded unchanged to
         ``refresh_track_leaderboard`` after the track key.
         """
         track_key = "language_pair_matrix"
         return refresh_track_leaderboard(track_key, *filter_values)
     # Google-Comparable Track
     # Each track's Refresh button passes its search text, category and minimum
     # adequacy to the track wrapper; outputs are listed in the order
     # leaderboard table, ranking plot, comparison plot, stats Markdown.
     google_refresh.click(
         fn=refresh_google_track,
         inputs=[google_search, google_category, google_adequacy],
+        outputs=[
+            google_leaderboard,
+            google_ranking_plot,
+            google_comparison_plot,
+            google_stats,
+        ],
     )
     # UG40-Complete Track
     ug40_refresh.click(
         fn=refresh_ug40_track,
         inputs=[ug40_search, ug40_category, ug40_adequacy],
+        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats],
     )
     # Language-Pair Matrix Track
     matrix_refresh.click(
         fn=refresh_matrix_track,
         inputs=[matrix_search, matrix_category, matrix_adequacy],
+        outputs=[
+            matrix_leaderboard,
+            matrix_ranking_plot,
+            matrix_comparison_plot,
+            matrix_stats,
+        ],
     )
     # Model analysis
     # Inputs: selected model and track. Outputs: details Markdown and two plots.
     analyze_btn.click(
         fn=get_scientific_model_details,
         inputs=[model_select, track_select],
+        outputs=[model_details, model_analysis_plot, model_heatmap_plot],
     )
     # Model comparison
     # Inputs: selected models, track and comparison type. Outputs: Markdown and one plot.
     compare_btn.click(
         fn=perform_model_comparison,
         inputs=[comparison_models, comparison_track, comparison_type],
+        outputs=[comparison_output, comparison_plot],
     )
     # Load initial data and update dropdowns
     def load_initial_data():
         """Build the values shown when the app first loads.

         Returns six values: the first four results of the Google track
         refresh, then a ``gr.Dropdown`` and a ``gr.CheckboxGroup`` whose
         choices are the model names in ``current_leaderboard``.
         """
         # Google track with no search text, the "all" category and a 0.0
         # adequacy floor.
         initial_google_view = refresh_google_track("", "all", 0.0)

         # Model names for the analysis and comparison selectors; empty when
         # there is no leaderboard or it has no rows.
         leaderboard_has_rows = (
             current_leaderboard is not None and not current_leaderboard.empty
         )
         model_choices = (
             current_leaderboard["model_name"].tolist() if leaderboard_has_rows else []
         )

         leaderboard_table = initial_google_view[0]
         ranking_plot = initial_google_view[1]
         comparison_plot = initial_google_view[2]
         stats_markdown = initial_google_view[3]
         model_dropdown = gr.Dropdown(choices=model_choices)
         comparison_checkboxes = gr.CheckboxGroup(choices=model_choices)
         return (
             leaderboard_table,
             ranking_plot,
             comparison_plot,
             stats_markdown,
             model_dropdown,
             comparison_checkboxes,
         )
     # On app load, run load_initial_data; its six return values map, in order,
     # onto the Google track components and the two model selectors below.
     demo.load(
         fn=load_initial_data,
         outputs=[
+            google_leaderboard,
+            google_ranking_plot,
+            google_comparison_plot,
+            google_stats,
+            model_select,
+            comparison_models,
+        ],
     )
 # Launch the scientific application
 # Runs only when the file is executed as a script: serves on 0.0.0.0:7860
 # with share disabled and show_error enabled.
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)