Spaces:

akera
/

leaderboard

Running

App Files Files Community

akera committed on Jun 12

Commit

e32fdda

verified ·

1 Parent(s): cfbcff1

Update app.py

Browse files

Files changed (1) hide show

app.py +591 -257

app.py CHANGED Viewed

@@ -1,200 +1,321 @@
 # app.py
 import gradio as gr
 import pandas as pd
-import matplotlib.pyplot as plt
-from datasets import load_dataset
-import yaml
 import json
-import torch
-from datetime import datetime
 import traceback
 # Import our modules
-from src.model_loader import load_model, get_model_info
-from src.evaluation import evaluate_model_full
-from src.leaderboard import load_leaderboard, add_model_results, get_leaderboard_summary, search_models
-from src.plotting import create_leaderboard_plot, create_detailed_comparison_plot, create_summary_metrics_plot
-from src.utils import validate_model_path, get_model_type, sanitize_input
 from config import *
 # Global variables for caching
 current_leaderboard = None
-test_data = None
-def load_salt_data():
-    """Load SALT dataset for evaluation."""
-    global test_data
-    if test_data is not None:
-        return test_data
     try:
-        print("Loading SALT dataset...")
-        # Configuration for SALT dataset
-        dataset_config = f'''
-        huggingface_load:
-          path: {SALT_DATASET}
-          name: text-all
-          split: dev[:{MAX_EVAL_SAMPLES}]
-        source:
-          type: text
-          language: {SUPPORTED_LANGUAGES}
-        target:
-          type: text
-          language: {SUPPORTED_LANGUAGES}
-        src_or_tgt_languages_must_contain: eng
-        allow_same_src_and_tgt_language: False
-        '''
-        config = yaml.safe_load(dataset_config)
-        # Import salt dataset utilities
-        import salt.dataset
-        test_data = pd.DataFrame(salt.dataset.create(config))
-        print(f"Loaded {len(test_data)} evaluation samples")
-        return test_data
     except Exception as e:
-        print(f"Error loading SALT dataset: {e}")
-        # Fallback: create minimal test data
-        test_data = pd.DataFrame({
-            'source': ['Hello world', 'How are you?'],
-            'target': ['Amakuru', 'Oli otya?'],
-            'source.language': ['eng', 'eng'],
-            'target.language': ['lug', 'lug']
-        })
-        return test_data
-def refresh_leaderboard():
-    """Refresh leaderboard data."""
-    global current_leaderboard
-    current_leaderboard = load_leaderboard()
-    return current_leaderboard
-def evaluate_submission(model_path: str, author_name: str) -> tuple:
-    """Main evaluation function."""
     try:
-        # Validate inputs
-        model_path = sanitize_input(model_path)
-        author_name = sanitize_input(author_name)
-        if not model_path:
-            return "❌ Error: Model path is required", None, None, None
-        if not author_name:
-            author_name = "Anonymous"
-        if not validate_model_path(model_path):
-            return "❌ Error: Invalid model path format", None, None, None
-        # Load test data
-        test_data = load_salt_data()
-        if test_data is None or len(test_data) == 0:
-            return "❌ Error: Could not load evaluation data", None, None, None
-        # Get model info
-        print(f"Getting model info for: {model_path}")
-        model_info = get_model_info(model_path)
-        model_type = get_model_type(model_path)
-        # Load model
-        print(f"Loading model: {model_path}")
-        try:
-            model, tokenizer = load_model(model_path)
-        except Exception as e:
-            return f"❌ Error loading model: {str(e)}", None, None, None
         # Run evaluation
-        print("Starting evaluation...")
-        try:
-            detailed_metrics = evaluate_model_full(model, tokenizer, model_path, test_data)
-        except Exception as e:
-            return f"❌ Error during evaluation: {str(e)}", None, None, None
-        # Extract average metrics
-        avg_metrics = detailed_metrics.get('averages', {})
-        if not avg_metrics:
-            return "❌ Error: No metrics calculated", None, None, None
-        # Add results to leaderboard
-        print("Adding results to leaderboard...")
-        updated_leaderboard = add_model_results(
-            model_path=model_path,
-            author=author_name,
-            metrics=avg_metrics,
-            detailed_metrics=detailed_metrics,
-            evaluation_samples=len(test_data),
-            model_type=model_type
         )
         # Update global leaderboard
-        global current_leaderboard
         current_leaderboard = updated_leaderboard
-        # Create visualizations
-        leaderboard_plot = create_leaderboard_plot(updated_leaderboard, 'quality_score')
-        detailed_plot = create_detailed_comparison_plot({model_path: detailed_metrics}, [model_path])
-        # Format results message
-        results_msg = f"""
-        ✅ **Evaluation Complete!**
-        **Model:** {model_path}
-        **Author:** {author_name}
-        **Type:** {model_type}
-        **Results:**
-        - Quality Score: {avg_metrics.get('quality_score', 0):.4f}
-        - BLEU: {avg_metrics.get('bleu', 0):.2f}
-        - ChrF: {avg_metrics.get('chrf', 0):.4f}
-        - ROUGE-L: {avg_metrics.get('rougeL', 0):.4f}
-        **Ranking:** #{updated_leaderboard[updated_leaderboard['model_path'] == model_path].index[0] + 1} out of {len(updated_leaderboard)} models
         """
-        return results_msg, updated_leaderboard, leaderboard_plot, detailed_plot
     except Exception as e:
-        error_msg = f"❌ Unexpected error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-        print(error_msg)
         return error_msg, None, None, None
-def update_leaderboard_display(search_query: str = "") -> tuple:
-    """Update leaderboard display with optional search."""
-    global current_leaderboard
-    if current_leaderboard is None:
-        current_leaderboard = refresh_leaderboard()
-    # Apply search filter
-    if search_query:
-        filtered_df = search_models(current_leaderboard, search_query)
-    else:
-        filtered_df = current_leaderboard
-    # Create plots
-    leaderboard_plot = create_leaderboard_plot(filtered_df, 'quality_score')
-    summary_plot = create_summary_metrics_plot(filtered_df)
-    # Get summary stats
-    summary = get_leaderboard_summary(filtered_df)
-    summary_text = f"""
-    📊 **Leaderboard Summary**
-    - Total Models: {summary['total_models']}
-    - Average Quality Score: {summary['avg_quality_score']:.4f}
-    - Best Model: {summary['best_model']}
-    - Latest Submission: {summary['latest_submission'][:10] if summary['latest_submission'] != 'None' else 'None'}
-    """
-    return filtered_df, leaderboard_plot, summary_plot, summary_text
-# Initialize data
-print("Initializing SALT Translation Leaderboard...")
-load_salt_data()
-refresh_leaderboard()
 # Create Gradio interface
 with gr.Blocks(
@@ -202,17 +323,37 @@ with gr.Blocks(
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {
-        max-width: 1200px !important;
     }
     .main-header {
         text-align: center;
         margin-bottom: 2rem;
     }
-    .metric-display {
         background: #f8f9fa;
         padding: 1rem;
-        border-radius: 0.5rem;
         margin: 0.5rem 0;
     }
     """
 ) as demo:
@@ -225,189 +366,382 @@ with gr.Blocks(
     {DESCRIPTION}
-    **Supported Languages:** Luganda (lug), Acholi (ach), Swahili (swa), English (eng)
     </div>
     """)
     with gr.Tabs():
-        # Tab 1: Submit Model
-        with gr.Tab("🚀 Submit Model", id="submit"):
             gr.Markdown("""
-            ### Submit Your Translation Model
-            Enter a HuggingFace model path (e.g., `microsoft/DialoGPT-medium`) or use `google-translate` to benchmark against Google Translate.
-            **Supported Model Types:** Gemma, Qwen, Llama, NLLB, Google Translate
             """)
             with gr.Row():
-                with gr.Column(scale=2):
-                    model_input = gr.Textbox(
-                        label="🤗 HuggingFace Model Path",
-                        placeholder="e.g., Sunbird/gemma3-12b-ug40-merged",
-                        info="Enter the full HuggingFace model path or 'google-translate'"
                     )
                     author_input = gr.Textbox(
-                        label="👤 Author/Organization",
                         placeholder="Your name or organization",
                         value="Anonymous"
                     )
-                    submit_btn = gr.Button(
-                        "🔄 Evaluate Model",
-                        variant="primary",
-                        size="lg"
                     )
                 with gr.Column(scale=1):
-                    gr.Markdown("""
-                    **📋 Evaluation Process:**
-                    1. Model validation
-                    2. Loading model weights
-                    3. Generating translations
-                    4. Calculating metrics
-                    5. Updating leaderboard
-                    ⏱️ **Expected time:** 5-15 minutes
-                    """)
             # Results section
-            with gr.Group():
-                results_output = gr.Markdown(label="📊 Results")
-                with gr.Row():
-                    with gr.Column():
-                        results_leaderboard = gr.Dataframe(
-                            label="📈 Updated Leaderboard",
-                            interactive=False
-                        )
-                with gr.Row():
-                    results_plot = gr.Plot(label="📊 Leaderboard Ranking")
-                    detailed_plot = gr.Plot(label="🔍 Detailed Performance")
-        # Tab 2: Leaderboard
         with gr.Tab("🏆 Leaderboard", id="leaderboard"):
             with gr.Row():
-                search_input = gr.Textbox(
-                    label="🔍 Search Models",
-                    placeholder="Search by model name, author, or path...",
-                    scale=3
-                )
-                refresh_btn = gr.Button("🔄 Refresh", scale=1)
-            summary_stats = gr.Markdown(label="📊 Summary")
             with gr.Row():
                 leaderboard_table = gr.Dataframe(
-                    label="🏆 Model Rankings",
                     interactive=False,
                     wrap=True
                 )
             with gr.Row():
-                leaderboard_viz = gr.Plot(label="📊 Performance Comparison")
-                summary_viz = gr.Plot(label="📈 Top Models Summary")
-        # Tab 3: Documentation
         with gr.Tab("📚 Documentation", id="docs"):
-            gr.Markdown("""
-            ## 📖 How to Use the SALT Translation Leaderboard
-            ### 🚀 Submitting Your Model
-            1. **Prepare your model**: Ensure your model is uploaded to HuggingFace Hub
-            2. **Enter model path**: Use the format `username/model-name`
-            3. **Add your details**: Provide your name or organization
-            4. **Submit**: Click "Evaluate Model" and wait for results
-            ### 📊 Metrics Explained
-            - **Quality Score**: Combined metric (0-1, higher is better)
-            - **BLEU**: Translation quality (0-100, higher is better)
-            - **ChrF**: Character-level F-score (0-1, higher is better)
-            - **ROUGE-L**: Longest common subsequence (0-1, higher is better)
-            - **CER/WER**: Character/Word Error Rate (0-1, lower is better)
-            ### 🎯 Supported Models
-            - **Gemma**: Google's Gemma models fine-tuned for translation
-            - **Qwen**: Alibaba's Qwen models
-            - **Llama**: Meta's Llama models
-            - **NLLB**: Facebook's No Language Left Behind models
-            - **Google Translate**: Baseline comparison
-            ### 📋 Dataset Information
-            **SALT Dataset**: Sunbird AI's comprehensive translation dataset
-            - **Languages**: Luganda, Acholi, Swahili, English
-            - **Evaluation Size**: {MAX_EVAL_SAMPLES} samples
-            - **Domains**: Multiple domains including news, literature, and conversations
-            ### 🔄 API Access
-            The leaderboard data is available via HuggingFace Datasets:
-            ```python
-            from datasets import load_dataset
-            leaderboard = load_dataset("{LEADERBOARD_DATASET}")
             ```
-            ### 🤝 Contributing
             This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
-            For issues or suggestions, please contact us or submit a GitHub issue.
-            ### 📜 License & Citation
             If you use this leaderboard in your research, please cite:
-            ```
             @misc{{salt_leaderboard_2024,
-              title={{SALT Translation Leaderboard}},
               author={{Sunbird AI}},
               year={{2024}},
               url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard}}
             }}
             ```
             """)
-    # Event handlers
     submit_btn.click(
-        fn=evaluate_submission,
-        inputs=[model_input, author_input],
-        outputs=[results_output, results_leaderboard, results_plot, detailed_plot],
-        show_progress=True
     )
     refresh_btn.click(
-        fn=update_leaderboard_display,
-        inputs=[search_input],
-        outputs=[leaderboard_table, leaderboard_viz, summary_viz, summary_stats]
     )
-    search_input.change(
-        fn=update_leaderboard_display,
-        inputs=[search_input],
-        outputs=[leaderboard_table, leaderboard_viz, summary_viz, summary_stats]
     )
-    # Load initial leaderboard data
     demo.load(
-        fn=update_leaderboard_display,
-        inputs=[],
-        outputs=[leaderboard_table, leaderboard_viz, summary_viz, summary_stats]
     )
-# Launch the app
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        show_error=True
     )

 # app.py
 import gradio as gr
 import pandas as pd
 import json
 import traceback
+from datetime import datetime
+from typing import Optional, Dict, Tuple
 # Import our modules
+from src.test_set import get_public_test_set, get_complete_test_set, create_test_set_download, validate_test_set_integrity
+from src.validation import validate_submission_complete
+from src.evaluation import evaluate_predictions, generate_evaluation_report, get_google_translate_baseline
+from src.leaderboard import (
+    load_leaderboard, add_model_to_leaderboard, get_leaderboard_stats,
+    filter_leaderboard, export_leaderboard, get_model_comparison
+)
+from src.plotting import (
+    create_leaderboard_ranking_plot, create_metrics_comparison_plot,
+    create_language_pair_heatmap, create_coverage_analysis_plot,
+    create_model_performance_timeline, create_google_comparison_plot,
+    create_detailed_model_analysis, create_submission_summary_plot
+)
+from src.utils import sanitize_model_name, get_all_language_pairs, get_google_comparable_pairs
 from config import *
 # Global variables for caching
 current_leaderboard = None
+public_test_set = None
+complete_test_set = None
def initialize_data():
    """Populate the module-level test-set and leaderboard caches.

    Fills ``public_test_set``, ``complete_test_set`` and
    ``current_leaderboard`` and prints progress to stdout.

    Returns:
        ``True`` when every load step finished, ``False`` when any step
        raised (the error and its traceback are printed).
    """
    global public_test_set, complete_test_set, current_leaderboard
    try:
        print("🔄 Initializing SALT Translation Leaderboard...")

        # Test sets first, then the leaderboard.
        print("📥 Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("🏆 Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Startup summary; each line is printed as soon as it is computed.
        print("✅ Initialization complete!")
        print("   - Test set: {:,} samples".format(len(public_test_set)))
        print("   - Language pairs: {}".format(len(get_all_language_pairs())))
        print("   - Current models: {}".format(len(current_leaderboard)))
    except Exception as e:
        print("❌ Initialization failed: {}".format(e))
        traceback.print_exc()
        return False
    else:
        return True
def download_test_set() -> Tuple[Optional[str], str]:
    """Create the downloadable test-set file and a Markdown summary of it.

    Returns:
        A ``(download_path, message)`` pair. On success ``download_path`` is
        the value returned by ``create_test_set_download()`` and ``message``
        describes the file. On any failure ``download_path`` is ``None`` and
        ``message`` contains the error text.
    """
    global public_test_set
    try:
        # Make sure the module-level public test set cache is populated.
        if public_test_set is None:
            public_test_set = get_public_test_set()
        # Create download file
        download_path, stats = create_test_set_download()
        # Create info message
        info_msg = f"""
        📥 **SALT Test Set Downloaded Successfully!**
        **Dataset Statistics:**
        - **Total Samples**: {stats['total_samples']:,}
        - **Language Pairs**: {stats['language_pairs']}
        - **Google Comparable**: {stats['google_comparable_samples']:,} samples
        - **Languages**: {', '.join(stats['languages'])}
        **File Format:**
        - `sample_id`: Unique identifier for each sample
        - `source_text`: Text to be translated
        - `source_language`: Source language code
        - `target_language`: Target language code
        - `domain`: Content domain (if available)
        - `google_comparable`: Whether this pair can be compared with Google Translate
        **Next Steps:**
        1. Run your model on the source texts
        2. Create a CSV/JSON file with columns: `sample_id`, `prediction`
        3. Upload your predictions using the "Submit Predictions" tab
        """
        return download_path, info_msg
    except Exception as e:
        # Keep the full traceback in the server log; the UI only gets the
        # short message, and no file path.
        traceback.print_exc()
        error_msg = f"❌ Error creating test set download: {str(e)}"
        return None, error_msg
def validate_submission(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame]]:
    """Validate an uploaded predictions file against the complete test set.

    Args:
        file: The upload handed over by the Gradio file component. Either a
            filesystem path (``str``) or a file-like object exposing
            ``read()`` and ``name``.
        model_name: Name of the submitted model; must not be blank.
        author: Accepted for a uniform handler signature; not used here.
        description: Accepted for a uniform handler signature; not used here.

    Returns:
        ``(report, predictions)``. ``predictions`` is the value of
        ``validation_result['predictions']`` when the submission is valid and
        ``None`` otherwise (including on any error, in which case ``report``
        carries the error text and traceback).
    """
    global complete_test_set
    try:
        if file is None:
            return "❌ Please upload a predictions file", None
        # A cleared textbox may deliver None; treat it like a blank name.
        if not (model_name or "").strip():
            return "❌ Please provide a model name", None

        # Read file content. Depending on the Gradio version/configuration
        # the upload arrives as a path string or as an open temp-file object.
        if isinstance(file, str):
            filename = file
            # NOTE(review): read as bytes to mirror a binary temp-file
            # handle — confirm this is what validate_submission_complete expects.
            with open(file, "rb") as handle:
                file_content = handle.read()
        else:
            file_content = file.read()
            filename = file.name

        # Get test set for validation
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        # Validate submission
        validation_result = validate_submission_complete(
            file_content, filename, complete_test_set, model_name
        )
        if validation_result['valid']:
            return validation_result['report'], validation_result['predictions']
        return validation_result['report'], None
    except Exception as e:
        error_msg = f"❌ Validation error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None
def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    validation_info: Dict
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions and add the model to the leaderboard.

    Args:
        predictions_df: Predictions produced by the validation step, or
            ``None`` when validation did not succeed.
        model_name: Display name of the model; it is passed through
            ``sanitize_model_name`` before being stored.
        author: Submitter name; falls back to ``"Anonymous"`` when empty.
        description: Optional free-text description; stored as ``""`` when empty.
        validation_info: Validation metadata forwarded to the leaderboard
            and to the submission summary plot.

    Returns:
        ``(message, leaderboard, summary_plot, ranking_plot)``. On any
        failure the message describes the problem and the other three
        elements are ``None``.
    """
    global complete_test_set, current_leaderboard
    try:
        if predictions_df is None:
            return "❌ No valid predictions to evaluate", None, None, None

        # Get complete test set with targets
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        # Run evaluation
        print(f"🔄 Evaluating {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set)
        if evaluation_results.get('error'):
            return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None

        # Add to leaderboard
        print("🏆 Adding to leaderboard...")
        model_type = "user_submission"  # Could be enhanced to detect model type
        # Sanitize once so the stored name and the name used for the rank
        # lookup below are guaranteed to be the same string.
        leaderboard_name = sanitize_model_name(model_name)
        updated_leaderboard = add_model_to_leaderboard(
            model_name=leaderboard_name,
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            validation_info=validation_info,
            model_type=model_type,
            description=description or ""
        )

        # Update global leaderboard
        current_leaderboard = updated_leaderboard

        # Generate evaluation report
        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualization plots
        summary_plot = create_submission_summary_plot(validation_info, evaluation_results)
        ranking_plot = create_leaderboard_ranking_plot(updated_leaderboard)

        # Rank is the 1-based row position, not the index label, so a
        # sorted frame whose index was not reset still reports correctly.
        # NOTE(review): assumes rows are ordered best-first — confirm
        # against add_model_to_leaderboard.
        # A missing row yields "N/A" instead of failing the whole request
        # after the leaderboard has already been updated.
        rank = next(
            (
                position
                for position, name in enumerate(updated_leaderboard['model_name'], start=1)
                if name == leaderboard_name
            ),
            "N/A",
        )
        total_models = len(updated_leaderboard)

        # Format success message
        success_msg = f"""
        🎉 **Evaluation Complete!**
        **Your Results:**
        - **Model**: {model_name}
        - **Rank**: #{rank} out of {total_models} models
        - **Quality Score**: {evaluation_results['averages'].get('quality_score', 0):.4f}
        - **BLEU**: {evaluation_results['averages'].get('bleu', 0):.2f}
        - **ChrF**: {evaluation_results['averages'].get('chrf', 0):.4f}
        **Coverage:**
        - **Samples Evaluated**: {evaluation_results['evaluated_samples']:,}
        - **Language Pairs**: {evaluation_results['summary']['language_pairs_covered']}
        - **Google Comparable**: {evaluation_results['summary']['google_comparable_pairs']} pairs
        {report}
        """
        return success_msg, updated_leaderboard, summary_plot, ranking_plot
    except Exception as e:
        error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None
def refresh_leaderboard_display(
    search_query: str = "",
    model_type_filter: str = "all",
    min_coverage: float = 0.0,
    google_only: bool = False
) -> Tuple[pd.DataFrame, object, object, str]:
    """Filter the cached leaderboard and build the leaderboard-tab outputs.

    Loads the leaderboard into the module-level cache on first use, applies
    the given filters, and builds the ranking plot, the metrics comparison
    plot and a Markdown statistics summary for the filtered rows.

    Returns:
        ``(filtered_df, ranking_plot, comparison_plot, stats_text)``. On any
        error: an empty DataFrame, ``None`` for both plots, and the error
        message in place of the statistics text.
    """
    global current_leaderboard
    try:
        if current_leaderboard is None:
            current_leaderboard = load_leaderboard()

        visible_rows = filter_leaderboard(
            current_leaderboard,
            search_query=search_query,
            model_type=model_type_filter,
            min_coverage=min_coverage,
            google_comparable_only=google_only
        )

        # Ranking plot first, then the multi-metric comparison.
        plots = (
            create_leaderboard_ranking_plot(visible_rows),
            create_metrics_comparison_plot(visible_rows),
        )

        stats = get_leaderboard_stats(visible_rows)
        stats_text = f"""
        📊 **Leaderboard Statistics**
        - **Total Models**: {stats['total_models']}
        - **Average Quality Score**: {stats['avg_quality_score']:.4f}
        - **Google Comparable Models**: {stats['google_comparable_models']}
        **Best Model**: {stats['best_model']['name'] if stats['best_model'] else 'None'}
        **Latest Submission**: {stats['latest_submission'][:10] if stats['latest_submission'] else 'None'}
        """
    except Exception as e:
        return pd.DataFrame(), None, None, f"Error loading leaderboard: {str(e)}"
    return visible_rows, plots[0], plots[1], stats_text
def get_model_details(model_name: str) -> Tuple[str, object]:
    """Build the detail text and analysis plot for one leaderboard model.

    Args:
        model_name: Value matched exactly against the ``model_name`` column
            of the cached leaderboard.

    Returns:
        ``(details_text, detail_plot)``. When the leaderboard is not loaded,
        the model is not found, or any error occurs, the first element is a
        short explanatory message and the plot is ``None``.
    """
    global current_leaderboard
    try:
        if current_leaderboard is None:
            return "Leaderboard not loaded", None

        # Find model
        model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
        if model_row.empty:
            return f"Model '{model_name}' not found", None
        model_info = model_row.iloc[0]

        # Parse detailed metrics. This is best-effort: a missing column, a
        # non-string cell or malformed JSON all fall back to an empty dict.
        # The handler is deliberately narrow so that interrupts and
        # unrelated failures are not silently swallowed.
        try:
            detailed_results = json.loads(model_info['detailed_metrics'])
        except (KeyError, TypeError, ValueError):
            detailed_results = {}

        # Create detailed plot
        detail_plot = create_detailed_model_analysis(detailed_results, model_name)

        # Format model details
        details_text = f"""
        # 🔍 Model Details: {model_name}
        **Basic Information:**
        - **Author**: {model_info['author']}
        - **Submission Date**: {model_info['submission_date'][:10]}
        - **Model Type**: {model_info['model_type']}
        - **Description**: {model_info['description'] or 'No description provided'}
        **Performance Metrics:**
        - **Quality Score**: {model_info['quality_score']:.4f}
        - **BLEU**: {model_info['bleu']:.2f}
        - **ChrF**: {model_info['chrf']:.4f}
        - **ROUGE-1**: {model_info['rouge1']:.4f}
        - **ROUGE-L**: {model_info['rougeL']:.4f}
        **Coverage Information:**
        - **Total Samples**: {model_info['total_samples']:,}
        - **Language Pairs Covered**: {model_info['language_pairs_covered']}
        - **Google Comparable Pairs**: {model_info['google_pairs_covered']}
        - **Coverage Rate**: {model_info['coverage_rate']:.1%}
        **Google Translate Comparison:**
        - **Google Quality Score**: {model_info['google_quality_score']:.4f}
        - **Google BLEU**: {model_info['google_bleu']:.2f}
        - **Google ChrF**: {model_info['google_chrf']:.4f}
        """
        return details_text, detail_plot
    except Exception as e:
        error_msg = f"Error getting model details: {str(e)}"
        return error_msg, None
+# Initialize data on startup
+print("🚀 Starting SALT Translation Leaderboard...")
+initialization_success = initialize_data()
 # Create Gradio interface
 with gr.Blocks(
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {
+        max-width: 1400px !important;
+        margin: 0 auto;
     }
     .main-header {
         text-align: center;
         margin-bottom: 2rem;
+        padding: 2rem;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        border-radius: 10px;
     }
+    .metric-box {
         background: #f8f9fa;
         padding: 1rem;
+        border-radius: 8px;
         margin: 0.5rem 0;
+        border-left: 4px solid #007bff;
+    }
+    .error-box {
+        background: #f8d7da;
+        color: #721c24;
+        padding: 1rem;
+        border-radius: 8px;
+        border-left: 4px solid #dc3545;
+    }
+    .success-box {
+        background: #d4edda;
+        color: #155724;
+        padding: 1rem;
+        border-radius: 8px;
+        border-left: 4px solid #28a745;
     }
     """
 ) as demo:
     {DESCRIPTION}
+    **Supported Languages**: {len(ALL_UG40_LANGUAGES)} Ugandan languages | **Google Comparable**: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages
     </div>
     """)
+    # Status indicator
+    if initialization_success:
+        status_msg = "✅ System initialized successfully"
+    else:
+        status_msg = "❌ System initialization failed - some features may not work"
+    gr.Markdown(f"**Status**: {status_msg}")
     with gr.Tabs():
+        # Tab 1: Get Test Set
+        with gr.Tab("📥 Download Test Set", id="download"):
+            gr.Markdown("""
+            ## 📋 Get the SALT Translation Test Set
+            Download the standardized test set to evaluate your translation model.
+            The test set contains source texts in multiple Ugandan languages that you need to translate.
+            """)
+            with gr.Row():
+                download_btn = gr.Button("📥 Download Test Set", variant="primary", size="lg")
+            with gr.Row():
+                with gr.Column():
+                    download_file = gr.File(label="📂 Test Set File", interactive=False)
+                with gr.Column():
+                    download_info = gr.Markdown(label="ℹ️ Test Set Information")
             gr.Markdown("""
+            ### 📖 Instructions
+            1. **Download** the test set using the button above
+            2. **Run your model** on the source texts to generate translations
+            3. **Create a predictions file** with your model's outputs
+            4. **Submit** your predictions using the "Submit Predictions" tab
+            ### 📋 Required Prediction Format
+            Your predictions file must be a CSV/TSV/JSON with these columns:
+            - `sample_id`: The unique identifier from the test set
+            - `prediction`: Your model's translation for that sample
+            **Example CSV:**
+            ```
+            sample_id,prediction
+            salt_000001,Oli otya mukwano gwange?
+            salt_000002,Webale nyo olukya
+            ...
+            ```
+            """)
+        # Tab 2: Submit Predictions
+        with gr.Tab("🚀 Submit Predictions", id="submit"):
+            gr.Markdown("""
+            ## 🎯 Submit Your Model's Predictions
+            Upload your model's predictions on the SALT test set for evaluation.
             """)
             with gr.Row():
+                with gr.Column(scale=1):
+                    # Model information
+                    gr.Markdown("### 📝 Model Information")
+                    model_name_input = gr.Textbox(
+                        label="🤖 Model Name",
+                        placeholder="e.g., MyTranslator-v1.0",
+                        info="Unique name for your model"
                     )
                     author_input = gr.Textbox(
+                        label="👤 Author/Organization",
                         placeholder="Your name or organization",
                         value="Anonymous"
                     )
+                    description_input = gr.Textbox(
+                        label="📄 Description (Optional)",
+                        placeholder="Brief description of your model",
+                        lines=3
                     )
+                    # File upload
+                    gr.Markdown("### 📤 Upload Predictions")
+                    predictions_file = gr.File(
+                        label="📂 Predictions File",
+                        file_types=[".csv", ".tsv", ".json"],
+                        info="CSV/TSV/JSON file with your model's predictions"
+                    )
+                    validate_btn = gr.Button("✅ Validate Submission", variant="secondary")
+                    submit_btn = gr.Button("🚀 Submit for Evaluation", variant="primary", interactive=False)
                 with gr.Column(scale=1):
+                    gr.Markdown("### 📊 Validation Results")
+                    validation_output = gr.Markdown()
             # Results section
+            gr.Markdown("### 🏆 Evaluation Results")
+            with gr.Row():
+                evaluation_output = gr.Markdown()
+            with gr.Row():
+                with gr.Column():
+                    submission_plot = gr.Plot(label="📈 Your Submission Analysis")
+                with gr.Column():
+                    updated_leaderboard_plot = gr.Plot(label="🏆 Updated Leaderboard")
+            with gr.Row():
+                results_table = gr.Dataframe(label="📊 Updated Leaderboard", interactive=False)
+        # Tab 3: Leaderboard
+        # Layout, top to bottom: a row of filter controls, a manual refresh
+        # button, a Markdown stats area, two side-by-side plots, and the
+        # leaderboard table. The four filter controls are the inputs of
+        # update_leaderboard_and_dropdown (wired in the event-handler section).
         with gr.Tab("🏆 Leaderboard", id="leaderboard"):
+            # Filter controls; the search box gets 3x the width of the others.
+            with gr.Row():
+                with gr.Column(scale=3):
+                    search_input = gr.Textbox(
+                        label="🔍 Search Models",
+                        placeholder="Search by model name, author...",
+                    )
+                with gr.Column(scale=1):
+                    model_type_dropdown = gr.Dropdown(
+                        label="🔧 Model Type",
+                        choices=["all", "user_submission", "baseline"],
+                        value="all"
+                    )
+                with gr.Column(scale=1):
+                    min_coverage_slider = gr.Slider(
+                        label="📊 Min Coverage",
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.0,
+                        step=0.1
+                    )
+                with gr.Column(scale=1):
+                    google_only_checkbox = gr.Checkbox(
+                        label="🤖 Google Comparable Only",
+                        value=False
+                    )
             with gr.Row():
+                refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
+            # Output components below are filled by the refresh handler.
+            with gr.Row():
+                leaderboard_stats = gr.Markdown()
+            with gr.Row():
+                with gr.Column():
+                    leaderboard_plot = gr.Plot(label="🏆 Rankings")
+                with gr.Column():
+                    comparison_plot = gr.Plot(label="📊 Multi-Metric Comparison")
             with gr.Row():
                 leaderboard_table = gr.Dataframe(
+                    label="📈 Full Leaderboard",
                     interactive=False,
                     wrap=True
                 )
+        # Tab 4: Model Analysis
+        # A model picker plus an "Analyze" button, followed by a Markdown
+        # details area and a plot. analyze_btn is wired to get_model_details
+        # in the event-handler section.
+        with gr.Tab("🔍 Model Analysis", id="analysis"):
+            with gr.Row():
+                # Starts with no choices; update_leaderboard_and_dropdown
+                # supplies them whenever the leaderboard is refreshed.
+                model_select = gr.Dropdown(
+                    label="🤖 Select Model",
+                    choices=[],
+                    value=None,
+                    info="Choose a model for detailed analysis"
+                )
+                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
             with gr.Row():
+                model_details = gr.Markdown()
+            with gr.Row():
+                model_analysis_plot = gr.Plot(label="📊 Detailed Performance Analysis")
+        # Tab 5: Documentation
+        # Static help page rendered from a single f-string. Language lists and
+        # pair counts are interpolated from ALL_UG40_LANGUAGES,
+        # GOOGLE_SUPPORTED_LANGUAGES, LANGUAGE_NAMES, get_all_language_pairs()
+        # and get_google_comparable_pairs(); literal braces in the BibTeX
+        # entry are doubled ({{ }}) so the f-string does not interpolate them.
+        # NOTE(review): the Contact line reads "[email protected]", which
+        # looks like an obfuscation placeholder rather than a real address —
+        # confirm and restore the intended e-mail.
         with gr.Tab("📚 Documentation", id="docs"):
+            gr.Markdown(f"""
+            # 📖 SALT Translation Leaderboard Documentation
+            ## 🎯 Overview
+            The SALT Translation Leaderboard is a scientific evaluation platform for translation models on Ugandan languages.
+            Submit your model's predictions on our standardized test set to see how it compares with other models.
+            ## 🗣️ Supported Languages
+            **All UG40 Languages ({len(ALL_UG40_LANGUAGES)} total):**
+            {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in ALL_UG40_LANGUAGES])}
+            **Google Translate Comparable ({len(GOOGLE_SUPPORTED_LANGUAGES)} languages):**
+            {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in GOOGLE_SUPPORTED_LANGUAGES])}
+            ## 📊 Evaluation Metrics
+            ### Primary Metrics
+            - **Quality Score**: Composite metric (0-1, higher better) combining multiple metrics
+            - **BLEU**: Translation quality score (0-100, higher better)
+            - **ChrF**: Character-level F-score (0-1, higher better)
+            ### Secondary Metrics
+            - **ROUGE-1/ROUGE-L**: Recall-oriented metrics (0-1, higher better)
+            - **CER/WER**: Character/Word Error Rate (0-1, lower better)
+            - **Length Ratio**: Prediction/reference length ratio
+            ## 🔄 Submission Process
+            ### Step 1: Download Test Set
+            1. Go to "Download Test Set" tab
+            2. Click "Download Test Set" button
+            3. Save the `salt_test_set.csv` file
+            ### Step 2: Generate Predictions
+            1. Load the test set in your code
+            2. For each row, translate `source_text` from `source_language` to `target_language`
+            3. Save results as CSV with columns: `sample_id`, `prediction`
+            ### Step 3: Submit & Evaluate
+            1. Go to "Submit Predictions" tab
+            2. Fill in model information
+            3. Upload your predictions file
+            4. Validate and submit for evaluation
+            ## 📋 File Formats
+            ### Test Set Format
+            ```csv
+            sample_id,source_text,source_language,target_language,domain,google_comparable
+            salt_000001,"Hello world",eng,lug,general,true
+            salt_000002,"How are you?",eng,ach,conversation,true
+            ```
+            ### Predictions Format
+            ```csv
+            sample_id,prediction
+            salt_000001,"Amakuru ensi"
+            salt_000002,"Ibino nining?"
             ```
+            ## 🏆 Leaderboard Types
+            ### 1. Full UG40 Leaderboard
+            - Includes all {len(get_all_language_pairs())} language pairs
+            - Complete evaluation across all Ugandan languages
+            - Primary ranking system
+            ### 2. Google Translate Comparable
+            - Limited to {len(get_google_comparable_pairs())} pairs
+            - Only languages supported by Google Translate
+            - Allows direct comparison with Google Translate baseline
+            ## 🔬 Scientific Rigor
+            - **Standardized Evaluation**: Same test set for all models
+            - **Multiple Metrics**: Comprehensive evaluation beyond just BLEU
+            - **Coverage Tracking**: Transparency about what each model covers
+            - **Reproducible**: All evaluation code and data available
+            ## 🤝 Contributing
             This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
+            **Contact**: [[email protected]](mailto:[email protected])
+            **GitHub**: [Sunbird AI GitHub](https://github.com/sunbirdai)
+            ## 📄 Citation
             If you use this leaderboard in your research, please cite:
+            ```bibtex
             @misc{{salt_leaderboard_2024,
+              title={{SALT Translation Leaderboard: Evaluation of Translation Models on Ugandan Languages}},
               author={{Sunbird AI}},
               year={{2024}},
               url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard}}
             }}
             ```
+            ## 🔗 Related Resources
+            - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
+            - **Sunbird AI Models**: [Sunbird Organization](https://huggingface.co/Sunbird)
+            - **Research Papers**: [Sunbird AI Publications](https://sunbird.ai/research)
             """)
+    # Event handlers with state management
+    # Both states start as None. The validate handler writes them and the
+    # submit handler reads them, so the submit step does not take the
+    # uploaded file as an input again.
+    predictions_validated = gr.State(value=None)
+    validation_info_state = gr.State(value=None)
+    # Download test set
+    # No inputs: download_test_set is called without arguments and its return
+    # values populate download_file and download_info.
+    download_btn.click(
+        fn=download_test_set,
+        outputs=[download_file, download_info]
+    )
+    # Validate predictions
+    def handle_validation(file, model_name, author, description):
+        report, predictions = validate_submission(file, model_name, author, description)
+        is_valid = predictions is not None
+        return report, predictions, predictions, is_valid
+    validate_btn.click(
+        fn=handle_validation,
+        inputs=[predictions_file, model_name_input, author_input, description_input],
+        outputs=[validation_output, predictions_validated, validation_info_state, submit_btn]
+    )
+    # Submit for evaluation
+    def handle_submission(predictions, model_name, author, description, validation_info):
+        if predictions is None:
+            return "❌ Please validate your submission first", None, None, None
+        # Extract validation info dict
+        validation_dict = {
+            'coverage': getattr(validation_info, 'coverage', 0.8) if hasattr(validation_info, 'coverage') else 0.8,
+            'report': 'Validation passed'
+        }
+        return evaluate_submission(predictions, model_name, author, description, validation_dict)
     submit_btn.click(
+        fn=handle_submission,
+        inputs=[predictions_validated, model_name_input, author_input, description_input, validation_info_state],
+        outputs=[evaluation_output, results_table, submission_plot, updated_leaderboard_plot]
     )
+    # Refresh leaderboard
+    def update_leaderboard_and_dropdown(*args):
+        table, plot1, plot2, stats = refresh_leaderboard_display(*args)
+        # Update model dropdown choices
+        model_choices = table['model_name'].tolist() if not table.empty else []
+        return table, plot1, plot2, stats, gr.Dropdown(choices=model_choices)
     refresh_btn.click(
+        fn=update_leaderboard_and_dropdown,
+        inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
+        outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
     )
+    # Auto-refresh on filter changes
+    for input_component in [search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox]:
+        input_component.change(
+            fn=update_leaderboard_and_dropdown,
+            inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
+            outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
+        )
+    # Model analysis
+    # The selected dropdown value is passed to get_model_details, whose two
+    # return values fill the details Markdown and the analysis plot.
+    analyze_btn.click(
+        fn=get_model_details,
+        inputs=[model_select],
+        outputs=[model_details, model_analysis_plot]
     )
+    # Load initial data
+    # Runs the same handler as the refresh button when the page loads, so the
+    # leaderboard widgets and the model dropdown are populated from the
+    # filters' initial values.
     demo.load(
+        fn=update_leaderboard_and_dropdown,
+        inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
+        outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
     )
+# Launch the application
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
+        show_error=True,
+        enable_queue=True
     )