# app.py
import subprocess
import sys
import os
from pathlib import Path


def setup_salt():
    """Clone and set up the SALT library, like in Colab."""
    try:
        # Check if salt is already available
        import salt.dataset

        print("✅ SALT library already available")
        return True
    except ImportError:
        pass

    print("Setting up SALT library...")

    try:
        # Clone the SALT repo if it does not exist
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("Cloning SALT repository...")
            subprocess.check_call(
                ["git", "clone", "https://github.com/sunbirdai/salt.git"]
            )
        else:
            print("SALT repository already exists")

        # Install SALT requirements
        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("Installing SALT requirements...")
            subprocess.check_call(
                [
                    sys.executable,
                    "-m",
                    "pip",
                    "install",
                    "-q",
                    "-r",
                    str(salt_requirements),
                ]
            )

        # Add the SALT directory to the Python path
        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"Added {salt_path} to Python path")

        # Test the import
        import salt.dataset

        print("✅ SALT library setup completed successfully")
        return True

    except Exception as e:
        print(f"❌ Failed to setup SALT: {e}")
        return False


# Setup SALT on startup
print("Starting SALT Translation Leaderboard - Scientific Edition...")
if not setup_salt():
    print("❌ Cannot continue without SALT library")
    print("Please check that git is available and GitHub is accessible")
    sys.exit(1)

import gradio as gr
import pandas as pd
import json
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List

# Import our enhanced modules
from src.test_set import (
    get_public_test_set_scientific,
    get_complete_test_set_scientific,
    create_test_set_download_scientific,
    validate_test_set_integrity_scientific,
    get_track_test_set,
)
from src.validation import validate_submission_scientific
from src.evaluation import (
    evaluate_predictions_scientific,
    generate_scientific_report,
    compare_models_statistically,
)
from src.leaderboard import (
    load_scientific_leaderboard,
    add_model_to_scientific_leaderboard,
    get_scientific_leaderboard_stats,
    get_track_leaderboard,
    prepare_track_leaderboard_display,
    perform_fair_comparison,
    export_scientific_leaderboard,
)
from src.plotting import (
    create_scientific_leaderboard_plot,
    create_language_pair_heatmap_scientific,
    create_statistical_comparison_plot,
    create_category_comparison_plot,
    create_adequacy_analysis_plot,
    create_cross_track_analysis_plot,
    create_scientific_model_detail_plot,
)
from src.utils import (
    sanitize_model_name,
    get_all_language_pairs,
    get_google_comparable_pairs,
    get_track_language_pairs,
    format_metric_value,
)
from config import *

# Global variables for caching (populated once at startup by initialize_scientific_data)
current_leaderboard = None
public_test_set = None
complete_test_set = None
test_set_stats = None


def initialize_scientific_data():
    """Initialize scientific test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard, test_set_stats

    try:
        print("Initializing SALT Translation Leaderboard - Scientific Edition...")

        # Load scientific test sets
        print("Loading scientific test sets...")
        public_test_set = get_public_test_set_scientific()
        complete_test_set = get_complete_test_set_scientific()

        # Load scientific leaderboard
        print("Loading scientific leaderboard...")
        current_leaderboard = load_scientific_leaderboard()

        # Validate test set integrity
        print("Validating test set integrity...")
        test_set_stats = validate_test_set_integrity_scientific()

        print("✅ Scientific initialization complete!")
        print(f"   - Test set: {len(public_test_set):,} samples")
        print(f"   - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
        print(
            f"   - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}"
        )
        print(f"   - Current models: {len(current_leaderboard)}")

        return True

    except Exception as e:
        print(f"❌ Scientific initialization failed: {e}")
        traceback.print_exc()
        return False


def download_scientific_test_set() -> Tuple[str, str]:
    """Create downloadable scientific test set and return file path and info."""
    try:
        global public_test_set

        if public_test_set is None:
            public_test_set = get_public_test_set_scientific()

        # Create download file
        download_path, stats = create_test_set_download_scientific()

        # Create comprehensive info message
        adequacy = stats.get("adequacy_assessment", "unknown")
        adequacy_emoji = {
            "excellent": "🟢",
            "good": "🟡",
            "fair": "🟠",
            "insufficient": "🔴",
            "unknown": "⚪",
        }.get(adequacy, "⚪")

        info_msg = f"""
## SALT Scientific Test Set Downloaded Successfully!

### Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**

### Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
- **Domains**: {', '.join(stats.get('domains', ['general']))}

### Track Breakdown:
"""

        track_breakdown = stats.get("track_breakdown", {})
        for track_name, track_info in track_breakdown.items():
            status_emoji = (
                "✅" if track_info.get("statistical_adequacy", False) else "⚠️"
            )
            info_msg += f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
"""

        info_msg += f"""
### Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)

### Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction` (see the illustrative snippet below)
3. **Optional**: Add a `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals
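
For example, a minimal predictions file can be produced with pandas; the file names and the `translate()` call below are placeholders for your own paths and inference code:

```python
import pandas as pd

# Placeholder paths and inference call: adjust to your own setup.
test_df = pd.read_csv("salt_test_set.csv")  # the test set CSV you just downloaded
test_df["prediction"] = [translate(text) for text in test_df["source_text"]]
test_df[["sample_id", "prediction"]].to_csv("predictions.csv", index=False)
```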

### Tips for Best Results:
- Ensure coverage of all language pairs for the chosen track
- Include confidence scores if available
- Provide a detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
"""

        return download_path, info_msg

    except Exception as e:
        error_msg = f"❌ Error creating scientific test set download: {str(e)}"
        return None, error_msg


def validate_scientific_submission(
    file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate uploaded prediction file with scientific rigor."""
    try:
        if file is None:
            return "❌ Please upload a predictions file", None, "community"

        if not model_name.strip():
            return "❌ Please provide a model name", None, "community"

        # Handle different file input types
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "❌ Could not read uploaded file", None, "community"

        # Determine filename
        filename = (
            getattr(file, "name", None)
            or getattr(file, "filename", None)
            or "predictions.csv"
        )

        # Load test set if needed
        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set_scientific()

        # Run enhanced scientific validation
        validation_result = validate_submission_scientific(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")

        if validation_result["valid"]:
            return (
                validation_result["report"],
                validation_result["predictions"],
                detected_category,
            )
        else:
            return validation_result["report"], None, detected_category

    except Exception as e:
        return (
            f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
            None,
            "community",
        )


def evaluate_scientific_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
    validation_info: Dict,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions using scientific methodology."""
    try:
        if predictions_df is None:
            return "❌ No valid predictions to evaluate", None, None, None

        # Get complete test set with targets
        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set_scientific()

        # Run scientific evaluation across all tracks
        print(f"Starting scientific evaluation for {model_name}...")
        evaluation_results = evaluate_predictions_scientific(
            predictions_df, complete_test_set, detected_category
        )

        if any(
            track_data.get("error")
            for track_data in evaluation_results.get("tracks", {}).values()
        ):
            errors = [
                track_data["error"]
                for track_data in evaluation_results["tracks"].values()
                if track_data.get("error")
            ]
            return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None

        # Add to scientific leaderboard
        print("Adding to scientific leaderboard...")
        updated_leaderboard = add_model_to_scientific_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or "",
        )

        # Update global leaderboard
        current_leaderboard = updated_leaderboard

        # Generate scientific report
        report = generate_scientific_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
        cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)

        # Prepare display leaderboard (Google-comparable track by default)
        google_leaderboard = get_track_leaderboard(
            updated_leaderboard, "google_comparable"
        )
        display_leaderboard = prepare_track_leaderboard_display(
            google_leaderboard, "google_comparable"
        )

        # Format success message with track-specific results
        success_msg = f"""
## Scientific Evaluation Complete!

### Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

### Track Performance Summary:
"""

        tracks = evaluation_results.get("tracks", {})
        for track_name, track_data in tracks.items():
            if not track_data.get("error"):
                track_config = EVALUATION_TRACKS[track_name]
                track_averages = track_data.get("track_averages", {})
                summary = track_data.get("summary", {})

                # Get rank in this track
                track_leaderboard = get_track_leaderboard(
                    updated_leaderboard, track_name
                )
                if not track_leaderboard.empty:
                    model_row = track_leaderboard[
                        track_leaderboard["model_name"]
                        == sanitize_model_name(model_name)
                    ]
                    rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
                    total_models = len(track_leaderboard)
                else:
                    rank = "N/A"
                    total_models = 0

                quality_score = track_averages.get("quality_score", 0)
                bleu_score = track_averages.get("bleu", 0)
                samples = summary.get("total_samples", 0)

                success_msg += f"""
**{track_config['name']}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
"""

        success_msg += f"""
### Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}

{report}
"""

        return success_msg, display_leaderboard, summary_plot, cross_track_plot

    except Exception as e:
        error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None


def refresh_track_leaderboard(
    track: str,
    search_query: str = "",
    category_filter: str = "all",
    min_adequacy: float = 0.0,
    show_ci: bool = True,
) -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters."""
    try:
        global current_leaderboard

        if current_leaderboard is None:
            current_leaderboard = load_scientific_leaderboard()

        # Get track-specific leaderboard
        track_leaderboard = get_track_leaderboard(
            current_leaderboard,
            track,
            category_filter=category_filter,
            min_adequacy=min_adequacy,
        )

        # Apply search filter
        if search_query:
            query_lower = search_query.lower()
            mask = track_leaderboard["model_name"].str.lower().str.contains(
                query_lower, na=False
            ) | track_leaderboard["author"].str.lower().str.contains(
                query_lower, na=False
            )
            track_leaderboard = track_leaderboard[mask]

        # Prepare for display
        display_df = prepare_track_leaderboard_display(track_leaderboard, track)

        # Create plots
        ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
        comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)

        # Get track statistics
        track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
        track_config = EVALUATION_TRACKS[track]

        stats_text = f"""
### {track_config['name']} Statistics
- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}

**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}

### Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""

        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        empty_df = pd.DataFrame()
        return empty_df, None, None, error_msg
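
# Documentation note: the model-detail and comparison views below read per-track
# leaderboard columns named "<track>_quality", "<track>_bleu", "<track>_chrf",
# "<track>_ci_lower", "<track>_ci_upper", "<track>_samples", "<track>_pairs",
# and "<track>_adequate", plus a JSON-encoded "detailed_<track>" column.
# The authoritative schema is defined in src.leaderboard; these names simply
# mirror how this file accesses it.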

def get_scientific_model_details(
    model_name: str, track: str
) -> Tuple[str, object, object]:
    """Get detailed scientific analysis for a specific model."""
    try:
        global current_leaderboard

        if current_leaderboard is None:
            return "Leaderboard not loaded", None, None

        # Find model
        model_row = current_leaderboard[current_leaderboard["model_name"] == model_name]

        if model_row.empty:
            return f"Model '{model_name}' not found", None, None

        model_info = model_row.iloc[0]

        # Parse detailed metrics for the requested track
        try:
            detailed_results = json.loads(model_info[f"detailed_{track}"])
        except (KeyError, TypeError, ValueError):
            detailed_results = {}

        # Create detailed plots
        detail_plot = create_scientific_model_detail_plot(
            detailed_results, model_name, track
        )

        # Create language pair heatmap
        heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)

        # Format model details with scientific information
        track_config = EVALUATION_TRACKS[track]
        category_info = MODEL_CATEGORIES.get(model_info["model_category"], {})

        # Extract track-specific metrics
        quality_col = f"{track}_quality"
        bleu_col = f"{track}_bleu"
        chrf_col = f"{track}_chrf"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"
        samples_col = f"{track}_samples"
        pairs_col = f"{track}_pairs"
        adequate_col = f"{track}_adequate"

        details_text = f"""
## Scientific Model Analysis: {model_name}

### Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {model_info['submission_date'][:10]}
- **Description**: {model_info['description'] or 'No description provided'}

### {track_config['name']} Performance:
- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}

### Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'✅ Yes' if model_info.get(adequate_col, False) else '❌ No'}

### Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}

### Cross-Track Performance:
"""

        # Add other track performances for comparison
        for other_track in EVALUATION_TRACKS.keys():
            if other_track != track:
                other_quality_col = f"{other_track}_quality"
                other_adequate_col = f"{other_track}_adequate"

                if model_info.get(other_adequate_col, False):
                    other_quality = model_info.get(other_quality_col, 0)
                    details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
                else:
                    details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"

        details_text += """
### Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
"""

        return details_text, detail_plot, heatmap_plot

    except Exception as e:
        error_msg = f"Error getting model details: {str(e)}"
        return error_msg, None, None
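
# Documentation note: perform_model_comparison() below consumes the dict returned
# by perform_fair_comparison() via its "error", "track_comparisons" (per-track
# "participating_models" and "quality_scores"), "fair_comparison_possible", and
# "recommendations" keys; see src.leaderboard for the full structure.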
f"{other_track}_adequate" if model_info.get(other_adequate_col, False): other_quality = model_info.get(other_quality_col, 0) details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n" else: details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n" details_text += f""" ### π‘ Scientific Interpretation: - Performance metrics include 95% confidence intervals for reliability - Statistical adequacy ensures meaningful comparisons with other models - Cross-track analysis reveals model strengths across different language sets - Category classification helps contextualize performance expectations """ return details_text, detail_plot, heatmap_plot except Exception as e: error_msg = f"Error getting model details: {str(e)}" return error_msg, None, None def perform_model_comparison( model_names: List[str], track: str, comparison_type: str = "statistical" ) -> Tuple[str, object]: """Perform scientific comparison between selected models.""" try: global current_leaderboard if current_leaderboard is None: return "Leaderboard not loaded", None if len(model_names) < 2: return "Please select at least 2 models for comparison", None # Get models models = current_leaderboard[ current_leaderboard["model_name"].isin(model_names) ] if len(models) < 2: return "Selected models not found in leaderboard", None # Perform fair comparison comparison_result = perform_fair_comparison(current_leaderboard, model_names) if comparison_result.get("error"): return f"Comparison error: {comparison_result['error']}", None # Create comparison visualization if comparison_type == "statistical": comparison_plot = create_statistical_comparison_plot(models, track) else: comparison_plot = create_category_comparison_plot(models, track) # Format comparison report track_config = EVALUATION_TRACKS[track] comparison_text = f""" ## π¬ Scientific Model Comparison - {track_config['name']} ### π Models Compared: """ quality_col = f"{track}_quality" ci_lower_col = f"{track}_ci_lower" ci_upper_col = f"{track}_ci_upper" # Sort models by performance models_sorted = models.sort_values(quality_col, ascending=False) for i, (_, model) in enumerate(models_sorted.iterrows(), 1): category_info = MODEL_CATEGORIES.get(model["model_category"], {}) comparison_text += f""" **#{i}. 

# Initialize data on startup
print("Starting SALT Translation Leaderboard - Scientific Edition...")
initialization_success = initialize_scientific_data()

# Create Gradio interface with scientific design
with gr.Blocks(
    title=UI_CONFIG["title"],
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }
    .scientific-header {
        text-align: center;
        margin-bottom: 2rem;
        padding: 2rem;
        background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
        color: white;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .track-tab {
        border-radius: 8px;
        margin: 0.5rem;
        padding: 1rem;
        border: 2px solid transparent;
    }
    .track-tab.google-comparable {
        border-color: #1f77b4;
        background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
    }
    .track-tab.ug40-complete {
        border-color: #ff7f0e;
        background: linear-gradient(45deg, #fff7ed, #fed7aa);
    }
    .track-tab.language-pair-matrix {
        border-color: #2ca02c;
        background: linear-gradient(45deg, #f0fdf4, #dcfce7);
    }
    .metric-box {
        background: #f8fafc;
        padding: 1rem;
        border-radius: 8px;
        margin: 0.5rem 0;
        border-left: 4px solid #3b82f6;
    }
    .scientific-note {
        background: #fef3c7;
        border: 1px solid #f59e0b;
        border-radius: 8px;
        padding: 1rem;
        margin: 1rem 0;
    }
    .adequacy-excellent { border-left-color: #22c55e; }
    .adequacy-good { border-left-color: #eab308; }
    .adequacy-fair { border-left-color: #f97316; }
    .adequacy-insufficient { border-left-color: #ef4444; }
    """,
) as demo:

    # Scientific Header
    gr.HTML(
        f"""
        <div class="scientific-header">
            <p>Rigorous Evaluation with Statistical Significance Testing</p>
            <p>Three-tier evaluation tracks • 95% Confidence intervals • Research-grade analysis</p>
            <p>Supported Languages: {len(ALL_UG40_LANGUAGES)} Ugandan languages | Google Comparable: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
        </div>
        """
    )