# Hub file page: leaderboard / config.py - last commit "Update config.py" (d5b83bc, verified), 6.16 kB
# config.py
import os
# HuggingFace settings
# Access token taken from the process environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Dataset identifiers used by this configuration.
LEADERBOARD_DATASET = "Sunbird/salt-translation-leaderboard"
TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
SALT_DATASET = "sunbird/salt"
# Language settings - ALL UG40 LANGUAGES
# Display name for each language code. The insertion order of this mapping is
# also the order of ALL_UG40_LANGUAGES below.
LANGUAGE_NAMES = {
    "ach": "Acholi",
    "eng": "English",
    "lgg": "Lugbara",
    "lug": "Luganda",
    "nyn": "Runyankole",
    "rny": "Runyoro",
    "teo": "Ateso",
    "swa": "Swahili",
}
# Language codes as a list, in the key order of LANGUAGE_NAMES.
ALL_UG40_LANGUAGES = list(LANGUAGE_NAMES)

# Google Translate language mapping: code used in this file -> Google code.
GOOGLE_LANG_MAP = {
    "lug": "lg",
    "ach": "ach",
    "swa": "sw",
    "eng": "en",
}
# Google Translate supported subset (for fair comparison), in the key order of
# GOOGLE_LANG_MAP.
GOOGLE_SUPPORTED_LANGUAGES = list(GOOGLE_LANG_MAP)
# SCIENTIFIC EVALUATION TRACKS
def _track(name, description, languages, min_samples_per_pair, statistical_power):
    """Return one track entry; every track shares a 0.05 significance level."""
    return {
        "name": name,
        "description": description,
        # Stored by reference: the entry holds the very list object passed in.
        "languages": languages,
        "min_samples_per_pair": min_samples_per_pair,
        "statistical_power": statistical_power,
        "significance_level": 0.05,
    }


# Track key -> settings for that track.
EVALUATION_TRACKS = {
    "google_comparable": _track(
        "Google-Comparable Track",
        "Models evaluated only on language pairs supported by Google Translate",
        GOOGLE_SUPPORTED_LANGUAGES,
        min_samples_per_pair=50,
        statistical_power=0.8,
    ),
    "ug40_complete": _track(
        "UG40-Complete Track",
        "Models evaluated on all UG40 language pairs",
        ALL_UG40_LANGUAGES,
        min_samples_per_pair=30,
        statistical_power=0.8,
    ),
    "language_pair_matrix": _track(
        "Language-Pair Matrix",
        "Individual language pair analysis with statistical significance",
        ALL_UG40_LANGUAGES,
        min_samples_per_pair=20,
        statistical_power=0.7,
    ),
}
# MODEL CATEGORIES
# One row per category: (key, name, description, examples, color).
_CATEGORY_ROWS = (
    (
        "commercial",
        "Commercial Systems",
        "Production translation systems",
        ["google_translate", "azure_translator"],
        "#1f77b4",
    ),
    (
        "research",
        "Research Models",
        "Academic and research institution models",
        ["nllb", "m2m100"],
        "#ff7f0e",
    ),
    (
        "baseline",
        "Baseline Models",
        "Simple baseline and reference models",
        ["word_lookup", "frequency_baseline"],
        "#2ca02c",
    ),
    (
        "community",
        "Community Submissions",
        "User-submitted models and fine-tuned variants",
        ["user_submission"],
        "#d62728",
    ),
)

# Category key -> {"name", "description", "examples", "color"}, in row order.
MODEL_CATEGORIES = {
    key: {
        "name": label,
        "description": summary,
        "examples": examples,
        "color": color,
    }
    for key, label, summary, examples, color in _CATEGORY_ROWS
}
# STATISTICAL SETTINGS
# Label -> threshold value for effect sizes.
_EFFECT_SIZE_THRESHOLDS = {
    "small": 0.2,
    "medium": 0.5,
    "large": 0.8,
}

# Outlier detection method name and its factor.
_OUTLIER_DETECTION = {
    "method": "iqr",
    "factor": 1.5,
}

STATISTICAL_CONFIG = {
    "confidence_level": 0.95,
    "bootstrap_samples": 1000,
    "min_samples_for_ci": 20,
    "effect_size_thresholds": _EFFECT_SIZE_THRESHOLDS,
    "multiple_testing_correction": "bonferroni",
    "outlier_detection": _OUTLIER_DETECTION,
}
# METRICS CONFIGURATION - Enhanced for statistical analysis
# Shared name groups; each is unpacked into a fresh list below so that the
# individual METRICS_CONFIG values stay independent list objects.
_PRIMARY_NAMES = ("bleu", "chrf", "quality_score")
_ROUGE_NAMES = ("rouge1", "rouge2", "rougeL")
_ERROR_NAMES = ("cer", "wer")

METRICS_CONFIG = {
    "primary_metrics": [*_PRIMARY_NAMES],
    "secondary_metrics": [*_ROUGE_NAMES, *_ERROR_NAMES, "len_ratio"],
    "display_precision": 4,
    "quality_score_components": ["bleu", "chrf", "cer", "wer", "rouge1", "rougeL"],
    # Lower is better
    "error_metrics": [*_ERROR_NAMES],
    "score_metrics": [*_PRIMARY_NAMES, *_ROUGE_NAMES],
    "statistical_metrics": [
        "mean",
        "std",
        "median",
        "ci_lower",
        "ci_upper",
        "p_value",
        "effect_size",
    ],
}
# VALIDATION REQUIREMENTS
VALIDATION_CONFIG = {
    # Minimum sample count for each track key.
    "min_samples_per_track": dict(
        google_comparable=200,
        ug40_complete=400,
        language_pair_matrix=50,
    ),
    # 5% missing predictions allowed
    "max_missing_rate": 0.05,
    "quality_thresholds": dict(
        min_valid_predictions=0.95,
        max_duplicate_rate=0.1,
        min_avg_length=3,
        max_avg_length=500,
    ),
}
# UI CONFIGURATION
# The emoji in the title, tab names and icons are literal UTF-8 characters;
# this file must be saved and read as UTF-8 or they turn into mojibake.
UI_CONFIG = {
    "title": "🏆 SALT Translation Leaderboard - Scientific Edition",
    # The value starts and ends with a newline; both are part of the string.
    "description": (
        "\n"
        "Rigorous evaluation of translation models on Ugandan languages with statistical significance testing.\n"
        "Three evaluation tracks ensure fair comparison across different model capabilities and language support.\n"
    ),
    # Track key -> tab label, icon and color for that track.
    "tracks": {
        "google_comparable": {
            "tab_name": "🤖 Google-Comparable Track",
            "icon": "🤖",
            "color": "#1f77b4",
        },
        "ug40_complete": {
            "tab_name": "🌍 UG40-Complete Track",
            "icon": "🌍",
            "color": "#ff7f0e",
        },
        "language_pair_matrix": {
            "tab_name": "📊 Language-Pair Matrix",
            "icon": "📊",
            "color": "#2ca02c",
        },
    },
}
# CHART CONFIGURATION - Research-grade styling
# Category key -> the "color" value of the matching MODEL_CATEGORIES entry.
_CATEGORY_COLORS = {key: MODEL_CATEGORIES[key]["color"] for key in MODEL_CATEGORIES}

CHART_CONFIG = {
    "statistical_colorscale": "RdYlBu_r",
    "category_colors": _CATEGORY_COLORS,
    "heatmap_config": {
        "colorscale": "Viridis",
        "show_values": True,
        "font_size": 10,
    },
    "confidence_interval_config": {
        "alpha": 0.3,
        "line_width": 2,
        "marker_size": 8,
    },
    "statistical_plot_config": {
        "height": 600,
        "width": 800,
        # Keys are l, r, t, b.
        "margin": dict(l=100, r=50, t=50, b=100),
    },
}
# FILE FORMAT SPECIFICATIONS
# Detection key -> list of keyword strings associated with that key.
_CATEGORY_KEYWORDS = {
    "google": ["google", "translate"],
    "nllb": ["nllb", "meta"],
    "m2m": ["m2m", "facebook"],
    "baseline": ["baseline", "simple", "lookup"],
}

PREDICTION_FORMAT = {
    "required_columns": ["sample_id", "prediction"],
    "optional_columns": ["model_name", "confidence", "category"],
    # File extensions, each including the leading dot.
    "file_types": [".csv", ".tsv", ".json"],
    "category_detection": _CATEGORY_KEYWORDS,
}
# EVALUATION SETTINGS
# Per language pair
MAX_TEST_SAMPLES = 500
# Minimum for basic statistics
MIN_SAMPLES_PER_PAIR = 10

# Purpose label -> recommended sample size.
SAMPLE_SIZE_RECOMMENDATIONS = dict(
    basic_comparison=50,
    statistical_significance=100,
    publication_quality=200,
)