File size: 6,156 Bytes
11a64ab
 
 
 
aa99a22
11a64ab
aa99a22
11a64ab
 
d5b83bc
 
11a64ab
 
d5b83bc
 
 
 
 
 
 
 
11a64ab
 
d5b83bc
 
aa99a22
11a64ab
d5b83bc
aa99a22
d5b83bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa99a22
d5b83bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa99a22
d5b83bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa99a22
 
d5b83bc
aa99a22
d5b83bc
 
 
 
 
 
 
 
 
 
 
 
 
 
8727da4
 
 
d5b83bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8727da4
 
d5b83bc
8727da4
d5b83bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a64ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# config.py
import os

# Hugging Face access settings
# The token is read from the environment; it is None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Dataset identifiers referenced by the rest of the project.
SALT_DATASET = "sunbird/salt"
TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
LEADERBOARD_DATASET = "Sunbird/salt-translation-leaderboard"

# Language settings - every UG40 language
# Language code -> display name. The key order here is also the order of
# ALL_UG40_LANGUAGES, which is derived from this mapping.
LANGUAGE_NAMES = {
    "ach": "Acholi",
    "eng": "English",
    "lgg": "Lugbara",
    "lug": "Luganda",
    "nyn": "Runyankole",
    "rny": "Runyoro",
    "teo": "Ateso",
    "swa": "Swahili",
}
ALL_UG40_LANGUAGES = list(LANGUAGE_NAMES)

# Google Translate subset (for fair comparison)
# Project language code -> code used for Google Translate. The key order here
# is also the order of GOOGLE_SUPPORTED_LANGUAGES, derived from this mapping.
GOOGLE_LANG_MAP = {
    "lug": "lg",
    "ach": "ach",
    "swa": "sw",
    "eng": "en",
}
GOOGLE_SUPPORTED_LANGUAGES = list(GOOGLE_LANG_MAP)

# SCIENTIFIC EVALUATION TRACKS
# Each row below is expanded into one track entry:
# (track id, display name, description, language list,
#  min samples per pair, statistical power, significance level).
# The language lists are shared by reference with the module-level constants.
EVALUATION_TRACKS = {
    track_id: {
        "name": label,
        "description": summary,
        "languages": languages,
        "min_samples_per_pair": min_samples,
        "statistical_power": power,
        "significance_level": alpha,
    }
    for track_id, label, summary, languages, min_samples, power, alpha in (
        (
            "google_comparable",
            "Google-Comparable Track",
            "Models evaluated only on language pairs supported by Google Translate",
            GOOGLE_SUPPORTED_LANGUAGES,
            50,
            0.8,
            0.05,
        ),
        (
            "ug40_complete",
            "UG40-Complete Track",
            "Models evaluated on all UG40 language pairs",
            ALL_UG40_LANGUAGES,
            30,
            0.8,
            0.05,
        ),
        (
            "language_pair_matrix",
            "Language-Pair Matrix",
            "Individual language pair analysis with statistical significance",
            ALL_UG40_LANGUAGES,
            20,
            0.7,
            0.05,
        ),
    )
}

# MODEL CATEGORIES
# Each row below is expanded into one category entry:
# (category key, display name, description, example identifiers, hex color).
MODEL_CATEGORIES = {
    key: {
        "name": label,
        "description": summary,
        "examples": examples,
        "color": color,
    }
    for key, label, summary, examples, color in (
        (
            "commercial",
            "Commercial Systems",
            "Production translation systems",
            ["google_translate", "azure_translator"],
            "#1f77b4",
        ),
        (
            "research",
            "Research Models",
            "Academic and research institution models",
            ["nllb", "m2m100"],
            "#ff7f0e",
        ),
        (
            "baseline",
            "Baseline Models",
            "Simple baseline and reference models",
            ["word_lookup", "frequency_baseline"],
            "#2ca02c",
        ),
        (
            "community",
            "Community Submissions",
            "User-submitted models and fine-tuned variants",
            ["user_submission"],
            "#d62728",
        ),
    )
}

# STATISTICAL SETTINGS
STATISTICAL_CONFIG = {
    # Confidence-interval parameters.
    "confidence_level": 0.95,
    "bootstrap_samples": 1_000,
    "min_samples_for_ci": 20,
    # Lower bounds labelling an effect size as small / medium / large.
    # NOTE(review): these match the conventional Cohen's d cut-offs - confirm.
    "effect_size_thresholds": {"small": 0.2, "medium": 0.5, "large": 0.8},
    # Name of the correction applied when several comparisons are made.
    "multiple_testing_correction": "bonferroni",
    # Outlier rule: method name plus its multiplier.
    "outlier_detection": {"method": "iqr", "factor": 1.5},
}

# METRICS CONFIGURATION - metric groupings used for statistical analysis
METRICS_CONFIG = {
    "primary_metrics": [
        "bleu",
        "chrf",
        "quality_score",
    ],
    "secondary_metrics": [
        "rouge1",
        "rouge2",
        "rougeL",
        "cer",
        "wer",
        "len_ratio",
    ],
    # Number of decimals used when displaying metric values.
    "display_precision": 4,
    "quality_score_components": [
        "bleu",
        "chrf",
        "cer",
        "wer",
        "rouge1",
        "rougeL",
    ],
    # Error rates: a lower value is better.
    "error_metrics": ["cer", "wer"],
    "score_metrics": [
        "bleu",
        "chrf",
        "quality_score",
        "rouge1",
        "rouge2",
        "rougeL",
    ],
    # Names of the summary statistics reported per metric.
    "statistical_metrics": ["mean", "std", "median", "ci_lower", "ci_upper", "p_value", "effect_size"],
}

# VALIDATION REQUIREMENTS
VALIDATION_CONFIG = {
    # Minimum sample counts, keyed by evaluation track id.
    "min_samples_per_track": {"google_comparable": 200, "ug40_complete": 400, "language_pair_matrix": 50},
    # Up to 5% of predictions may be missing.
    "max_missing_rate": 0.05,
    # Limits on the share of valid/duplicate predictions and on average length.
    "quality_thresholds": {
        "min_valid_predictions": 0.95,
        "max_duplicate_rate": 0.1,
        "min_avg_length": 3,
        "max_avg_length": 500,
    },
}

# UI CONFIGURATION
# Title, description and per-track tab labels for the leaderboard interface.
# The emoji are stored as real UTF-8 characters (trophy U+1F3C6, robot face
# U+1F916, globe U+1F30D, bar chart U+1F4CA); they had previously been
# mis-decoded through a single-byte code page and rendered as garbage text.
UI_CONFIG = {
    "title": "🏆 SALT Translation Leaderboard - Scientific Edition",
    # Kept verbatim, including the leading newline and the indentation.
    "description": """
    Rigorous evaluation of translation models on Ugandan languages with statistical significance testing.
    Three evaluation tracks ensure fair comparison across different model capabilities and language support.
    """,
    # One entry per evaluation track id: tab label, icon and hex color.
    "tracks": {
        "google_comparable": {
            "tab_name": "🤖 Google-Comparable Track",
            "icon": "🤖",
            "color": "#1f77b4",
        },
        "ug40_complete": {
            "tab_name": "🌍 UG40-Complete Track",
            "icon": "🌍",
            "color": "#ff7f0e",
        },
        "language_pair_matrix": {
            "tab_name": "📊 Language-Pair Matrix",
            "icon": "📊",
            "color": "#2ca02c",
        },
    },
}

# CHART CONFIGURATION - styling parameters for research-grade plots
CHART_CONFIG = {
    "statistical_colorscale": "RdYlBu_r",
    # Category key -> color, copied from MODEL_CATEGORIES at import time.
    "category_colors": {
        category: spec["color"] for category, spec in MODEL_CATEGORIES.items()
    },
    "heatmap_config": {"colorscale": "Viridis", "show_values": True, "font_size": 10},
    "confidence_interval_config": {"alpha": 0.3, "line_width": 2, "marker_size": 8},
    "statistical_plot_config": {
        "height": 600,
        "width": 800,
        # Margin sizes keyed l / r / t / b.
        "margin": {
            "l": 100,
            "r": 50,
            "t": 50,
            "b": 100,
        },
    },
}

# FILE FORMAT SPECIFICATIONS
# Describes the expected layout of a predictions file.
PREDICTION_FORMAT = {
    "required_columns": [
        "sample_id",
        "prediction",
    ],
    "optional_columns": [
        "model_name",
        "confidence",
        "category",
    ],
    # Accepted file extensions.
    "file_types": [
        ".csv",
        ".tsv",
        ".json",
    ],
    # Keyword lists grouped under a label.
    # NOTE(review): presumably matched against model names to guess a
    # category - confirm against the code that consumes this mapping.
    "category_detection": {
        "google": [
            "google",
            "translate",
        ],
        "nllb": [
            "nllb",
            "meta",
        ],
        "m2m": [
            "m2m",
            "facebook",
        ],
        "baseline": [
            "baseline",
            "simple",
            "lookup",
        ],
    },
}

# EVALUATION SETTINGS
# Upper bound on test samples, applied per language pair.
MAX_TEST_SAMPLES = 500
# Smallest per-pair sample count for which basic statistics are computed.
MIN_SAMPLES_PER_PAIR = 10
# Recommended sample sizes, keyed by the intended use of the results.
SAMPLE_SIZE_RECOMMENDATIONS = {"basic_comparison": 50, "statistical_significance": 100, "publication_quality": 200}