akera committed on
Commit
2158319
·
verified ·
1 Parent(s): 37b3c92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +856 -388
app.py CHANGED
@@ -51,7 +51,7 @@ def setup_salt():
51
  return False
52
 
53
  # Setup SALT on startup
54
- print("πŸš€ Starting SALT Translation Leaderboard...")
55
  if not setup_salt():
56
  print("❌ Cannot continue without SALT library")
57
  print("πŸ’‘ Please check that git is available and GitHub is accessible")
@@ -62,458 +62,711 @@ import pandas as pd
62
  import json
63
  import traceback
64
  from datetime import datetime
65
- from typing import Optional, Dict, Tuple
66
 
67
- # Import our modules
68
- from src.test_set import get_public_test_set, get_complete_test_set, create_test_set_download, validate_test_set_integrity
69
- from src.validation import validate_submission_complete
70
- from src.evaluation import evaluate_predictions, generate_evaluation_report, get_google_translate_baseline
 
 
 
 
 
 
 
 
 
 
71
  from src.leaderboard import (
72
- load_leaderboard, add_model_to_leaderboard, get_leaderboard_stats,
73
- filter_leaderboard, export_leaderboard, get_model_comparison, prepare_leaderboard_display
 
 
 
 
 
74
  )
75
  from src.plotting import (
76
- create_leaderboard_ranking_plot, create_metrics_comparison_plot,
77
- create_language_pair_heatmap, create_coverage_analysis_plot,
78
- create_model_performance_timeline, create_google_comparison_plot,
79
- create_detailed_model_analysis, create_submission_summary_plot
 
 
 
 
 
 
 
 
 
 
80
  )
81
- from src.utils import sanitize_model_name, get_all_language_pairs, get_google_comparable_pairs
82
  from config import *
83
 
84
  # Global variables for caching
85
  current_leaderboard = None
86
  public_test_set = None
87
  complete_test_set = None
 
88
 
89
- def initialize_data():
90
- """Initialize test sets and leaderboard data."""
91
- global public_test_set, complete_test_set, current_leaderboard
92
 
93
  try:
94
- print("πŸ”„ Initializing SALT Translation Leaderboard...")
 
 
 
 
 
95
 
96
- # Load test sets
97
- print("πŸ“₯ Loading test sets...")
98
- public_test_set = get_public_test_set()
99
- complete_test_set = get_complete_test_set()
100
 
101
- # Load leaderboard
102
- print("πŸ† Loading leaderboard...")
103
- current_leaderboard = load_leaderboard()
104
 
105
- print(f"βœ… Initialization complete!")
106
  print(f" - Test set: {len(public_test_set):,} samples")
107
- print(f" - Language pairs: {len(get_all_language_pairs())}")
 
108
  print(f" - Current models: {len(current_leaderboard)}")
109
 
110
  return True
111
 
112
  except Exception as e:
113
- print(f"❌ Initialization failed: {e}")
114
  traceback.print_exc()
115
  return False
116
 
117
- def download_test_set() -> Tuple[str, str]:
118
- """Create downloadable test set and return file path and info."""
119
 
120
  try:
121
  global public_test_set
122
  if public_test_set is None:
123
- public_test_set = get_public_test_set()
124
 
125
  # Create download file
126
- download_path, stats = create_test_set_download()
 
 
 
 
 
 
 
 
 
 
127
 
128
- # Create info message
129
  info_msg = f"""
130
- ## πŸ“₯ SALT Test Set Downloaded Successfully!
 
 
 
 
 
 
131
 
132
- ### Dataset Statistics:
133
  - **Total Samples**: {stats['total_samples']:,}
134
- - **Language Pairs**: {stats['language_pairs']}
135
- - **Google Comparable**: {stats['google_comparable_samples']:,} samples
136
- - **Languages**: {', '.join(stats['languages'])}
137
 
138
- ### File Format:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  - `sample_id`: Unique identifier for each sample
140
  - `source_text`: Text to be translated
141
  - `source_language`: Source language code
142
  - `target_language`: Target language code
143
  - `domain`: Content domain (if available)
144
  - `google_comparable`: Whether this pair can be compared with Google Translate
 
 
 
 
 
 
 
 
 
145
 
146
- ### Next Steps:
147
- 1. Run your model on the source texts
148
- 2. Create a CSV/JSON file with columns: `sample_id`, `prediction`
149
- 3. Upload your predictions using the "Submit Predictions" tab
 
150
  """
151
 
152
  return download_path, info_msg
153
 
154
  except Exception as e:
155
- error_msg = f"❌ Error creating test set download: {str(e)}"
156
  return None, error_msg
157
 
158
- def validate_submission(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame]]:
159
- """Validate uploaded prediction file, supporting str paths, bytes, and Gradio wrappers."""
 
 
 
160
  try:
161
  if file is None:
162
- return "❌ Please upload a predictions file", None
163
  if not model_name.strip():
164
- return "❌ Please provide a model name", None
165
 
166
- # 1) Determine raw bytes
167
  if isinstance(file, bytes):
168
  file_content = file
169
  elif isinstance(file, str):
170
- # could be a path or raw text
171
  if os.path.exists(file):
172
  with open(file, "rb") as f:
173
  file_content = f.read()
174
  else:
175
  file_content = file.encode("utf-8")
176
  elif hasattr(file, "name") and os.path.exists(file.name):
177
- # tempfile._TemporaryFileWrapper from Gradio
178
  with open(file.name, "rb") as f:
179
  file_content = f.read()
180
  else:
181
- return "❌ Could not read uploaded file", None
182
 
183
- # 2) Infer filename for format-sniffing
184
  filename = (
185
  getattr(file, "name", None)
186
  or getattr(file, "filename", None)
187
  or "predictions.csv"
188
  )
189
 
190
- # 3) Load test set if needed
191
  global complete_test_set
192
  if complete_test_set is None:
193
- complete_test_set = get_complete_test_set()
194
 
195
- # 4) Run existing validation pipeline
196
- validation_result = validate_submission_complete(
197
- file_content, filename, complete_test_set, model_name
198
  )
199
 
 
 
200
  if validation_result["valid"]:
201
- return validation_result["report"], validation_result["predictions"]
202
  else:
203
- return validation_result["report"], None
204
 
205
  except Exception as e:
206
  return (
207
  f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
208
  None,
 
209
  )
210
 
211
- def evaluate_submission(
212
- predictions_df: pd.DataFrame,
213
- model_name: str,
214
- author: str,
215
  description: str,
216
- validation_info: Dict
217
- ) -> Tuple[str, pd.DataFrame, object, object]:
218
- """Evaluate validated predictions and update leaderboard."""
 
219
 
220
  try:
221
  if predictions_df is None:
222
- return "❌ No valid predictions to evaluate", None, None, None
223
 
224
  # Get complete test set with targets
225
  global complete_test_set, current_leaderboard
226
  if complete_test_set is None:
227
- complete_test_set = get_complete_test_set()
228
-
229
- # Run evaluation
230
- print(f"πŸ”„ Evaluating {model_name}...")
231
- evaluation_results = evaluate_predictions(predictions_df, complete_test_set)
232
 
233
- if evaluation_results.get('error'):
234
- return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None
 
 
 
235
 
236
- # Add to leaderboard
237
- print("πŸ† Adding to leaderboard...")
238
- model_type = "user_submission" # Could be enhanced to detect model type
239
 
240
- updated_leaderboard = add_model_to_leaderboard(
 
 
241
  model_name=sanitize_model_name(model_name),
242
- author=author or "Anonymous",
243
  evaluation_results=evaluation_results,
244
- validation_info=validation_info,
245
- model_type=model_type,
246
  description=description or ""
247
  )
248
 
249
  # Update global leaderboard
250
  current_leaderboard = updated_leaderboard
251
 
252
- # Generate evaluation report
253
- report = generate_evaluation_report(evaluation_results, model_name)
254
 
255
- # Create visualization plots
256
- summary_plot = create_submission_summary_plot(validation_info, evaluation_results)
257
- ranking_plot = create_leaderboard_ranking_plot(updated_leaderboard)
258
 
259
- # Format success message
260
- rank = updated_leaderboard[updated_leaderboard['model_name'] == sanitize_model_name(model_name)].index[0] + 1
261
- total_models = len(updated_leaderboard)
262
 
 
263
  success_msg = f"""
264
- ## πŸŽ‰ Evaluation Complete!
265
 
266
- ### Your Results:
267
  - **Model**: {model_name}
268
- - **Rank**: #{rank} out of {total_models} models
269
- - **Quality Score**: {evaluation_results['averages'].get('quality_score', 0):.4f}
270
- - **BLEU**: {evaluation_results['averages'].get('bleu', 0):.2f}
271
- - **ChrF**: {evaluation_results['averages'].get('chrf', 0):.4f}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
- ### Coverage:
274
- - **Samples Evaluated**: {evaluation_results['evaluated_samples']:,}
275
- - **Language Pairs**: {evaluation_results['summary']['language_pairs_covered']}
276
- - **Google Comparable**: {evaluation_results['summary']['google_comparable_pairs']} pairs
277
 
278
  {report}
279
  """
280
 
281
- return success_msg, prepare_leaderboard_display(updated_leaderboard), summary_plot, ranking_plot
282
-
283
  except Exception as e:
284
- error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
285
- return error_msg, None, None, None
286
 
287
- def refresh_leaderboard_display(
 
288
  search_query: str = "",
289
- model_type_filter: str = "all",
290
- min_coverage: float = 0.0,
291
- google_only: bool = False
292
  ) -> Tuple[pd.DataFrame, object, object, str]:
293
- """Refresh and filter leaderboard display."""
294
 
295
  try:
296
  global current_leaderboard
297
  if current_leaderboard is None:
298
- current_leaderboard = load_leaderboard()
299
-
300
- # Apply filters
301
- filtered_df = filter_leaderboard(
302
- current_leaderboard,
303
- search_query=search_query,
304
- model_type=model_type_filter,
305
- min_coverage=min_coverage,
306
- google_comparable_only=google_only
307
  )
308
 
309
- # Prepare for display (removes detailed_metrics column)
310
- display_df = prepare_leaderboard_display(filtered_df)
 
 
 
 
 
 
 
 
 
311
 
312
  # Create plots
313
- ranking_plot = create_leaderboard_ranking_plot(filtered_df)
314
- comparison_plot = create_metrics_comparison_plot(filtered_df)
 
 
 
 
315
 
316
- # Get stats
317
- stats = get_leaderboard_stats(filtered_df)
318
  stats_text = f"""
319
- ### πŸ“Š Leaderboard Statistics
320
 
321
- - **Total Models**: {stats['total_models']}
322
- - **Average Quality Score**: {stats['avg_quality_score']:.4f}
323
- - **Google Comparable Models**: {stats['google_comparable_models']}
324
 
325
- **Best Model**: {stats['best_model']['name'] if stats['best_model'] else 'None'}
326
- **Latest Submission**: {stats['latest_submission'][:10] if stats['latest_submission'] else 'None'}
 
 
 
 
 
327
  """
328
 
329
  return display_df, ranking_plot, comparison_plot, stats_text
330
 
331
  except Exception as e:
332
- error_msg = f"Error loading leaderboard: {str(e)}"
333
  empty_df = pd.DataFrame()
334
  return empty_df, None, None, error_msg
335
 
336
- def get_model_details(model_name: str) -> Tuple[str, object]:
337
- """Get detailed analysis for a specific model."""
338
 
339
  try:
340
  global current_leaderboard
341
  if current_leaderboard is None:
342
- return "Leaderboard not loaded", None
343
 
344
  # Find model
345
  model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
346
 
347
  if model_row.empty:
348
- return f"Model '{model_name}' not found", None
349
 
350
  model_info = model_row.iloc[0]
351
 
352
- # Parse detailed metrics
353
  try:
354
- detailed_results = json.loads(model_info['detailed_metrics'])
355
  except:
356
  detailed_results = {}
357
 
358
- # Create detailed plot
359
- detail_plot = create_detailed_model_analysis(detailed_results, model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
- # Format model details
362
  details_text = f"""
363
- ## πŸ” Model Details: {model_name}
364
 
365
- ### Basic Information:
366
  - **Author**: {model_info['author']}
 
367
  - **Submission Date**: {model_info['submission_date'][:10]}
368
- - **Model Type**: {model_info['model_type']}
369
  - **Description**: {model_info['description'] or 'No description provided'}
370
 
371
- ### Performance Metrics:
372
- - **Quality Score**: {model_info['quality_score']:.4f}
373
- - **BLEU**: {model_info['bleu']:.2f}
374
- - **ChrF**: {model_info['chrf']:.4f}
375
- - **ROUGE-1**: {model_info['rouge1']:.4f}
376
- - **ROUGE-L**: {model_info['rougeL']:.4f}
377
-
378
- ### Coverage Information:
379
- - **Total Samples**: {model_info['total_samples']:,}
380
- - **Language Pairs Covered**: {model_info['language_pairs_covered']}
381
- - **Google Comparable Pairs**: {model_info['google_pairs_covered']}
382
- - **Coverage Rate**: {model_info['coverage_rate']:.1%}
383
-
384
- ### Google Translate Comparison:
385
- - **Google Quality Score**: {model_info['google_quality_score']:.4f}
386
- - **Google BLEU**: {model_info['google_bleu']:.2f}
387
- - **Google ChrF**: {model_info['google_chrf']:.4f}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  """
389
 
390
- return details_text, detail_plot
391
 
392
  except Exception as e:
393
  error_msg = f"Error getting model details: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  return error_msg, None
395
 
396
  # Initialize data on startup
397
- print("πŸš€ Starting SALT Translation Leaderboard...")
398
- initialization_success = initialize_data()
399
 
400
- # Create Gradio interface
401
  with gr.Blocks(
402
- title=TITLE,
403
  theme=gr.themes.Soft(),
404
  css="""
405
  .gradio-container {
406
- max-width: 1400px !important;
407
  margin: 0 auto;
408
  }
409
- .main-header {
410
  text-align: center;
411
  margin-bottom: 2rem;
412
  padding: 2rem;
413
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
414
  color: white;
415
  border-radius: 10px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  }
417
  .metric-box {
418
- background: #f8f9fa;
419
  padding: 1rem;
420
  border-radius: 8px;
421
  margin: 0.5rem 0;
422
- border-left: 4px solid #007bff;
423
  }
424
- .error-box {
425
- background: #f8d7da;
426
- color: #721c24;
427
- padding: 1rem;
428
  border-radius: 8px;
429
- border-left: 4px solid #dc3545;
430
- }
431
- .success-box {
432
- background: #d4edda;
433
- color: #155724;
434
  padding: 1rem;
435
- border-radius: 8px;
436
- border-left: 4px solid #28a745;
437
  }
 
 
 
 
438
  """
439
  ) as demo:
440
 
441
- # Header
442
  gr.HTML(f"""
443
- <div class="main-header">
444
- <h1>{TITLE}</h1>
445
- <p>{DESCRIPTION}</p>
 
446
  <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
447
  </div>
448
  """)
449
 
450
  # Status indicator
451
  if initialization_success:
452
- status_msg = "βœ… System initialized successfully"
 
 
453
  else:
454
  status_msg = "❌ System initialization failed - some features may not work"
455
 
456
- gr.Markdown(f"**Status**: {status_msg}")
 
 
 
 
457
 
 
 
 
 
 
 
 
 
458
  with gr.Tabs():
459
 
460
- # Tab 1: Get Test Set
461
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
462
  gr.Markdown("""
463
- ## πŸ“‹ Get the SALT Translation Test Set
464
 
465
- Download the standardized test set to evaluate your translation model.
466
- The test set contains source texts in multiple Ugandan languages that you need to translate.
467
  """)
468
 
469
  with gr.Row():
470
- download_btn = gr.Button("πŸ“₯ Download Test Set", variant="primary", size="lg")
471
 
472
  with gr.Row():
473
  with gr.Column():
474
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
475
  with gr.Column():
476
  download_info = gr.Markdown(label="ℹ️ Test Set Information")
477
-
478
- gr.Markdown("""
479
- ### πŸ“– Instructions
480
-
481
- 1. **Download** the test set using the button above
482
- 2. **Run your model** on the source texts to generate translations
483
- 3. **Create a predictions file** with your model's outputs
484
- 4. **Submit** your predictions using the "Submit Predictions" tab
485
-
486
- ### πŸ“‹ Required Prediction Format
487
-
488
- Your predictions file must be a CSV/TSV/JSON with these columns:
489
- - `sample_id`: The unique identifier from the test set
490
- - `prediction`: Your model's translation for that sample
491
-
492
- **Example CSV:**
493
- ```
494
- sample_id,prediction
495
- salt_000001,Oli otya mukwano gwange?
496
- salt_000002,Webale nyo olukya
497
- ...
498
- ```
499
- """)
500
 
501
- # Tab 2: Submit Predictions
502
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
503
  gr.Markdown("""
504
- ## 🎯 Submit Your Model's Predictions
505
 
506
- Upload your model's predictions on the SALT test set for evaluation.
507
  """)
508
 
509
  with gr.Row():
510
  with gr.Column(scale=1):
511
- # Model information
512
  gr.Markdown("### πŸ“ Model Information")
513
 
514
  model_name_input = gr.Textbox(
515
  label="πŸ€– Model Name",
516
- placeholder="e.g., MyTranslator-v1.0",
517
  info="Unique name for your model"
518
  )
519
 
@@ -524,313 +777,528 @@ with gr.Blocks(
524
  )
525
 
526
  description_input = gr.Textbox(
527
- label="πŸ“„ Description (Optional)",
528
- placeholder="Brief description of your model",
529
- lines=3
 
530
  )
531
 
532
- # File upload
533
  gr.Markdown("### πŸ“€ Upload Predictions")
534
- gr.Markdown("Upload a CSV/TSV/JSON file with your model's predictions")
535
-
536
  predictions_file = gr.File(
537
  label="πŸ“‚ Predictions File",
538
  file_types=[".csv", ".tsv", ".json"]
539
  )
540
 
541
  validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
542
- submit_btn = gr.Button("πŸš€ Submit for Evaluation", variant="primary", interactive=False)
543
 
544
  with gr.Column(scale=1):
545
  gr.Markdown("### πŸ“Š Validation Results")
546
  validation_output = gr.Markdown()
547
 
548
  # Results section
549
- gr.Markdown("### πŸ† Evaluation Results")
550
 
551
  with gr.Row():
552
  evaluation_output = gr.Markdown()
553
 
554
  with gr.Row():
555
  with gr.Column():
556
- submission_plot = gr.Plot(label="πŸ“ˆ Your Submission Analysis")
557
  with gr.Column():
558
- updated_leaderboard_plot = gr.Plot(label="πŸ† Updated Leaderboard")
559
 
560
  with gr.Row():
561
- results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard", interactive=False)
562
 
563
- # Tab 3: Leaderboard
564
- with gr.Tab("πŸ† Leaderboard", id="leaderboard"):
 
 
 
 
 
 
 
 
 
 
 
 
 
565
  with gr.Row():
566
- with gr.Column(scale=3):
567
- search_input = gr.Textbox(
568
- label="πŸ” Search Models",
569
- placeholder="Search by model name, author...",
570
- )
571
  with gr.Column(scale=1):
572
- model_type_dropdown = gr.Dropdown(
573
- label="πŸ”§ Model Type",
574
- choices=["all", "user_submission", "baseline"],
575
  value="all"
576
  )
577
  with gr.Column(scale=1):
578
- min_coverage_slider = gr.Slider(
579
- label="πŸ“Š Min Coverage",
580
- minimum=0.0,
581
- maximum=1.0,
582
- value=0.0,
583
- step=0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
  )
585
  with gr.Column(scale=1):
586
- google_only_checkbox = gr.Checkbox(
587
- label="πŸ€– Google Comparable Only",
588
- value=False
589
  )
 
 
590
 
591
  with gr.Row():
592
- refresh_btn = gr.Button("πŸ”„ Refresh", variant="secondary")
593
 
594
  with gr.Row():
595
- leaderboard_stats = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  with gr.Row():
598
  with gr.Column():
599
- leaderboard_plot = gr.Plot(label="πŸ† Rankings")
600
  with gr.Column():
601
- comparison_plot = gr.Plot(label="πŸ“Š Multi-Metric Comparison")
602
 
603
  with gr.Row():
604
- leaderboard_table = gr.Dataframe(
605
- label="πŸ“ˆ Full Leaderboard",
606
- interactive=False,
607
- wrap=True
608
- )
609
-
610
- # Tab 4: Model Analysis
611
- with gr.Tab("πŸ” Model Analysis", id="analysis"):
 
 
 
612
  with gr.Row():
613
- model_select = gr.Dropdown(
614
- label="πŸ€– Select Model",
615
- choices=[],
616
- value=None,
617
- info="Choose a model for detailed analysis"
618
- )
619
- analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
 
 
 
 
 
 
 
 
 
620
 
621
  with gr.Row():
622
  model_details = gr.Markdown()
623
 
624
  with gr.Row():
625
- model_analysis_plot = gr.Plot(label="πŸ“Š Detailed Performance Analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
 
627
- # Tab 5: Documentation
628
- with gr.Tab("πŸ“š Documentation", id="docs"):
629
  gr.Markdown(f"""
630
- # πŸ“– SALT Translation Leaderboard Documentation
631
 
632
  ## 🎯 Overview
633
 
634
- The SALT Translation Leaderboard is a scientific evaluation platform for translation models on Ugandan languages.
635
- Submit your model's predictions on our standardized test set to see how it compares with other models.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
- ## πŸ—£οΈ Supported Languages
 
 
 
638
 
639
- **All UG40 Languages ({len(ALL_UG40_LANGUAGES)} total):**
640
- {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in ALL_UG40_LANGUAGES])}
641
 
642
- **Google Translate Comparable ({len(GOOGLE_SUPPORTED_LANGUAGES)} languages):**
643
- {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in GOOGLE_SUPPORTED_LANGUAGES])}
 
 
 
 
644
 
645
  ## πŸ“Š Evaluation Metrics
646
 
647
  ### Primary Metrics
648
- - **Quality Score**: Composite metric (0-1, higher better) combining multiple metrics
649
- - **BLEU**: Translation quality score (0-100, higher better)
650
- - **ChrF**: Character-level F-score (0-1, higher better)
651
 
652
  ### Secondary Metrics
653
- - **ROUGE-1/ROUGE-L**: Recall-oriented metrics (0-1, higher better)
654
- - **CER/WER**: Character/Word Error Rate (0-1, lower better)
655
  - **Length Ratio**: Prediction/reference length ratio
656
 
 
 
657
  ## πŸ”„ Submission Process
658
 
659
- ### Step 1: Download Test Set
660
- 1. Go to "Download Test Set" tab
661
- 2. Click "Download Test Set" button
662
- 3. Save the `salt_test_set.csv` file
663
 
664
  ### Step 2: Generate Predictions
665
- 1. Load the test set in your code
666
  2. For each row, translate `source_text` from `source_language` to `target_language`
667
  3. Save results as CSV with columns: `sample_id`, `prediction`
 
668
 
669
  ### Step 3: Submit & Evaluate
670
- 1. Go to "Submit Predictions" tab
671
- 2. Fill in model information
672
- 3. Upload your predictions file
673
- 4. Validate and submit for evaluation
674
 
675
- ## πŸ“‹ File Formats
676
 
677
- ### Test Set Format
678
  ```csv
679
- sample_id,source_text,source_language,target_language,domain,google_comparable
680
- salt_000001,"Hello world",eng,lug,general,true
681
- salt_000002,"How are you?",eng,ach,conversation,true
 
682
  ```
683
 
684
  ### Predictions Format
685
  ```csv
686
- sample_id,prediction
687
- salt_000001,"Amakuru ensi"
688
- salt_000002,"Ibino nining?"
 
689
  ```
690
 
691
- ## πŸ† Leaderboard Types
 
 
 
 
 
 
 
 
 
 
692
 
693
- ### 1. Full UG40 Leaderboard
694
- - Includes all {len(get_all_language_pairs())} language pairs
695
- - Complete evaluation across all Ugandan languages
696
- - Primary ranking system
697
 
698
- ### 2. Google Translate Comparable
699
- - Limited to {len(get_google_comparable_pairs())} pairs
700
- - Only languages supported by Google Translate
701
- - Allows direct comparison with Google Translate baseline
702
 
703
- ## πŸ”¬ Scientific Rigor
 
 
 
704
 
705
- - **Standardized Evaluation**: Same test set for all models
706
- - **Multiple Metrics**: Comprehensive evaluation beyond just BLEU
707
- - **Coverage Tracking**: Transparency about what each model covers
708
- - **Reproducible**: All evaluation code and data available
 
709
 
710
- ## 🀝 Contributing
 
 
 
 
711
 
712
- This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
713
 
714
- **Contact**: [research@sunbird.ai](mailto:research@sunbird.ai)
715
- **GitHub**: [Sunbird AI GitHub](https://github.com/sunbirdai)
 
 
 
 
716
 
717
  ## πŸ“„ Citation
718
 
719
  If you use this leaderboard in your research, please cite:
720
 
721
  ```bibtex
722
- @misc{{salt_leaderboard_2024,
723
- title={{SALT Translation Leaderboard: Evaluation of Translation Models on Ugandan Languages}},
724
  author={{Sunbird AI}},
725
  year={{2024}},
726
- url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard}}
 
727
  }}
728
  ```
729
 
730
  ## πŸ”— Related Resources
731
 
732
  - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
733
- - **Sunbird AI Models**: [Sunbird Organization](https://huggingface.co/Sunbird)
734
- - **Research Papers**: [Sunbird AI Publications](https://sunbird.ai/research)
 
 
 
 
 
735
  """)
736
 
737
- # Event handlers with state management
738
  predictions_validated = gr.State(value=None)
739
  validation_info_state = gr.State(value=None)
 
740
 
741
  # Download test set
742
  download_btn.click(
743
- fn=download_test_set,
744
  outputs=[download_file, download_info]
745
  )
746
 
747
  # Validate predictions
748
- def handle_validation(file, model_name, author, description):
749
- report, predictions = validate_submission(file, model_name, author, description)
750
  valid = predictions is not None
751
-
752
- # Build the four returns:
753
- if valid:
754
- return (
755
- report,
756
- predictions, # predictions_validated state
757
- predictions, # validation_info_state (you can store whatever you like here)
758
- gr.update(interactive=True)
759
- )
760
- else:
761
- return (
762
- report,
763
- None,
764
- None,
765
- gr.update(interactive=False) # <β€” this *disables* the button
766
- )
767
 
768
  validate_btn.click(
769
- fn=handle_validation,
770
  inputs=[predictions_file, model_name_input, author_input, description_input],
771
- outputs=[validation_output, predictions_validated, validation_info_state, submit_btn]
772
  )
773
 
774
  # Submit for evaluation
775
- def handle_submission(predictions, model_name, author, description, validation_info):
776
  if predictions is None:
777
- return "❌ Please validate your submission first", None, None, None
778
-
779
- # Extract validation info dict
780
- validation_dict = {
781
- 'coverage': getattr(validation_info, 'coverage', 0.8) if hasattr(validation_info, 'coverage') else 0.8,
782
- 'report': 'Validation passed'
783
- }
784
 
785
- return evaluate_submission(predictions, model_name, author, description, validation_dict)
 
 
786
 
787
  submit_btn.click(
788
- fn=handle_submission,
789
- inputs=[predictions_validated, model_name_input, author_input, description_input, validation_info_state],
790
- outputs=[evaluation_output, results_table, submission_plot, updated_leaderboard_plot]
791
  )
792
 
793
- # Refresh leaderboard
794
- def update_leaderboard_and_dropdown(*args):
795
- table, plot1, plot2, stats = refresh_leaderboard_display(*args)
796
-
797
- # Update model dropdown choices
798
- if current_leaderboard is not None and not current_leaderboard.empty:
799
- model_choices = current_leaderboard['model_name'].tolist()
800
- else:
801
- model_choices = []
802
-
803
- return table, plot1, plot2, stats, gr.Dropdown(choices=model_choices)
804
 
805
- refresh_btn.click(
806
- fn=update_leaderboard_and_dropdown,
807
- inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
808
- outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
 
 
 
 
809
  )
810
 
811
- # Auto-refresh on filter changes
812
- for input_component in [search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox]:
813
- input_component.change(
814
- fn=update_leaderboard_and_dropdown,
815
- inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
816
- outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
817
- )
 
 
 
 
 
 
818
 
819
  # Model analysis
820
  analyze_btn.click(
821
- fn=get_model_details,
822
- inputs=[model_select],
823
- outputs=[model_details, model_analysis_plot]
 
 
 
 
 
 
 
824
  )
825
 
826
- # Load initial data
 
 
 
 
 
 
 
 
 
 
 
 
827
  demo.load(
828
- fn=update_leaderboard_and_dropdown,
829
- inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
830
- outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
 
 
 
 
 
831
  )
832
 
833
- # Launch the application
834
  if __name__ == "__main__":
835
  demo.launch(
836
  server_name="0.0.0.0",
 
51
  return False
52
 
53
  # Setup SALT on startup
54
+ print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
55
  if not setup_salt():
56
  print("❌ Cannot continue without SALT library")
57
  print("πŸ’‘ Please check that git is available and GitHub is accessible")
 
62
  import json
63
  import traceback
64
  from datetime import datetime
65
+ from typing import Optional, Dict, Tuple, List
66
 
67
+ # Import our enhanced modules
68
+ from src.test_set import (
69
+ get_public_test_set_scientific,
70
+ get_complete_test_set_scientific,
71
+ create_test_set_download_scientific,
72
+ validate_test_set_integrity_scientific,
73
+ get_track_test_set
74
+ )
75
+ from src.validation import validate_submission_scientific
76
+ from src.evaluation import (
77
+ evaluate_predictions_scientific,
78
+ generate_scientific_report,
79
+ compare_models_statistically
80
+ )
81
  from src.leaderboard import (
82
+ load_scientific_leaderboard,
83
+ add_model_to_scientific_leaderboard,
84
+ get_scientific_leaderboard_stats,
85
+ get_track_leaderboard,
86
+ prepare_track_leaderboard_display,
87
+ perform_fair_comparison,
88
+ export_scientific_leaderboard
89
  )
90
  from src.plotting import (
91
+ create_scientific_leaderboard_plot,
92
+ create_language_pair_heatmap_scientific,
93
+ create_statistical_comparison_plot,
94
+ create_category_comparison_plot,
95
+ create_adequacy_analysis_plot,
96
+ create_cross_track_analysis_plot,
97
+ create_scientific_model_detail_plot
98
+ )
99
+ from src.utils import (
100
+ sanitize_model_name,
101
+ get_all_language_pairs,
102
+ get_google_comparable_pairs,
103
+ get_track_language_pairs,
104
+ format_metric_value
105
  )
 
106
  from config import *
107
 
108
  # Global variables for caching
109
  current_leaderboard = None
110
  public_test_set = None
111
  complete_test_set = None
112
+ test_set_stats = None
113
 
114
+ def initialize_scientific_data():
115
+ """Initialize scientific test sets and leaderboard data."""
116
+ global public_test_set, complete_test_set, current_leaderboard, test_set_stats
117
 
118
  try:
119
+ print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
120
+
121
+ # Load scientific test sets
122
+ print("πŸ“₯ Loading scientific test sets...")
123
+ public_test_set = get_public_test_set_scientific()
124
+ complete_test_set = get_complete_test_set_scientific()
125
 
126
+ # Load scientific leaderboard
127
+ print("πŸ† Loading scientific leaderboard...")
128
+ current_leaderboard = load_scientific_leaderboard()
 
129
 
130
+ # Validate test set integrity
131
+ print("πŸ” Validating test set integrity...")
132
+ test_set_stats = validate_test_set_integrity_scientific()
133
 
134
+ print(f"βœ… Scientific initialization complete!")
135
  print(f" - Test set: {len(public_test_set):,} samples")
136
+ print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
137
+ print(f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
138
  print(f" - Current models: {len(current_leaderboard)}")
139
 
140
  return True
141
 
142
  except Exception as e:
143
+ print(f"❌ Scientific initialization failed: {e}")
144
  traceback.print_exc()
145
  return False
146
 
147
def download_scientific_test_set() -> Tuple[Optional[str], str]:
    """Create the downloadable scientific test set and describe it.

    Returns:
        A ``(download_path, info_markdown)`` pair. On failure the path is
        ``None`` and the second element is an error message.
    """
    global public_test_set

    try:
        # Make sure the public test set cache is populated before the export.
        if public_test_set is None:
            public_test_set = get_public_test_set_scientific()

        # Create download file
        download_path, stats = create_test_set_download_scientific()

        # Every statistic is read defensively so one missing key cannot
        # discard an otherwise successful download.
        adequacy = stats.get('adequacy_assessment', 'unknown')
        adequacy_emoji = {
            'excellent': '🟒',
            'good': '🟑',
            'fair': '🟠',
            'insufficient': 'πŸ”΄',
            'unknown': 'βšͺ'
        }.get(adequacy, 'βšͺ')
        adequacy_label = str(adequacy).title()

        total_samples = stats.get('total_samples', 0)
        languages = stats.get('languages', [])
        domains = stats.get('domains', ['general'])
        google_samples = stats.get('google_comparable_samples', 0)
        google_rate = stats.get('google_comparable_rate', 0)

        sections = [f"""
## πŸ“₯ SALT Scientific Test Set Downloaded Successfully!

### πŸ”¬ Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy_label}**

### πŸ“Š Dataset Statistics:
- **Total Samples**: {total_samples:,}
- **Languages**: {len(languages)} ({', '.join(languages)})
- **Google Comparable**: {google_samples:,} samples ({google_rate:.1%})
- **Domains**: {', '.join(domains)}

### 🏁 Track Breakdown:
"""]

        # One markdown stanza per evaluation track.
        for track_name, track_info in stats.get('track_breakdown', {}).items():
            is_adequate = track_info.get('statistical_adequacy', False)
            status_emoji = 'βœ…' if is_adequate else '⚠️'
            sections.append(f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if is_adequate else 'No'}
""")

        # Static tail: file format and submission instructions.
        sections.append("""

### πŸ“‹ Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)

### πŸ”¬ Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Optional**: Add `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals

### πŸ’‘ Tips for Best Results:
- Ensure coverage of all language pairs for chosen track
- Include confidence scores if available
- Provide detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
""")

        return download_path, "".join(sections)

    except Exception as e:
        error_msg = f"❌ Error creating scientific test set download: {str(e)}"
        return None, error_msg
228
 
229
def validate_scientific_submission(
    file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate an uploaded predictions file.

    Args:
        file: The upload. Accepted forms are raw ``bytes``, a ``str`` (a path
            to an existing file, otherwise treated as the file content
            itself), or an object whose ``name`` attribute is an existing path.
        model_name: Name of the submitted model; must be non-blank.
        author: Submitter name, forwarded to the validator.
        description: Free-text model description, forwarded to the validator.

    Returns:
        ``(report, predictions, category)``. ``predictions`` is ``None``
        whenever validation fails; ``category`` falls back to
        ``"community"`` when the validator does not report one.
    """
    global complete_test_set

    try:
        if file is None:
            return "❌ Please upload a predictions file", None, "community"
        # A None model name is handled like a blank one instead of raising.
        if not model_name or not model_name.strip():
            return "❌ Please provide a model name", None, "community"

        # Handle different file input types
        source_path = None
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                # Remember the path: a plain str has no ``name`` attribute, so
                # without this the extension of the upload would be lost.
                source_path = file
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "❌ Could not read uploaded file", None, "community"

        # Determine filename (keeps the real extension for path uploads)
        filename = (
            source_path
            or getattr(file, "name", None)
            or getattr(file, "filename", None)
            or "predictions.csv"
        )

        # Load test set if needed
        if complete_test_set is None:
            complete_test_set = get_complete_test_set_scientific()

        # Run enhanced scientific validation
        validation_result = validate_submission_scientific(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")
        predictions = (
            validation_result["predictions"] if validation_result["valid"] else None
        )
        return validation_result["report"], predictions, detected_category

    except Exception as e:
        return (
            f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
            None,
            "community"
        )
285
 
286
def evaluate_scientific_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
    validation_info: Dict,
) -> Tuple[str, Optional[pd.DataFrame], object, object, object]:
    """Evaluate validated predictions and add the model to the leaderboard.

    Args:
        predictions_df: Validated predictions, or ``None`` when validation
            did not produce any.
        model_name: Display name of the model (sanitized before storage).
        author: Submitter name; stored as "Anonymous" when empty.
        description: Free-text model description.
        detected_category: Model category key reported by validation.
        validation_info: Optional validation metadata; only
            ``validation_info['adequacy']['overall_adequate']`` is read.

    Returns:
        ``(message, display_leaderboard, summary_plot, cross_track_plot,
        updated_leaderboard)``. On any failure the message describes the
        problem and the other four elements are ``None``.
    """
    global complete_test_set, current_leaderboard

    try:
        if predictions_df is None:
            return "❌ No valid predictions to evaluate", None, None, None, None

        # Get complete test set with targets
        if complete_test_set is None:
            complete_test_set = get_complete_test_set_scientific()

        # Run scientific evaluation across all tracks
        print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
        evaluation_results = evaluate_predictions_scientific(
            predictions_df, complete_test_set, detected_category
        )

        tracks = evaluation_results.get('tracks', {})
        errors = [
            str(track_data['error'])
            for track_data in tracks.values()
            if track_data.get('error')
        ]
        if errors:
            return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None, None

        clean_name = sanitize_model_name(model_name)

        # Add to scientific leaderboard
        print("πŸ† Adding to scientific leaderboard...")
        updated_leaderboard = add_model_to_scientific_leaderboard(
            model_name=clean_name,
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )

        # Update global leaderboard
        current_leaderboard = updated_leaderboard

        # Generate scientific report
        report = generate_scientific_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
        cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)

        # Prepare display leaderboard (Google-comparable track by default)
        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")

        # From here on the model is already stored, so message formatting is
        # defensive: a missing key must not turn a success into a "failure".
        category_name = MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)
        parts = [f"""
## πŸŽ‰ Scientific Evaluation Complete!

### πŸ“Š Model Information:
- **Model**: {model_name}
- **Category**: {category_name}
- **Author**: {author or 'Anonymous'}

### πŸ† Track Performance Summary:
"""]

        for track_name, track_data in tracks.items():
            if track_data.get('error'):
                continue

            track_label = EVALUATION_TRACKS.get(track_name, {}).get('name', track_name)
            track_averages = track_data.get('track_averages', {})
            summary = track_data.get('summary', {})

            # Rank is the model's position in the track leaderboard. The
            # index is reset first because index labels are only positions
            # if the helper happens to return a freshly indexed frame.
            track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
            rank = "N/A"
            total_models = 0
            if not track_leaderboard.empty:
                ranked = track_leaderboard.reset_index(drop=True)
                positions = ranked.index[ranked['model_name'] == clean_name]
                if len(positions):
                    rank = int(positions[0]) + 1
                total_models = len(ranked)

            quality_score = track_averages.get('quality_score', 0)
            bleu_score = track_averages.get('bleu', 0)
            samples = summary.get('total_samples', 0)

            parts.append(f"""
**🏁 {track_label}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
""")

        adequacy_info = (validation_info or {}).get('adequacy') or {}
        sample_adequacy = adequacy_info.get('overall_adequate', 'Unknown')

        parts.append(f"""

### πŸ”¬ Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {sample_adequacy}

{report}
""")

        success_msg = "".join(parts)
        return success_msg, display_leaderboard, summary_plot, cross_track_plot, updated_leaderboard

    except Exception as e:
        error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None, None
395
 
396
def refresh_track_leaderboard(
    track: str,
    search_query: str = "",
    category_filter: str = "all",
    min_adequacy: float = 0.0,
    show_ci: bool = True
) -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters.

    Args:
        track: Key of the evaluation track in ``EVALUATION_TRACKS``.
        search_query: Case-insensitive literal substring matched against
            model name and author; empty means no text filter.
        category_filter: Forwarded to ``get_track_leaderboard``.
        min_adequacy: Forwarded to ``get_track_leaderboard``.
        show_ci: Accepted for interface compatibility; not used here.

    Returns:
        ``(display_table, ranking_plot, comparison_plot, stats_markdown)``.
        On failure an empty DataFrame, two ``None`` plots and an error
        message are returned.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            current_leaderboard = load_scientific_leaderboard()

        # Get track-specific leaderboard
        track_leaderboard = get_track_leaderboard(
            current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
        )

        # Apply search filter. regex=False: the query is user-typed text, so
        # characters such as "(" or "+" must match literally instead of being
        # compiled as a regular expression (which can raise re.error).
        if search_query:
            query_lower = search_query.lower()
            mask = (
                track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False, regex=False) |
                track_leaderboard['author'].str.lower().str.contains(query_lower, na=False, regex=False)
            )
            track_leaderboard = track_leaderboard[mask]

        # Prepare for display
        display_df = prepare_track_leaderboard_display(track_leaderboard, track)

        # Create plots
        ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
        comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)

        # Get track statistics
        track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
        track_config = EVALUATION_TRACKS[track]

        category_summary = ', '.join(
            f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()
        )
        avg_quality = track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0)
        best_model = track_stats.get('best_models_by_track', {}).get(track, {})

        stats_text = f"""
### πŸ“Š {track_config['name']} Statistics

- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {category_summary}
- **Average Quality Score**: {avg_quality:.4f}

**Best Model**: {best_model.get('name', 'None')}
**Best Score**: {best_model.get('quality', 0.0):.4f}

### πŸ”¬ Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""

        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        empty_df = pd.DataFrame()
        return empty_df, None, None, error_msg
457
 
458
def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
    """Get detailed scientific analysis for a specific model.

    Args:
        model_name: Exact ``model_name`` value to look up in the leaderboard.
        track: Key of the evaluation track in ``EVALUATION_TRACKS``; selects
            the ``{track}_*`` columns and the ``detailed_{track}`` payload.

    Returns:
        ``(details_markdown, detail_plot, heatmap_plot)``. When the
        leaderboard is not loaded, the model is unknown, or anything fails,
        the first element is a message and both plots are ``None``.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            return "Leaderboard not loaded", None, None

        # Find model
        model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]

        if model_row.empty:
            return f"Model '{model_name}' not found", None, None

        model_info = model_row.iloc[0]

        # Parse detailed metrics for the requested track. A missing column,
        # a non-string cell or malformed JSON all degrade to "no details";
        # anything else is a real error and reaches the outer handler.
        try:
            detailed_results = json.loads(model_info[f'detailed_{track}'])
        except (KeyError, TypeError, ValueError):
            detailed_results = {}

        # Create detailed plots
        detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)

        # Create language pair heatmap
        heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)

        # Format model details with scientific information
        track_config = EVALUATION_TRACKS[track]
        category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})

        # Extract track-specific metrics
        quality_col = f"{track}_quality"
        bleu_col = f"{track}_bleu"
        chrf_col = f"{track}_chrf"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"
        samples_col = f"{track}_samples"
        pairs_col = f"{track}_pairs"
        adequate_col = f"{track}_adequate"

        quality_text = format_metric_value(
            model_info.get(quality_col, 0),
            'quality_score',
            True,
            model_info.get(ci_lower_col, 0),
            model_info.get(ci_upper_col, 0),
        )
        # str() so a non-string date value cannot abort the whole report.
        submission_day = str(model_info['submission_date'])[:10]

        parts = [f"""
## πŸ”¬ Scientific Model Analysis: {model_name}

### πŸ“‹ Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {submission_day}
- **Description**: {model_info['description'] or 'No description provided'}

### 🏁 {track_config['name']} Performance:
- **Quality Score**: {quality_text}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}

### πŸ“Š Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'βœ… Yes' if model_info.get(adequate_col, False) else '❌ No'}

### πŸ”¬ Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}

### πŸ“ˆ Cross-Track Performance:
"""]

        # Add other track performances for comparison
        for other_track, other_config in EVALUATION_TRACKS.items():
            if other_track == track:
                continue
            if model_info.get(f"{other_track}_adequate", False):
                other_quality = model_info.get(f"{other_track}_quality", 0)
                parts.append(f"- **{other_config['name']}**: {other_quality:.4f}\n")
            else:
                parts.append(f"- **{other_config['name']}**: Not evaluated\n")

        parts.append("""

### πŸ’‘ Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
""")

        details_text = "".join(parts)
        return details_text, detail_plot, heatmap_plot

    except Exception as e:
        error_msg = f"Error getting model details: {str(e)}"
        return error_msg, None, None
553
+
554
def perform_model_comparison(
    model_names: List[str], track: str, comparison_type: str = "statistical"
) -> Tuple[str, object]:
    """Perform scientific comparison between selected models.

    Args:
        model_names: ``model_name`` values to compare; at least two are
            required. ``None`` is treated like an empty selection.
        track: Key of the evaluation track in ``EVALUATION_TRACKS``.
        comparison_type: ``"statistical"`` selects the statistical comparison
            plot; any other value selects the category comparison plot.

    Returns:
        ``(comparison_markdown, comparison_plot)``. When the comparison
        cannot be performed the first element is a message and the plot is
        ``None``.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            return "Leaderboard not loaded", None

        # ``not model_names`` also covers None, which would otherwise make
        # len() raise and surface as a cryptic comparison error.
        if not model_names or len(model_names) < 2:
            return "Please select at least 2 models for comparison", None

        # Get models
        models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]

        if len(models) < 2:
            return "Selected models not found in leaderboard", None

        # Perform fair comparison
        comparison_result = perform_fair_comparison(current_leaderboard, model_names)

        if comparison_result.get('error'):
            return f"Comparison error: {comparison_result['error']}", None

        # Create comparison visualization
        if comparison_type == "statistical":
            comparison_plot = create_statistical_comparison_plot(models, track)
        else:
            comparison_plot = create_category_comparison_plot(models, track)

        # Format comparison report
        track_config = EVALUATION_TRACKS[track]
        parts = [f"""
## πŸ”¬ Scientific Model Comparison - {track_config['name']}

### πŸ“Š Models Compared:
"""]

        quality_col = f"{track}_quality"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        # Sort models by performance
        models_sorted = models.sort_values(quality_col, ascending=False)

        for position, (_, model) in enumerate(models_sorted.iterrows(), 1):
            category_info = MODEL_CATEGORIES.get(model['model_category'], {})
            quality_text = format_metric_value(
                model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col]
            )

            parts.append(f"""
**#{position}. {model['model_name']}**
- Category: {category_info.get('name', 'Unknown')}
- Quality Score: {quality_text}
- Author: {model['author']}
""")

        # Add statistical analysis
        track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
        if track_comparison:
            fair = comparison_result.get('fair_comparison_possible', False)
            parts.append(f"""

### πŸ”¬ Statistical Analysis:
- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
- **Confidence intervals available**: Yes (95% level)
- **Fair comparison possible**: {'βœ… Yes' if fair else '⚠️ Limited'}
""")

            # Check for statistical significance (simplified): a spread of
            # more than 0.05 between best and worst quality score is flagged.
            quality_scores = list(track_comparison.get('quality_scores', {}).values())
            if len(quality_scores) >= 2:
                score_range = max(quality_scores) - min(quality_scores)
                if score_range > 0.05:  # 5% difference threshold
                    parts.append("- **Performance differences**: Potentially significant\n")
                else:
                    parts.append("- **Performance differences**: Minimal\n")

        # Add recommendations
        recommendations = comparison_result.get('recommendations', [])
        if recommendations:
            parts.append("\n### πŸ’‘ Recommendations:\n")
            parts.extend(f"- {rec}\n" for rec in recommendations)

        comparison_text = "".join(parts)
        return comparison_text, comparison_plot

    except Exception as e:
        error_msg = f"Error performing comparison: {str(e)}"
        return error_msg, None
642
 
643
  # Initialize data on startup
644
+ print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
645
+ initialization_success = initialize_scientific_data()
646
 
647
+ # Create Gradio interface with scientific design
648
  with gr.Blocks(
649
+ title=UI_CONFIG["title"],
650
  theme=gr.themes.Soft(),
651
  css="""
652
  .gradio-container {
653
+ max-width: 1600px !important;
654
  margin: 0 auto;
655
  }
656
+ .scientific-header {
657
  text-align: center;
658
  margin-bottom: 2rem;
659
  padding: 2rem;
660
+ background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
661
  color: white;
662
  border-radius: 10px;
663
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
664
+ }
665
+ .track-tab {
666
+ border-radius: 8px;
667
+ margin: 0.5rem;
668
+ padding: 1rem;
669
+ border: 2px solid transparent;
670
+ }
671
+ .track-tab.google-comparable {
672
+ border-color: #1f77b4;
673
+ background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
674
+ }
675
+ .track-tab.ug40-complete {
676
+ border-color: #ff7f0e;
677
+ background: linear-gradient(45deg, #fff7ed, #fed7aa);
678
+ }
679
+ .track-tab.language-pair-matrix {
680
+ border-color: #2ca02c;
681
+ background: linear-gradient(45deg, #f0fdf4, #dcfce7);
682
  }
683
  .metric-box {
684
+ background: #f8fafc;
685
  padding: 1rem;
686
  border-radius: 8px;
687
  margin: 0.5rem 0;
688
+ border-left: 4px solid #3b82f6;
689
  }
690
+ .scientific-note {
691
+ background: #fef3c7;
692
+ border: 1px solid #f59e0b;
 
693
  border-radius: 8px;
 
 
 
 
 
694
  padding: 1rem;
695
+ margin: 1rem 0;
 
696
  }
697
+ .adequacy-excellent { border-left-color: #22c55e; }
698
+ .adequacy-good { border-left-color: #eab308; }
699
+ .adequacy-fair { border-left-color: #f97316; }
700
+ .adequacy-insufficient { border-left-color: #ef4444; }
701
  """
702
  ) as demo:
703
 
704
+ # Scientific Header
705
  gr.HTML(f"""
706
+ <div class="scientific-header">
707
+ <h1>πŸ† SALT Translation Leaderboard - Scientific Edition</h1>
708
+ <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
709
+ <p>Three-tier evaluation tracks β€’ 95% Confidence intervals β€’ Research-grade analysis</p>
710
  <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
711
  </div>
712
  """)
713
 
714
  # Status indicator
715
  if initialization_success:
716
+ status_msg = "βœ… Scientific system initialized successfully"
717
+ adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
718
+ status_msg += f" | Test set adequacy: {adequacy_info.title()}"
719
  else:
720
  status_msg = "❌ System initialization failed - some features may not work"
721
 
722
+ gr.Markdown(f"**System Status**: {status_msg}")
723
+
724
+ # Add scientific overview
725
+ gr.Markdown("""
726
+ ## πŸ”¬ Scientific Evaluation Framework
727
 
728
+ This leaderboard implements rigorous scientific methodology for translation model evaluation:
729
+
730
+ - **Three Evaluation Tracks**: Fair comparison across different model capabilities
731
+ - **Statistical Significance**: 95% confidence intervals and effect size analysis
732
+ - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
733
+ - **Cross-Track Consistency**: Validate model performance across language sets
734
+ """)
735
+
736
  with gr.Tabs():
737
 
738
+ # Tab 1: Download Test Set
739
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
740
  gr.Markdown("""
741
+ ## πŸ“‹ Get the SALT Scientific Test Set
742
 
743
+ Download our scientifically designed test set with stratified sampling and statistical weighting.
 
744
  """)
745
 
746
  with gr.Row():
747
+ download_btn = gr.Button("πŸ“₯ Download Scientific Test Set", variant="primary", size="lg")
748
 
749
  with gr.Row():
750
  with gr.Column():
751
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
752
  with gr.Column():
753
  download_info = gr.Markdown(label="ℹ️ Test Set Information")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
 
755
+ # Tab 2: Submit Predictions
756
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
757
  gr.Markdown("""
758
+ ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
759
 
760
+ Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
761
  """)
762
 
763
  with gr.Row():
764
  with gr.Column(scale=1):
 
765
  gr.Markdown("### πŸ“ Model Information")
766
 
767
  model_name_input = gr.Textbox(
768
  label="πŸ€– Model Name",
769
+ placeholder="e.g., MyTranslator-v2.0",
770
  info="Unique name for your model"
771
  )
772
 
 
777
  )
778
 
779
  description_input = gr.Textbox(
780
+ label="πŸ“„ Model Description",
781
+ placeholder="Architecture, training data, special features...",
782
+ lines=4,
783
+ info="Detailed description helps with proper categorization"
784
  )
785
 
 
786
  gr.Markdown("### πŸ“€ Upload Predictions")
 
 
787
  predictions_file = gr.File(
788
  label="πŸ“‚ Predictions File",
789
  file_types=[".csv", ".tsv", ".json"]
790
  )
791
 
792
  validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
793
+ submit_btn = gr.Button("πŸš€ Submit for Scientific Evaluation", variant="primary", interactive=False)
794
 
795
  with gr.Column(scale=1):
796
  gr.Markdown("### πŸ“Š Validation Results")
797
  validation_output = gr.Markdown()
798
 
799
  # Results section
800
+ gr.Markdown("### πŸ† Scientific Evaluation Results")
801
 
802
  with gr.Row():
803
  evaluation_output = gr.Markdown()
804
 
805
  with gr.Row():
806
  with gr.Column():
807
+ submission_plot = gr.Plot(label="πŸ“ˆ Submission Analysis")
808
  with gr.Column():
809
+ cross_track_plot = gr.Plot(label="πŸ”„ Cross-Track Analysis")
810
 
811
  with gr.Row():
812
+ results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard (Google-Comparable Track)", interactive=False)
813
 
814
+ # Tab 3: Google-Comparable Track
815
+ with gr.Tab("πŸ€– Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
816
+ gr.Markdown(f"""
817
+ ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
818
+
819
+ **Fair comparison with commercial translation systems**
820
+
821
+ This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
822
+ enabling direct comparison with commercial baselines.
823
+
824
+ - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
825
+ - **Purpose**: Commercial system comparison and baseline establishment
826
+ - **Statistical Power**: High (optimized sample sizes)
827
+ """)
828
+
829
  with gr.Row():
830
+ with gr.Column(scale=2):
831
+ google_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
 
 
 
832
  with gr.Column(scale=1):
833
+ google_category = gr.Dropdown(
834
+ label="🏷️ Category Filter",
835
+ choices=["all"] + list(MODEL_CATEGORIES.keys()),
836
  value="all"
837
  )
838
  with gr.Column(scale=1):
839
+ google_adequacy = gr.Slider(
840
+ label="πŸ“Š Min Adequacy",
841
+ minimum=0.0, maximum=1.0, value=0.0, step=0.1
842
+ )
843
+ with gr.Column(scale=1):
844
+ google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
845
+
846
+ with gr.Row():
847
+ google_stats = gr.Markdown()
848
+
849
+ with gr.Row():
850
+ with gr.Column():
851
+ google_ranking_plot = gr.Plot(label="πŸ† Google-Comparable Rankings")
852
+ with gr.Column():
853
+ google_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
854
+
855
+ with gr.Row():
856
+ google_leaderboard = gr.Dataframe(label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False)
857
+
858
+ # Tab 4: UG40-Complete Track
859
+ with gr.Tab("🌍 UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
860
+ gr.Markdown(f"""
861
+ ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
862
+
863
+ **Comprehensive evaluation across all Ugandan languages**
864
+
865
+ This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
866
+ providing the most comprehensive assessment of Ugandan language translation capabilities.
867
+
868
+ - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
869
+ - **Purpose**: Comprehensive Ugandan language capability assessment
870
+ - **Coverage**: Complete linguistic landscape of Uganda
871
+ """)
872
+
873
+ with gr.Row():
874
+ with gr.Column(scale=2):
875
+ ug40_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
876
+ with gr.Column(scale=1):
877
+ ug40_category = gr.Dropdown(
878
+ label="🏷️ Category Filter",
879
+ choices=["all"] + list(MODEL_CATEGORIES.keys()),
880
+ value="all"
881
  )
882
  with gr.Column(scale=1):
883
+ ug40_adequacy = gr.Slider(
884
+ label="πŸ“Š Min Adequacy",
885
+ minimum=0.0, maximum=1.0, value=0.0, step=0.1
886
  )
887
+ with gr.Column(scale=1):
888
+ ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
889
 
890
  with gr.Row():
891
+ ug40_stats = gr.Markdown()
892
 
893
  with gr.Row():
894
+ with gr.Column():
895
+ ug40_ranking_plot = gr.Plot(label="πŸ† UG40-Complete Rankings")
896
+ with gr.Column():
897
+ ug40_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
898
+
899
+ with gr.Row():
900
+ ug40_leaderboard = gr.Dataframe(label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False)
901
+
902
+ # Tab 5: Language-Pair Matrix
903
+ with gr.Tab("πŸ“Š Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
904
+ gr.Markdown(f"""
905
+ ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
906
+
907
+ **Detailed language pair analysis with statistical significance**
908
+
909
+ This view provides granular analysis of model performance across individual language pairs
910
+ with statistical significance testing and effect size analysis.
911
+
912
+ - **Resolution**: Individual language pair performance
913
+ - **Purpose**: Detailed linguistic analysis and model diagnostics
914
+ - **Statistics**: Pairwise significance testing available
915
+ """)
916
+
917
+ with gr.Row():
918
+ with gr.Column(scale=2):
919
+ matrix_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
920
+ with gr.Column(scale=1):
921
+ matrix_category = gr.Dropdown(
922
+ label="🏷️ Category Filter",
923
+ choices=["all"] + list(MODEL_CATEGORIES.keys()),
924
+ value="all"
925
+ )
926
+ with gr.Column(scale=1):
927
+ matrix_adequacy = gr.Slider(
928
+ label="πŸ“Š Min Adequacy",
929
+ minimum=0.0, maximum=1.0, value=0.0, step=0.1
930
+ )
931
+ with gr.Column(scale=1):
932
+ matrix_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
933
+
934
+ with gr.Row():
935
+ matrix_stats = gr.Markdown()
936
 
937
  with gr.Row():
938
  with gr.Column():
939
+ matrix_ranking_plot = gr.Plot(label="πŸ† Language-Pair Matrix Rankings")
940
  with gr.Column():
941
+ matrix_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
942
 
943
  with gr.Row():
944
+ matrix_leaderboard = gr.Dataframe(label="πŸ“ˆ Language-Pair Matrix Leaderboard", interactive=False)
945
+
946
+ # Tab 6: Model Analysis
947
+ with gr.Tab("πŸ” Scientific Model Analysis", id="analysis"):
948
+ gr.Markdown("""
949
+ ## πŸ”¬ Detailed Scientific Model Analysis
950
+
951
+ Comprehensive analysis of individual models with statistical confidence intervals,
952
+ cross-track performance, and detailed language pair breakdowns.
953
+ """)
954
+
955
  with gr.Row():
956
+ with gr.Column(scale=2):
957
+ model_select = gr.Dropdown(
958
+ label="πŸ€– Select Model",
959
+ choices=[],
960
+ value=None,
961
+ info="Choose a model for detailed scientific analysis"
962
+ )
963
+ with gr.Column(scale=1):
964
+ track_select = gr.Dropdown(
965
+ label="🏁 Analysis Track",
966
+ choices=list(EVALUATION_TRACKS.keys()),
967
+ value="google_comparable",
968
+ info="Track for detailed analysis"
969
+ )
970
+ with gr.Column(scale=1):
971
+ analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
972
 
973
  with gr.Row():
974
  model_details = gr.Markdown()
975
 
976
  with gr.Row():
977
+ with gr.Column():
978
+ model_analysis_plot = gr.Plot(label="πŸ“Š Detailed Performance Analysis")
979
+ with gr.Column():
980
+ model_heatmap_plot = gr.Plot(label="πŸ—ΊοΈ Language Pair Heatmap")
981
+
982
+ # Tab 7: Model Comparison
983
+ with gr.Tab("βš–οΈ Scientific Model Comparison", id="comparison"):
984
+ gr.Markdown("""
985
+ ## πŸ”¬ Scientific Model Comparison
986
+
987
+ Compare multiple models with statistical significance testing and fair comparison analysis.
988
+ Only models evaluated on the same language pairs are compared for scientific validity.
989
+ """)
990
+
991
+ with gr.Row():
992
+ with gr.Column(scale=2):
993
+ comparison_models = gr.CheckboxGroup(
994
+ label="πŸ€– Select Models to Compare",
995
+ choices=[],
996
+ value=[],
997
+ info="Select 2-6 models for comparison"
998
+ )
999
+ with gr.Column(scale=1):
1000
+ comparison_track = gr.Dropdown(
1001
+ label="🏁 Comparison Track",
1002
+ choices=list(EVALUATION_TRACKS.keys()),
1003
+ value="google_comparable"
1004
+ )
1005
+ comparison_type = gr.Radio(
1006
+ label="πŸ“Š Comparison Type",
1007
+ choices=["statistical", "category"],
1008
+ value="statistical"
1009
+ )
1010
+ compare_btn = gr.Button("βš–οΈ Compare Models", variant="primary")
1011
+
1012
+ with gr.Row():
1013
+ comparison_output = gr.Markdown()
1014
+
1015
+ with gr.Row():
1016
+ comparison_plot = gr.Plot(label="πŸ“Š Model Comparison Analysis")
1017
 
1018
+ # Tab 8: Documentation
1019
+ with gr.Tab("πŸ“š Scientific Documentation", id="docs"):
1020
  gr.Markdown(f"""
1021
+ # 📖 SALT Translation Leaderboard - Scientific Edition Documentation
1022
 
1023
  ## 🎯 Overview
1024
 
1025
+ The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
1026
+ for translation models on Ugandan languages, designed for research publication and scientific analysis.
1027
+
1028
+ ## 🔬 Scientific Methodology
1029
+
1030
+ ### Three-Tier Evaluation System
1031
+
1032
+ **1. 🤖 Google-Comparable Track**
1033
+ - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
1034
+ - **Pairs**: {len(get_google_comparable_pairs())} language pairs
1035
+ - **Purpose**: Fair comparison with commercial translation systems
1036
+ - **Statistical Power**: High (≥200 samples per pair recommended)
1037
+
1038
+ **2. 🌍 UG40-Complete Track**
1039
+ - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
1040
+ - **Pairs**: {len(get_all_language_pairs())} language pairs
1041
+ - **Purpose**: Comprehensive Ugandan language capability assessment
1042
+ - **Statistical Power**: Moderate (≥100 samples per pair recommended)
1043
+
1044
+ **3. 📊 Language-Pair Matrix**
1045
+ - **Resolution**: Individual language pair analysis
1046
+ - **Purpose**: Detailed linguistic analysis and model diagnostics
1047
+ - **Statistics**: Pairwise significance testing with multiple comparison correction
1048
+
1049
+ ### Statistical Rigor
1050
 
1051
+ - **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
1052
+ - **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
1053
+ - **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
1054
+ - **Statistical Power**: Estimated based on sample sizes and effect sizes
1055
 
1056
+ ### Model Categories
 
1057
 
1058
+ Models are automatically categorized for fair comparison:
1059
+
1060
+ - **🏢 Commercial**: Production translation systems (Google Translate, Azure, etc.)
1061
+ - **🔬 Research**: Academic and research institution models (NLLB, M2M-100, etc.)
1062
+ - **📊 Baseline**: Simple baseline and reference models
1063
+ - **👥 Community**: User-submitted models and fine-tuned variants
1064
 
1065
  ## πŸ“Š Evaluation Metrics
1066
 
1067
  ### Primary Metrics
1068
+ - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
1069
+ - **BLEU**: Bilingual Evaluation Understudy (0-100)
1070
+ - **ChrF**: Character-level F-score (0-1)
1071
 
1072
  ### Secondary Metrics
1073
+ - **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
1074
+ - **CER/WER**: Character/Word Error Rate (lower is better)
1075
  - **Length Ratio**: Prediction/reference length ratio
1076
 
1077
+ All metrics include 95% confidence intervals for statistical reliability.
1078
+
1079
  ## πŸ”„ Submission Process
1080
 
1081
+ ### Step 1: Download Scientific Test Set
1082
+ 1. Click "Download Scientific Test Set" in the first tab
1083
+ 2. Review test set adequacy and track breakdown
1084
+ 3. Save the enhanced test set with statistical weights
1085
 
1086
  ### Step 2: Generate Predictions
1087
+ 1. Load the test set in your evaluation pipeline
1088
  2. For each row, translate `source_text` from `source_language` to `target_language`
1089
  3. Save results as CSV with columns: `sample_id`, `prediction`
1090
+ 4. Optional: Add `category` column for automatic classification
1091
 
1092
  ### Step 3: Submit & Evaluate
1093
+ 1. Fill in detailed model information (improves categorization)
1094
+ 2. Upload your predictions file
1095
+ 3. Review validation report with track-specific adequacy assessment
1096
+ 4. Submit for scientific evaluation across all tracks
1097
 
1098
+ ## πŸ“‹ Enhanced File Formats
1099
 
1100
+ ### Scientific Test Set Format
1101
  ```csv
1102
+ sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
1103
+ salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
1104
+ salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
1105
+ salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
1106
  ```
1107
 
1108
  ### Predictions Format
1109
  ```csv
1110
+ sample_id,prediction,category
1111
+ salt_000001,"Amakuru ensi","community"
1112
+ salt_000002,"Ibino nining?","community"
1113
+ salt_000003,"Ejok nanu","community"
1114
  ```
1115
 
1116
+ ## 🏆 Scientific Leaderboard Features
1117
+
1118
+ ### Fair Comparison
1119
+ - Models only compared within the same category and track
1120
+ - Statistical significance testing prevents misleading rankings
1121
+ - Confidence intervals show measurement uncertainty
1122
+
1123
+ ### Cross-Track Analysis
1124
+ - Consistency analysis across evaluation tracks
1125
+ - Identification of model strengths and weaknesses
1126
+ - Language-specific performance patterns
1127
 
1128
+ ### Publication Quality
1129
+ - All visualizations include error bars and statistical annotations
1130
+ - Comprehensive methodology documentation
1131
+ - Reproducible evaluation pipeline
1132
 
1133
+ ## 🔬 Statistical Interpretation Guide
 
 
 
1134
 
1135
+ ### Confidence Intervals
1136
+ - **Non-overlapping CIs**: Likely significant difference
1137
+ - **Overlapping CIs**: May or may not be significant (requires formal testing)
1138
+ - **Wide CIs**: High uncertainty (need more data)
1139
 
1140
+ ### Effect Sizes
1141
+ - **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
1142
+ - **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
1143
+ - **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
1144
+ - **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
1145
 
1146
+ ### Statistical Adequacy
1147
+ - **Excellent**: High statistical power (>0.8) for all comparisons
1148
+ - **Good**: Adequate power for most comparisons
1149
+ - **Fair**: Limited power, interpret with caution
1150
+ - **Insufficient**: Results not reliable for scientific conclusions
1151
 
1152
+ ## 🤝 Contributing to Science
1153
 
1154
+ This leaderboard is designed for the research community. When using results:
1155
+
1156
+ 1. **Always report confidence intervals** along with point estimates
1157
+ 2. **Acknowledge statistical adequacy** when interpreting results
1158
+ 3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
1159
+ 4. **Consider effect sizes** not just statistical significance
1160
 
1161
  ## πŸ“„ Citation
1162
 
1163
  If you use this leaderboard in your research, please cite:
1164
 
1165
  ```bibtex
1166
+ @misc{{salt_leaderboard_scientific_2024,
1167
+ title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
1168
  author={{Sunbird AI}},
1169
  year={{2024}},
1170
+ url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
1171
+ note={{Three-tier evaluation system with statistical significance testing}}
1172
  }}
1173
  ```
1174
 
1175
  ## πŸ”— Related Resources
1176
 
1177
  - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
1178
+ - **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
1179
+ - **Statistical Methodology**: See our technical paper on rigorous MT evaluation
1180
+ - **Open Source Code**: Available on GitHub for reproducibility
1181
+
1182
+ ---
1183
+
1184
+ *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
1185
  """)
1186
 
1187
+ # Event handlers with enhanced scientific functionality
1188
  predictions_validated = gr.State(value=None)
1189
  validation_info_state = gr.State(value=None)
1190
+ detected_category_state = gr.State(value="community")
1191
 
1192
  # Download test set
1193
  download_btn.click(
1194
+ fn=download_scientific_test_set,
1195
  outputs=[download_file, download_info]
1196
  )
1197
 
1198
  # Validate predictions
1199
def handle_scientific_validation(file, model_name, author, description):
    """Validate an uploaded predictions file and fan the result out to the UI.

    Delegates to ``validate_scientific_submission`` and treats a non-``None``
    predictions object as a successful validation.

    Returns:
        A 5-tuple, in the order the ``validate_btn.click`` outputs expect:
        the validation report, the predictions (or ``None``), an info dict
        with keys ``"category"`` and ``"validation_passed"``, the detected
        category, and a ``gr.update`` that makes the submit button
        interactive only when validation passed.
    """
    outcome = validate_scientific_submission(file, model_name, author, description)
    report_text, parsed_predictions, detected_category = outcome

    passed = parsed_predictions is not None
    info = {"category": detected_category, "validation_passed": passed}
    submit_toggle = gr.update(interactive=passed)

    return report_text, parsed_predictions, info, detected_category, submit_toggle
 
 
 
 
 
 
 
 
1210
 
1211
  validate_btn.click(
1212
+ fn=handle_scientific_validation,
1213
  inputs=[predictions_file, model_name_input, author_input, description_input],
1214
+ outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
1215
  )
1216
 
1217
  # Submit for evaluation
1218
def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
    """Run the scientific evaluation for a previously validated submission.

    Args:
        predictions: Validated predictions held in UI state, or ``None`` when
            validation has not been run or did not pass.
        model_name: Model name entered in the submission form.
        author: Author entered in the submission form.
        description: Free-text model description from the form.
        category: Category detected during validation.
        validation_info: Info dict stored by the validation step.

    Returns:
        Five values, one per output of ``submit_btn.click``. When nothing has
        been validated yet, the first value is an error message and the other
        four are ``None``; otherwise the result of
        ``evaluate_scientific_submission`` is returned unchanged.
    """
    # Guard: evaluation only makes sense after a successful validation pass.
    if predictions is None:
        return "❌ Please validate your submission first", None, None, None, None

    return evaluate_scientific_submission(
        predictions, model_name, author, description, category, validation_info
    )
1225
 
1226
  submit_btn.click(
1227
+ fn=handle_scientific_submission,
1228
+ inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
1229
+ outputs=[evaluation_output, results_table, submission_plot, cross_track_plot, current_leaderboard]
1230
  )
1231
 
1232
+ # Track leaderboard refresh functions
1233
def refresh_google_track(*filter_values):
    """Refresh the ``google_comparable`` track, forwarding the filter values."""
    track_key = "google_comparable"
    return refresh_track_leaderboard(track_key, *filter_values)
1235
+
1236
def refresh_ug40_track(*filter_values):
    """Refresh the ``ug40_complete`` track, forwarding the filter values."""
    track_key = "ug40_complete"
    return refresh_track_leaderboard(track_key, *filter_values)
 
 
 
 
 
1238
 
1239
def refresh_matrix_track(*filter_values):
    """Refresh the ``language_pair_matrix`` track, forwarding the filter values."""
    track_key = "language_pair_matrix"
    return refresh_track_leaderboard(track_key, *filter_values)
1241
+
1242
+ # Google-Comparable Track
1243
+ google_refresh.click(
1244
+ fn=refresh_google_track,
1245
+ inputs=[google_search, google_category, google_adequacy],
1246
+ outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
1247
  )
1248
 
1249
+ # UG40-Complete Track
1250
+ ug40_refresh.click(
1251
+ fn=refresh_ug40_track,
1252
+ inputs=[ug40_search, ug40_category, ug40_adequacy],
1253
+ outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
1254
+ )
1255
+
1256
+ # Language-Pair Matrix Track
1257
+ matrix_refresh.click(
1258
+ fn=refresh_matrix_track,
1259
+ inputs=[matrix_search, matrix_category, matrix_adequacy],
1260
+ outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
1261
+ )
1262
 
1263
  # Model analysis
1264
  analyze_btn.click(
1265
+ fn=get_scientific_model_details,
1266
+ inputs=[model_select, track_select],
1267
+ outputs=[model_details, model_analysis_plot, model_heatmap_plot]
1268
+ )
1269
+
1270
+ # Model comparison
1271
+ compare_btn.click(
1272
+ fn=perform_model_comparison,
1273
+ inputs=[comparison_models, comparison_track, comparison_type],
1274
+ outputs=[comparison_output, comparison_plot]
1275
  )
1276
 
1277
+ # Update dropdown choices when leaderboard changes
1278
def update_dropdown_choices():
    """Build refreshed model selectors from the cached leaderboard.

    Reads the module-level ``current_leaderboard``; when it is missing or
    empty the selectors are given an empty choice list.

    Returns:
        A ``(gr.Dropdown, gr.CheckboxGroup)`` pair that share the same list
        of model names taken from the ``'model_name'`` column.
    """
    board = current_leaderboard
    has_models = board is not None and not board.empty
    model_choices = board['model_name'].tolist() if has_models else []

    dropdown = gr.Dropdown(choices=model_choices)
    checkbox_group = gr.CheckboxGroup(choices=model_choices)
    return dropdown, checkbox_group
1288
+
1289
+ # Load initial data and update dropdowns
1290
# Populate the default view on page load: the Google-Comparable leaderboard
# widgets plus the model selectors used by the analysis and comparison tabs.
# Gradio expects `outputs` to be a flat list of components and the callback to
# return one value per component, so both helper results are unpacked into a
# single flat tuple rather than nested as (tuple, tuple) / [[...], [...]].
demo.load(
    fn=lambda: (
        *refresh_google_track("", "all", 0.0),
        *update_dropdown_choices(),
    ),
    outputs=[
        google_leaderboard,
        google_ranking_plot,
        google_comparison_plot,
        google_stats,
        model_select,
        comparison_models,
    ],
)
1300
 
1301
+ # Launch the scientific application
1302
  if __name__ == "__main__":
1303
  demo.launch(
1304
  server_name="0.0.0.0",