akera committed on
Commit
aed11c8
·
verified ·
1 Parent(s): 988dfa3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +447 -286
app.py CHANGED
@@ -4,52 +4,64 @@ import sys
4
  import os
5
  from pathlib import Path
6
 
 
7
  def setup_salt():
8
  """Clone and setup SALT library like in Colab."""
9
  try:
10
  # Check if salt is already available
11
  import salt.dataset
 
12
  print("βœ… SALT library already available")
13
  return True
14
  except ImportError:
15
  pass
16
-
17
  print("πŸ“₯ Setting up SALT library...")
18
-
19
  try:
20
  # Clone SALT repo if not exists
21
  salt_dir = Path("salt")
22
  if not salt_dir.exists():
23
  print("πŸ”„ Cloning SALT repository...")
24
- subprocess.check_call([
25
- "git", "clone", "https://github.com/sunbirdai/salt.git"
26
- ])
27
  else:
28
  print("πŸ“ SALT repository already exists")
29
-
30
  # Install SALT requirements
31
  salt_requirements = salt_dir / "requirements.txt"
32
  if salt_requirements.exists():
33
  print("πŸ“¦ Installing SALT requirements...")
34
- subprocess.check_call([
35
- sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
36
- ])
37
-
 
 
 
 
 
 
 
 
38
  # Add SALT directory to Python path
39
  salt_path = str(salt_dir.absolute())
40
  if salt_path not in sys.path:
41
  sys.path.insert(0, salt_path)
42
  print(f"πŸ”— Added {salt_path} to Python path")
43
-
44
  # Test import
45
  import salt.dataset
 
46
  print("βœ… SALT library setup completed successfully")
47
  return True
48
-
49
  except Exception as e:
50
  print(f"❌ Failed to setup SALT: {e}")
51
  return False
52
 
 
53
  # Setup SALT on startup
54
  print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
55
  if not setup_salt():
@@ -66,42 +78,42 @@ from typing import Optional, Dict, Tuple, List
66
 
67
  # Import our enhanced modules
68
  from src.test_set import (
69
- get_public_test_set_scientific,
70
  get_complete_test_set_scientific,
71
- create_test_set_download_scientific,
72
  validate_test_set_integrity_scientific,
73
- get_track_test_set
74
  )
75
  from src.validation import validate_submission_scientific
76
  from src.evaluation import (
77
- evaluate_predictions_scientific,
78
  generate_scientific_report,
79
- compare_models_statistically
80
  )
81
  from src.leaderboard import (
82
- load_scientific_leaderboard,
83
  add_model_to_scientific_leaderboard,
84
- get_scientific_leaderboard_stats,
85
  get_track_leaderboard,
86
  prepare_track_leaderboard_display,
87
  perform_fair_comparison,
88
- export_scientific_leaderboard
89
  )
90
  from src.plotting import (
91
- create_scientific_leaderboard_plot,
92
  create_language_pair_heatmap_scientific,
93
  create_statistical_comparison_plot,
94
  create_category_comparison_plot,
95
  create_adequacy_analysis_plot,
96
  create_cross_track_analysis_plot,
97
- create_scientific_model_detail_plot
98
  )
99
  from src.utils import (
100
- sanitize_model_name,
101
- get_all_language_pairs,
102
  get_google_comparable_pairs,
103
  get_track_language_pairs,
104
- format_metric_value
105
  )
106
  from config import *
107
 
@@ -111,60 +123,64 @@ public_test_set = None
111
  complete_test_set = None
112
  test_set_stats = None
113
 
 
114
  def initialize_scientific_data():
115
  """Initialize scientific test sets and leaderboard data."""
116
  global public_test_set, complete_test_set, current_leaderboard, test_set_stats
117
-
118
  try:
119
  print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
120
-
121
  # Load scientific test sets
122
  print("πŸ“₯ Loading scientific test sets...")
123
  public_test_set = get_public_test_set_scientific()
124
  complete_test_set = get_complete_test_set_scientific()
125
-
126
  # Load scientific leaderboard
127
  print("πŸ† Loading scientific leaderboard...")
128
  current_leaderboard = load_scientific_leaderboard()
129
-
130
  # Validate test set integrity
131
  print("πŸ” Validating test set integrity...")
132
  test_set_stats = validate_test_set_integrity_scientific()
133
-
134
  print(f"βœ… Scientific initialization complete!")
135
  print(f" - Test set: {len(public_test_set):,} samples")
136
  print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
137
- print(f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
 
 
138
  print(f" - Current models: {len(current_leaderboard)}")
139
-
140
  return True
141
-
142
  except Exception as e:
143
  print(f"❌ Scientific initialization failed: {e}")
144
  traceback.print_exc()
145
  return False
146
 
 
147
  def download_scientific_test_set() -> Tuple[str, str]:
148
  """Create downloadable scientific test set and return file path and info."""
149
-
150
  try:
151
  global public_test_set
152
  if public_test_set is None:
153
  public_test_set = get_public_test_set_scientific()
154
-
155
  # Create download file
156
  download_path, stats = create_test_set_download_scientific()
157
-
158
  # Create comprehensive info message
159
- adequacy = stats.get('adequacy_assessment', 'unknown')
160
  adequacy_emoji = {
161
- 'excellent': '🟒',
162
- 'good': '🟑',
163
- 'fair': '🟠',
164
- 'insufficient': 'πŸ”΄',
165
- 'unknown': 'βšͺ'
166
- }.get(adequacy, 'βšͺ')
167
-
168
  info_msg = f"""
169
  ## πŸ“₯ SALT Scientific Test Set Downloaded Successfully!
170
 
@@ -182,10 +198,12 @@ def download_scientific_test_set() -> Tuple[str, str]:
182
 
183
  ### 🏁 Track Breakdown:
184
  """
185
-
186
- track_breakdown = stats.get('track_breakdown', {})
187
  for track_name, track_info in track_breakdown.items():
188
- status_emoji = 'βœ…' if track_info.get('statistical_adequacy', False) else '⚠️'
 
 
189
  info_msg += f"""
190
  **{status_emoji} {track_info.get('name', track_name)}**:
191
  - Samples: {track_info.get('total_samples', 0):,}
@@ -193,7 +211,7 @@ def download_scientific_test_set() -> Tuple[str, str]:
193
  - Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
194
  - Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
195
  """
196
-
197
  info_msg += f"""
198
 
199
  ### πŸ“‹ Enhanced File Format:
@@ -219,18 +237,19 @@ def download_scientific_test_set() -> Tuple[str, str]:
219
  - Provide detailed model description for proper categorization
220
  - Consider submitting to multiple tracks for comprehensive evaluation
221
  """
222
-
223
  return download_path, info_msg
224
-
225
  except Exception as e:
226
  error_msg = f"❌ Error creating scientific test set download: {str(e)}"
227
  return None, error_msg
228
 
 
229
  def validate_scientific_submission(
230
  file, model_name: str, author: str, description: str
231
  ) -> Tuple[str, Optional[pd.DataFrame], str]:
232
  """Validate uploaded prediction file with scientific rigor."""
233
-
234
  try:
235
  if file is None:
236
  return "❌ Please upload a predictions file", None, "community"
@@ -270,9 +289,13 @@ def validate_scientific_submission(
270
  )
271
 
272
  detected_category = validation_result.get("category", "community")
273
-
274
  if validation_result["valid"]:
275
- return validation_result["report"], validation_result["predictions"], detected_category
 
 
 
 
276
  else:
277
  return validation_result["report"], None, detected_category
278
 
@@ -280,9 +303,10 @@ def validate_scientific_submission(
280
  return (
281
  f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
282
  None,
283
- "community"
284
  )
285
 
 
286
  def evaluate_scientific_submission(
287
  predictions_df: pd.DataFrame,
288
  model_name: str,
@@ -292,26 +316,33 @@ def evaluate_scientific_submission(
292
  validation_info: Dict,
293
  ) -> Tuple[str, pd.DataFrame, object, object]:
294
  """Evaluate validated predictions using scientific methodology."""
295
-
296
  try:
297
  if predictions_df is None:
298
  return "❌ No valid predictions to evaluate", None, None, None
299
-
300
  # Get complete test set with targets
301
  global complete_test_set, current_leaderboard
302
  if complete_test_set is None:
303
  complete_test_set = get_complete_test_set_scientific()
304
-
305
  # Run scientific evaluation across all tracks
306
  print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
307
  evaluation_results = evaluate_predictions_scientific(
308
  predictions_df, complete_test_set, detected_category
309
  )
310
-
311
- if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
312
- errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
 
 
 
 
 
 
 
313
  return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
314
-
315
  # Add to scientific leaderboard
316
  print("πŸ† Adding to scientific leaderboard...")
317
  updated_leaderboard = add_model_to_scientific_leaderboard(
@@ -319,23 +350,27 @@ def evaluate_scientific_submission(
319
  author=author or "Anonymous",
320
  evaluation_results=evaluation_results,
321
  model_category=detected_category,
322
- description=description or ""
323
  )
324
-
325
  # Update global leaderboard
326
  current_leaderboard = updated_leaderboard
327
-
328
  # Generate scientific report
329
  report = generate_scientific_report(evaluation_results, model_name)
330
-
331
  # Create visualizations
332
  summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
333
  cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
334
-
335
  # Prepare display leaderboard (Google-comparable track by default)
336
- google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
337
- display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
338
-
 
 
 
 
339
  # Format success message with track-specific results
340
  success_msg = f"""
341
  ## πŸŽ‰ Scientific Evaluation Complete!
@@ -347,28 +382,33 @@ def evaluate_scientific_submission(
347
 
348
  ### πŸ† Track Performance Summary:
349
  """
350
-
351
- tracks = evaluation_results.get('tracks', {})
352
  for track_name, track_data in tracks.items():
353
- if not track_data.get('error'):
354
  track_config = EVALUATION_TRACKS[track_name]
355
- track_averages = track_data.get('track_averages', {})
356
- summary = track_data.get('summary', {})
357
-
358
  # Get rank in this track
359
- track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
 
 
360
  if not track_leaderboard.empty:
361
- model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
 
 
 
362
  rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
363
  total_models = len(track_leaderboard)
364
  else:
365
  rank = "N/A"
366
  total_models = 0
367
-
368
- quality_score = track_averages.get('quality_score', 0)
369
- bleu_score = track_averages.get('bleu', 0)
370
- samples = summary.get('total_samples', 0)
371
-
372
  success_msg += f"""
373
  **🏁 {track_config['name']}**:
374
  - Rank: #{rank} out of {total_models} models
@@ -376,7 +416,7 @@ def evaluate_scientific_submission(
376
  - BLEU: {bleu_score:.2f}
377
  - Samples: {samples:,}
378
  """
379
-
380
  success_msg += f"""
381
 
382
  ### πŸ”¬ Scientific Adequacy:
@@ -386,52 +426,57 @@ def evaluate_scientific_submission(
386
 
387
  {report}
388
  """
389
-
390
  return success_msg, display_leaderboard, summary_plot, cross_track_plot
391
 
392
  except Exception as e:
393
  error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
394
  return error_msg, None, None, None
395
 
 
396
  def refresh_track_leaderboard(
397
  track: str,
398
  search_query: str = "",
399
  category_filter: str = "all",
400
  min_adequacy: float = 0.0,
401
- show_ci: bool = True
402
  ) -> Tuple[pd.DataFrame, object, object, str]:
403
  """Refresh leaderboard for a specific track with filters."""
404
-
405
  try:
406
  global current_leaderboard
407
  if current_leaderboard is None:
408
  current_leaderboard = load_scientific_leaderboard()
409
-
410
  # Get track-specific leaderboard
411
  track_leaderboard = get_track_leaderboard(
412
- current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
 
 
 
413
  )
414
-
415
  # Apply search filter
416
  if search_query:
417
  query_lower = search_query.lower()
418
- mask = (
419
- track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
420
- track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
 
421
  )
422
  track_leaderboard = track_leaderboard[mask]
423
-
424
  # Prepare for display
425
  display_df = prepare_track_leaderboard_display(track_leaderboard, track)
426
-
427
  # Create plots
428
  ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
429
  comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
430
-
431
  # Get track statistics
432
  track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
433
  track_config = EVALUATION_TRACKS[track]
434
-
435
  stats_text = f"""
436
  ### πŸ“Š {track_config['name']} Statistics
437
 
@@ -447,46 +492,51 @@ def refresh_track_leaderboard(
447
  - Statistical adequacy verified for reliable comparisons
448
  - {track_config['description']}
449
  """
450
-
451
  return display_df, ranking_plot, comparison_plot, stats_text
452
-
453
  except Exception as e:
454
  error_msg = f"Error loading {track} leaderboard: {str(e)}"
455
  empty_df = pd.DataFrame()
456
  return empty_df, None, None, error_msg
457
 
458
- def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
 
 
 
459
  """Get detailed scientific analysis for a specific model."""
460
-
461
  try:
462
  global current_leaderboard
463
  if current_leaderboard is None:
464
  return "Leaderboard not loaded", None, None
465
-
466
  # Find model
467
- model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
468
-
469
  if model_row.empty:
470
  return f"Model '{model_name}' not found", None, None
471
-
472
  model_info = model_row.iloc[0]
473
-
474
  # Parse detailed metrics for the requested track
475
  try:
476
- detailed_results = json.loads(model_info[f'detailed_{track}'])
477
  except:
478
  detailed_results = {}
479
-
480
  # Create detailed plots
481
- detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
482
-
 
 
483
  # Create language pair heatmap
484
  heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
485
-
486
  # Format model details with scientific information
487
  track_config = EVALUATION_TRACKS[track]
488
- category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
489
-
490
  # Extract track-specific metrics
491
  quality_col = f"{track}_quality"
492
  bleu_col = f"{track}_bleu"
@@ -496,7 +546,7 @@ def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, obje
496
  samples_col = f"{track}_samples"
497
  pairs_col = f"{track}_pairs"
498
  adequate_col = f"{track}_adequate"
499
-
500
  details_text = f"""
501
  ## πŸ”¬ Scientific Model Analysis: {model_name}
502
 
@@ -523,19 +573,19 @@ def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, obje
523
 
524
  ### πŸ“ˆ Cross-Track Performance:
525
  """
526
-
527
  # Add other track performances for comparison
528
  for other_track in EVALUATION_TRACKS.keys():
529
  if other_track != track:
530
  other_quality_col = f"{other_track}_quality"
531
  other_adequate_col = f"{other_track}_adequate"
532
-
533
  if model_info.get(other_adequate_col, False):
534
  other_quality = model_info.get(other_quality_col, 0)
535
  details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
536
  else:
537
  details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
538
-
539
  details_text += f"""
540
 
541
  ### πŸ’‘ Scientific Interpretation:
@@ -544,44 +594,47 @@ def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, obje
544
  - Cross-track analysis reveals model strengths across different language sets
545
  - Category classification helps contextualize performance expectations
546
  """
547
-
548
  return details_text, detail_plot, heatmap_plot
549
-
550
  except Exception as e:
551
  error_msg = f"Error getting model details: {str(e)}"
552
  return error_msg, None, None
553
 
 
554
  def perform_model_comparison(
555
  model_names: List[str], track: str, comparison_type: str = "statistical"
556
  ) -> Tuple[str, object]:
557
  """Perform scientific comparison between selected models."""
558
-
559
  try:
560
  global current_leaderboard
561
  if current_leaderboard is None:
562
  return "Leaderboard not loaded", None
563
-
564
  if len(model_names) < 2:
565
  return "Please select at least 2 models for comparison", None
566
-
567
  # Get models
568
- models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
569
-
 
 
570
  if len(models) < 2:
571
  return "Selected models not found in leaderboard", None
572
-
573
  # Perform fair comparison
574
  comparison_result = perform_fair_comparison(current_leaderboard, model_names)
575
-
576
- if comparison_result.get('error'):
577
  return f"Comparison error: {comparison_result['error']}", None
578
-
579
  # Create comparison visualization
580
  if comparison_type == "statistical":
581
  comparison_plot = create_statistical_comparison_plot(models, track)
582
  else:
583
  comparison_plot = create_category_comparison_plot(models, track)
584
-
585
  # Format comparison report
586
  track_config = EVALUATION_TRACKS[track]
587
  comparison_text = f"""
@@ -589,26 +642,26 @@ def perform_model_comparison(
589
 
590
  ### πŸ“Š Models Compared:
591
  """
592
-
593
  quality_col = f"{track}_quality"
594
  ci_lower_col = f"{track}_ci_lower"
595
  ci_upper_col = f"{track}_ci_upper"
596
-
597
  # Sort models by performance
598
  models_sorted = models.sort_values(quality_col, ascending=False)
599
-
600
  for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
601
- category_info = MODEL_CATEGORIES.get(model['model_category'], {})
602
-
603
  comparison_text += f"""
604
  **#{i}. {model['model_name']}**
605
  - Category: {category_info.get('name', 'Unknown')}
606
  - Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
607
  - Author: {model['author']}
608
  """
609
-
610
  # Add statistical analysis
611
- track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
612
  if track_comparison:
613
  comparison_text += f"""
614
 
@@ -617,29 +670,32 @@ def perform_model_comparison(
617
  - **Confidence intervals available**: Yes (95% level)
618
  - **Fair comparison possible**: {'βœ… Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
619
  """
620
-
621
  # Check for statistical significance (simplified)
622
- quality_scores = list(track_comparison.get('quality_scores', {}).values())
623
  if len(quality_scores) >= 2:
624
  score_range = max(quality_scores) - min(quality_scores)
625
  if score_range > 0.05: # 5% difference threshold
626
- comparison_text += "- **Performance differences**: Potentially significant\n"
 
 
627
  else:
628
  comparison_text += "- **Performance differences**: Minimal\n"
629
-
630
  # Add recommendations
631
- recommendations = comparison_result.get('recommendations', [])
632
  if recommendations:
633
  comparison_text += "\n### πŸ’‘ Recommendations:\n"
634
  for rec in recommendations:
635
  comparison_text += f"- {rec}\n"
636
-
637
  return comparison_text, comparison_plot
638
-
639
  except Exception as e:
640
  error_msg = f"Error performing comparison: {str(e)}"
641
  return error_msg, None
642
 
 
643
  # Initialize data on startup
644
  print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
645
  initialization_success = initialize_scientific_data()
@@ -698,31 +754,36 @@ with gr.Blocks(
698
  .adequacy-good { border-left-color: #eab308; }
699
  .adequacy-fair { border-left-color: #f97316; }
700
  .adequacy-insufficient { border-left-color: #ef4444; }
701
- """
702
  ) as demo:
703
-
704
  # Scientific Header
705
- gr.HTML(f"""
 
706
  <div class="scientific-header">
707
  <h1>πŸ† SALT Translation Leaderboard - Scientific Edition</h1>
708
  <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
709
  <p>Three-tier evaluation tracks β€’ 95% Confidence intervals β€’ Research-grade analysis</p>
710
  <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
711
  </div>
712
- """)
713
-
 
714
  # Status indicator
715
  if initialization_success:
716
  status_msg = "βœ… Scientific system initialized successfully"
717
- adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
 
 
718
  status_msg += f" | Test set adequacy: {adequacy_info.title()}"
719
  else:
720
  status_msg = "❌ System initialization failed - some features may not work"
721
-
722
  gr.Markdown(f"**System Status**: {status_msg}")
723
-
724
  # Add scientific overview
725
- gr.Markdown("""
 
726
  ## πŸ”¬ Scientific Evaluation Framework
727
 
728
  This leaderboard implements rigorous scientific methodology for translation model evaluation:
@@ -731,89 +792,110 @@ with gr.Blocks(
731
  - **Statistical Significance**: 95% confidence intervals and effect size analysis
732
  - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
733
  - **Cross-Track Consistency**: Validate model performance across language sets
734
- """)
 
735
 
736
  with gr.Tabs():
737
-
738
  # Tab 1: Download Test Set
739
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
740
- gr.Markdown("""
 
741
  ## πŸ“‹ Get the SALT Scientific Test Set
742
 
743
  Download our scientifically designed test set with stratified sampling and statistical weighting.
744
- """)
745
-
 
746
  with gr.Row():
747
- download_btn = gr.Button("πŸ“₯ Download Scientific Test Set", variant="primary", size="lg")
748
-
 
 
749
  with gr.Row():
750
  with gr.Column():
751
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
752
  with gr.Column():
753
  download_info = gr.Markdown(label="ℹ️ Test Set Information")
754
-
755
- # Tab 2: Submit Predictions
756
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
757
- gr.Markdown("""
 
758
  ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
759
 
760
  Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
761
- """)
762
-
 
763
  with gr.Row():
764
  with gr.Column(scale=1):
765
  gr.Markdown("### πŸ“ Model Information")
766
-
767
  model_name_input = gr.Textbox(
768
  label="πŸ€– Model Name",
769
  placeholder="e.g., MyTranslator-v2.0",
770
- info="Unique name for your model"
771
  )
772
-
773
  author_input = gr.Textbox(
774
- label="πŸ‘€ Author/Organization",
775
  placeholder="Your name or organization",
776
- value="Anonymous"
777
  )
778
-
779
  description_input = gr.Textbox(
780
  label="πŸ“„ Model Description",
781
  placeholder="Architecture, training data, special features...",
782
  lines=4,
783
- info="Detailed description helps with proper categorization"
784
  )
785
-
786
  gr.Markdown("### πŸ“€ Upload Predictions")
787
  predictions_file = gr.File(
788
  label="πŸ“‚ Predictions File",
789
- file_types=[".csv", ".tsv", ".json"]
790
  )
791
-
792
- validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
793
- submit_btn = gr.Button("πŸš€ Submit for Scientific Evaluation", variant="primary", interactive=False)
794
-
 
 
 
 
 
 
795
  with gr.Column(scale=1):
796
  gr.Markdown("### πŸ“Š Validation Results")
797
  validation_output = gr.Markdown()
798
-
799
  # Results section
800
  gr.Markdown("### πŸ† Scientific Evaluation Results")
801
-
802
  with gr.Row():
803
  evaluation_output = gr.Markdown()
804
-
805
  with gr.Row():
806
  with gr.Column():
807
  submission_plot = gr.Plot(label="πŸ“ˆ Submission Analysis")
808
  with gr.Column():
809
  cross_track_plot = gr.Plot(label="πŸ”„ Cross-Track Analysis")
810
-
811
  with gr.Row():
812
- results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard (Google-Comparable Track)", interactive=False)
813
-
 
 
 
814
  # Tab 3: Google-Comparable Track
815
- with gr.Tab("πŸ€– Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
816
- gr.Markdown(f"""
 
 
 
 
 
817
  ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
818
 
819
  **Fair comparison with commercial translation systems**
@@ -824,40 +906,54 @@ with gr.Blocks(
824
  - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
825
  - **Purpose**: Commercial system comparison and baseline establishment
826
  - **Statistical Power**: High (optimized sample sizes)
827
- """)
828
-
 
829
  with gr.Row():
830
  with gr.Column(scale=2):
831
- google_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
 
 
 
832
  with gr.Column(scale=1):
833
  google_category = gr.Dropdown(
834
  label="🏷️ Category Filter",
835
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
836
- value="all"
837
  )
838
  with gr.Column(scale=1):
839
  google_adequacy = gr.Slider(
840
  label="πŸ“Š Min Adequacy",
841
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
 
 
 
842
  )
843
  with gr.Column(scale=1):
844
  google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
845
-
846
  with gr.Row():
847
  google_stats = gr.Markdown()
848
-
849
  with gr.Row():
850
  with gr.Column():
851
  google_ranking_plot = gr.Plot(label="πŸ† Google-Comparable Rankings")
852
  with gr.Column():
853
  google_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
854
-
855
  with gr.Row():
856
- google_leaderboard = gr.Dataframe(label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False)
857
-
 
 
858
  # Tab 4: UG40-Complete Track
859
- with gr.Tab("🌍 UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
860
- gr.Markdown(f"""
 
 
 
 
 
861
  ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
862
 
863
  **Comprehensive evaluation across all Ugandan languages**
@@ -868,40 +964,54 @@ with gr.Blocks(
868
  - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
869
  - **Purpose**: Comprehensive Ugandan language capability assessment
870
  - **Coverage**: Complete linguistic landscape of Uganda
871
- """)
872
-
 
873
  with gr.Row():
874
  with gr.Column(scale=2):
875
- ug40_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
 
 
 
876
  with gr.Column(scale=1):
877
  ug40_category = gr.Dropdown(
878
  label="🏷️ Category Filter",
879
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
880
- value="all"
881
  )
882
  with gr.Column(scale=1):
883
  ug40_adequacy = gr.Slider(
884
  label="πŸ“Š Min Adequacy",
885
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
 
 
 
886
  )
887
  with gr.Column(scale=1):
888
  ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
889
-
890
  with gr.Row():
891
  ug40_stats = gr.Markdown()
892
-
893
  with gr.Row():
894
  with gr.Column():
895
  ug40_ranking_plot = gr.Plot(label="πŸ† UG40-Complete Rankings")
896
  with gr.Column():
897
  ug40_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
898
-
899
  with gr.Row():
900
- ug40_leaderboard = gr.Dataframe(label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False)
901
-
 
 
902
  # Tab 5: Language-Pair Matrix
903
- with gr.Tab("πŸ“Š Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
904
- gr.Markdown(f"""
 
 
 
 
 
905
  ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
906
 
907
  **Detailed language pair analysis with statistical significance**
@@ -912,112 +1022,130 @@ with gr.Blocks(
912
  - **Resolution**: Individual language pair performance
913
  - **Purpose**: Detailed linguistic analysis and model diagnostics
914
  - **Statistics**: Pairwise significance testing available
915
- """)
916
-
 
917
  with gr.Row():
918
  with gr.Column(scale=2):
919
- matrix_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
 
 
 
920
  with gr.Column(scale=1):
921
  matrix_category = gr.Dropdown(
922
  label="🏷️ Category Filter",
923
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
924
- value="all"
925
  )
926
  with gr.Column(scale=1):
927
  matrix_adequacy = gr.Slider(
928
  label="πŸ“Š Min Adequacy",
929
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
 
 
 
930
  )
931
  with gr.Column(scale=1):
932
  matrix_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
933
-
934
  with gr.Row():
935
  matrix_stats = gr.Markdown()
936
-
937
  with gr.Row():
938
  with gr.Column():
939
- matrix_ranking_plot = gr.Plot(label="πŸ† Language-Pair Matrix Rankings")
 
 
940
  with gr.Column():
941
  matrix_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
942
-
943
  with gr.Row():
944
- matrix_leaderboard = gr.Dataframe(label="πŸ“ˆ Language-Pair Matrix Leaderboard", interactive=False)
945
-
 
 
946
  # Tab 6: Model Analysis
947
  with gr.Tab("πŸ” Scientific Model Analysis", id="analysis"):
948
- gr.Markdown("""
 
949
  ## πŸ”¬ Detailed Scientific Model Analysis
950
 
951
  Comprehensive analysis of individual models with statistical confidence intervals,
952
  cross-track performance, and detailed language pair breakdowns.
953
- """)
954
-
 
955
  with gr.Row():
956
  with gr.Column(scale=2):
957
  model_select = gr.Dropdown(
958
  label="πŸ€– Select Model",
959
  choices=[],
960
  value=None,
961
- info="Choose a model for detailed scientific analysis"
962
  )
963
  with gr.Column(scale=1):
964
  track_select = gr.Dropdown(
965
  label="🏁 Analysis Track",
966
  choices=list(EVALUATION_TRACKS.keys()),
967
  value="google_comparable",
968
- info="Track for detailed analysis"
969
  )
970
  with gr.Column(scale=1):
971
  analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
972
-
973
  with gr.Row():
974
  model_details = gr.Markdown()
975
-
976
  with gr.Row():
977
  with gr.Column():
978
- model_analysis_plot = gr.Plot(label="πŸ“Š Detailed Performance Analysis")
 
 
979
  with gr.Column():
980
  model_heatmap_plot = gr.Plot(label="πŸ—ΊοΈ Language Pair Heatmap")
981
-
982
  # Tab 7: Model Comparison
983
  with gr.Tab("βš–οΈ Scientific Model Comparison", id="comparison"):
984
- gr.Markdown("""
 
985
  ## πŸ”¬ Scientific Model Comparison
986
 
987
  Compare multiple models with statistical significance testing and fair comparison analysis.
988
  Only models evaluated on the same language pairs are compared for scientific validity.
989
- """)
990
-
 
991
  with gr.Row():
992
  with gr.Column(scale=2):
993
  comparison_models = gr.CheckboxGroup(
994
  label="πŸ€– Select Models to Compare",
995
  choices=[],
996
  value=[],
997
- info="Select 2-6 models for comparison"
998
  )
999
  with gr.Column(scale=1):
1000
  comparison_track = gr.Dropdown(
1001
  label="🏁 Comparison Track",
1002
  choices=list(EVALUATION_TRACKS.keys()),
1003
- value="google_comparable"
1004
  )
1005
  comparison_type = gr.Radio(
1006
  label="πŸ“Š Comparison Type",
1007
  choices=["statistical", "category"],
1008
- value="statistical"
1009
  )
1010
  compare_btn = gr.Button("βš–οΈ Compare Models", variant="primary")
1011
-
1012
  with gr.Row():
1013
  comparison_output = gr.Markdown()
1014
-
1015
  with gr.Row():
1016
  comparison_plot = gr.Plot(label="πŸ“Š Model Comparison Analysis")
1017
-
1018
  # Tab 8: Documentation
1019
  with gr.Tab("πŸ“š Scientific Documentation", id="docs"):
1020
- gr.Markdown(f"""
 
1021
  # πŸ“– SALT Translation Leaderboard - Scientific Edition Documentation
1022
 
1023
  ## 🎯 Overview
@@ -1182,131 +1310,164 @@ with gr.Blocks(
1182
  ---
1183
 
1184
  *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
1185
- """)
1186
-
 
1187
  # Event handlers with enhanced scientific functionality
1188
  predictions_validated = gr.State(value=None)
1189
  validation_info_state = gr.State(value=None)
1190
  detected_category_state = gr.State(value="community")
1191
-
1192
  # Download test set
1193
  download_btn.click(
1194
- fn=download_scientific_test_set,
1195
- outputs=[download_file, download_info]
1196
  )
1197
-
1198
  # Validate predictions
1199
  def handle_scientific_validation(file, model_name, author, description):
1200
- report, predictions, category = validate_scientific_submission(file, model_name, author, description)
1201
- valid = predictions is not None
1202
-
 
 
 
 
 
 
 
 
 
1203
  return (
1204
  report,
1205
  predictions,
1206
- {"category": category, "validation_passed": valid},
1207
  category,
1208
- gr.update(interactive=valid)
1209
  )
1210
-
1211
  validate_btn.click(
1212
  fn=handle_scientific_validation,
1213
  inputs=[predictions_file, model_name_input, author_input, description_input],
1214
- outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
 
 
 
 
 
 
1215
  )
1216
-
1217
  # Submit for evaluation
1218
- def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
 
 
1219
  if predictions is None:
1220
  return "❌ Please validate your submission first", None, None, None
1221
-
1222
  return evaluate_scientific_submission(
1223
  predictions, model_name, author, description, category, validation_info
1224
  )
1225
-
1226
  submit_btn.click(
1227
  fn=handle_scientific_submission,
1228
- inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
1229
- outputs=[evaluation_output, results_table, submission_plot, cross_track_plot]
 
 
 
 
 
 
 
1230
  )
1231
-
1232
  # Track leaderboard refresh functions
1233
  def refresh_google_track(*args):
1234
  return refresh_track_leaderboard("google_comparable", *args)
1235
-
1236
  def refresh_ug40_track(*args):
1237
  return refresh_track_leaderboard("ug40_complete", *args)
1238
-
1239
  def refresh_matrix_track(*args):
1240
  return refresh_track_leaderboard("language_pair_matrix", *args)
1241
-
1242
  # Google-Comparable Track
1243
  google_refresh.click(
1244
  fn=refresh_google_track,
1245
  inputs=[google_search, google_category, google_adequacy],
1246
- outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
 
 
 
 
 
1247
  )
1248
-
1249
  # UG40-Complete Track
1250
  ug40_refresh.click(
1251
  fn=refresh_ug40_track,
1252
  inputs=[ug40_search, ug40_category, ug40_adequacy],
1253
- outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
1254
  )
1255
-
1256
  # Language-Pair Matrix Track
1257
  matrix_refresh.click(
1258
  fn=refresh_matrix_track,
1259
  inputs=[matrix_search, matrix_category, matrix_adequacy],
1260
- outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
 
 
 
 
 
1261
  )
1262
-
1263
  # Model analysis
1264
  analyze_btn.click(
1265
  fn=get_scientific_model_details,
1266
  inputs=[model_select, track_select],
1267
- outputs=[model_details, model_analysis_plot, model_heatmap_plot]
1268
  )
1269
-
1270
  # Model comparison
1271
  compare_btn.click(
1272
  fn=perform_model_comparison,
1273
  inputs=[comparison_models, comparison_track, comparison_type],
1274
- outputs=[comparison_output, comparison_plot]
1275
  )
1276
-
1277
  # Load initial data and update dropdowns
1278
  def load_initial_data():
1279
  # Load initial Google track data
1280
  google_data = refresh_google_track("", "all", 0.0)
1281
-
1282
  # Update dropdown choices
1283
  if current_leaderboard is not None and not current_leaderboard.empty:
1284
- model_choices = current_leaderboard['model_name'].tolist()
1285
  else:
1286
  model_choices = []
1287
-
1288
  return (
1289
  google_data[0], # google_leaderboard
1290
- google_data[1], # google_ranking_plot
1291
  google_data[2], # google_comparison_plot
1292
  google_data[3], # google_stats
1293
  gr.Dropdown(choices=model_choices), # model_select
1294
- gr.CheckboxGroup(choices=model_choices) # comparison_models
1295
  )
1296
-
1297
  demo.load(
1298
  fn=load_initial_data,
1299
  outputs=[
1300
- google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats,
1301
- model_select, comparison_models
1302
- ]
 
 
 
 
1303
  )
1304
 
1305
  # Launch the scientific application
1306
  if __name__ == "__main__":
1307
- demo.launch(
1308
- server_name="0.0.0.0",
1309
- server_port=7860,
1310
- share=False,
1311
- show_error=True
1312
- )
 
4
  import os
5
  from pathlib import Path
6
 
7
+
8
  def setup_salt():
9
  """Clone and setup SALT library like in Colab."""
10
  try:
11
  # Check if salt is already available
12
  import salt.dataset
13
+
14
  print("βœ… SALT library already available")
15
  return True
16
  except ImportError:
17
  pass
18
+
19
  print("πŸ“₯ Setting up SALT library...")
20
+
21
  try:
22
  # Clone SALT repo if not exists
23
  salt_dir = Path("salt")
24
  if not salt_dir.exists():
25
  print("πŸ”„ Cloning SALT repository...")
26
+ subprocess.check_call(
27
+ ["git", "clone", "https://github.com/sunbirdai/salt.git"]
28
+ )
29
  else:
30
  print("πŸ“ SALT repository already exists")
31
+
32
  # Install SALT requirements
33
  salt_requirements = salt_dir / "requirements.txt"
34
  if salt_requirements.exists():
35
  print("πŸ“¦ Installing SALT requirements...")
36
+ subprocess.check_call(
37
+ [
38
+ sys.executable,
39
+ "-m",
40
+ "pip",
41
+ "install",
42
+ "-q",
43
+ "-r",
44
+ str(salt_requirements),
45
+ ]
46
+ )
47
+
48
  # Add SALT directory to Python path
49
  salt_path = str(salt_dir.absolute())
50
  if salt_path not in sys.path:
51
  sys.path.insert(0, salt_path)
52
  print(f"πŸ”— Added {salt_path} to Python path")
53
+
54
  # Test import
55
  import salt.dataset
56
+
57
  print("βœ… SALT library setup completed successfully")
58
  return True
59
+
60
  except Exception as e:
61
  print(f"❌ Failed to setup SALT: {e}")
62
  return False
63
 
64
+
65
  # Setup SALT on startup
66
  print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
67
  if not setup_salt():
 
78
 
79
  # Import our enhanced modules
80
  from src.test_set import (
81
+ get_public_test_set_scientific,
82
  get_complete_test_set_scientific,
83
+ create_test_set_download_scientific,
84
  validate_test_set_integrity_scientific,
85
+ get_track_test_set,
86
  )
87
  from src.validation import validate_submission_scientific
88
  from src.evaluation import (
89
+ evaluate_predictions_scientific,
90
  generate_scientific_report,
91
+ compare_models_statistically,
92
  )
93
  from src.leaderboard import (
94
+ load_scientific_leaderboard,
95
  add_model_to_scientific_leaderboard,
96
+ get_scientific_leaderboard_stats,
97
  get_track_leaderboard,
98
  prepare_track_leaderboard_display,
99
  perform_fair_comparison,
100
+ export_scientific_leaderboard,
101
  )
102
  from src.plotting import (
103
+ create_scientific_leaderboard_plot,
104
  create_language_pair_heatmap_scientific,
105
  create_statistical_comparison_plot,
106
  create_category_comparison_plot,
107
  create_adequacy_analysis_plot,
108
  create_cross_track_analysis_plot,
109
+ create_scientific_model_detail_plot,
110
  )
111
  from src.utils import (
112
+ sanitize_model_name,
113
+ get_all_language_pairs,
114
  get_google_comparable_pairs,
115
  get_track_language_pairs,
116
+ format_metric_value,
117
  )
118
  from config import *
119
 
 
123
  complete_test_set = None
124
  test_set_stats = None
125
 
126
+
127
  def initialize_scientific_data():
128
  """Initialize scientific test sets and leaderboard data."""
129
  global public_test_set, complete_test_set, current_leaderboard, test_set_stats
130
+
131
  try:
132
  print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
133
+
134
  # Load scientific test sets
135
  print("πŸ“₯ Loading scientific test sets...")
136
  public_test_set = get_public_test_set_scientific()
137
  complete_test_set = get_complete_test_set_scientific()
138
+
139
  # Load scientific leaderboard
140
  print("πŸ† Loading scientific leaderboard...")
141
  current_leaderboard = load_scientific_leaderboard()
142
+
143
  # Validate test set integrity
144
  print("πŸ” Validating test set integrity...")
145
  test_set_stats = validate_test_set_integrity_scientific()
146
+
147
  print(f"βœ… Scientific initialization complete!")
148
  print(f" - Test set: {len(public_test_set):,} samples")
149
  print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
150
+ print(
151
+ f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}"
152
+ )
153
  print(f" - Current models: {len(current_leaderboard)}")
154
+
155
  return True
156
+
157
  except Exception as e:
158
  print(f"❌ Scientific initialization failed: {e}")
159
  traceback.print_exc()
160
  return False
161
 
162
+
163
  def download_scientific_test_set() -> Tuple[str, str]:
164
  """Create downloadable scientific test set and return file path and info."""
165
+
166
  try:
167
  global public_test_set
168
  if public_test_set is None:
169
  public_test_set = get_public_test_set_scientific()
170
+
171
  # Create download file
172
  download_path, stats = create_test_set_download_scientific()
173
+
174
  # Create comprehensive info message
175
+ adequacy = stats.get("adequacy_assessment", "unknown")
176
  adequacy_emoji = {
177
+ "excellent": "🟒",
178
+ "good": "🟑",
179
+ "fair": "🟠",
180
+ "insufficient": "πŸ”΄",
181
+ "unknown": "βšͺ",
182
+ }.get(adequacy, "βšͺ")
183
+
184
  info_msg = f"""
185
  ## πŸ“₯ SALT Scientific Test Set Downloaded Successfully!
186
 
 
198
 
199
  ### 🏁 Track Breakdown:
200
  """
201
+
202
+ track_breakdown = stats.get("track_breakdown", {})
203
  for track_name, track_info in track_breakdown.items():
204
+ status_emoji = (
205
+ "βœ…" if track_info.get("statistical_adequacy", False) else "⚠️"
206
+ )
207
  info_msg += f"""
208
  **{status_emoji} {track_info.get('name', track_name)}**:
209
  - Samples: {track_info.get('total_samples', 0):,}
 
211
  - Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
212
  - Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
213
  """
214
+
215
  info_msg += f"""
216
 
217
  ### πŸ“‹ Enhanced File Format:
 
237
  - Provide detailed model description for proper categorization
238
  - Consider submitting to multiple tracks for comprehensive evaluation
239
  """
240
+
241
  return download_path, info_msg
242
+
243
  except Exception as e:
244
  error_msg = f"❌ Error creating scientific test set download: {str(e)}"
245
  return None, error_msg
246
 
247
+
248
  def validate_scientific_submission(
249
  file, model_name: str, author: str, description: str
250
  ) -> Tuple[str, Optional[pd.DataFrame], str]:
251
  """Validate uploaded prediction file with scientific rigor."""
252
+
253
  try:
254
  if file is None:
255
  return "❌ Please upload a predictions file", None, "community"
 
289
  )
290
 
291
  detected_category = validation_result.get("category", "community")
292
+
293
  if validation_result["valid"]:
294
+ return (
295
+ validation_result["report"],
296
+ validation_result["predictions"],
297
+ detected_category,
298
+ )
299
  else:
300
  return validation_result["report"], None, detected_category
301
 
 
303
  return (
304
  f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
305
  None,
306
+ "community",
307
  )
308
 
309
+
310
  def evaluate_scientific_submission(
311
  predictions_df: pd.DataFrame,
312
  model_name: str,
 
316
  validation_info: Dict,
317
  ) -> Tuple[str, pd.DataFrame, object, object]:
318
  """Evaluate validated predictions using scientific methodology."""
319
+
320
  try:
321
  if predictions_df is None:
322
  return "❌ No valid predictions to evaluate", None, None, None
323
+
324
  # Get complete test set with targets
325
  global complete_test_set, current_leaderboard
326
  if complete_test_set is None:
327
  complete_test_set = get_complete_test_set_scientific()
328
+
329
  # Run scientific evaluation across all tracks
330
  print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
331
  evaluation_results = evaluate_predictions_scientific(
332
  predictions_df, complete_test_set, detected_category
333
  )
334
+
335
+ if any(
336
+ track_data.get("error")
337
+ for track_data in evaluation_results.get("tracks", {}).values()
338
+ ):
339
+ errors = [
340
+ track_data["error"]
341
+ for track_data in evaluation_results["tracks"].values()
342
+ if track_data.get("error")
343
+ ]
344
  return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
345
+
346
  # Add to scientific leaderboard
347
  print("πŸ† Adding to scientific leaderboard...")
348
  updated_leaderboard = add_model_to_scientific_leaderboard(
 
350
  author=author or "Anonymous",
351
  evaluation_results=evaluation_results,
352
  model_category=detected_category,
353
+ description=description or "",
354
  )
355
+
356
  # Update global leaderboard
357
  current_leaderboard = updated_leaderboard
358
+
359
  # Generate scientific report
360
  report = generate_scientific_report(evaluation_results, model_name)
361
+
362
  # Create visualizations
363
  summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
364
  cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
365
+
366
  # Prepare display leaderboard (Google-comparable track by default)
367
+ google_leaderboard = get_track_leaderboard(
368
+ updated_leaderboard, "google_comparable"
369
+ )
370
+ display_leaderboard = prepare_track_leaderboard_display(
371
+ google_leaderboard, "google_comparable"
372
+ )
373
+
374
  # Format success message with track-specific results
375
  success_msg = f"""
376
  ## πŸŽ‰ Scientific Evaluation Complete!
 
382
 
383
  ### πŸ† Track Performance Summary:
384
  """
385
+
386
+ tracks = evaluation_results.get("tracks", {})
387
  for track_name, track_data in tracks.items():
388
+ if not track_data.get("error"):
389
  track_config = EVALUATION_TRACKS[track_name]
390
+ track_averages = track_data.get("track_averages", {})
391
+ summary = track_data.get("summary", {})
392
+
393
  # Get rank in this track
394
+ track_leaderboard = get_track_leaderboard(
395
+ updated_leaderboard, track_name
396
+ )
397
  if not track_leaderboard.empty:
398
+ model_row = track_leaderboard[
399
+ track_leaderboard["model_name"]
400
+ == sanitize_model_name(model_name)
401
+ ]
402
  rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
403
  total_models = len(track_leaderboard)
404
  else:
405
  rank = "N/A"
406
  total_models = 0
407
+
408
+ quality_score = track_averages.get("quality_score", 0)
409
+ bleu_score = track_averages.get("bleu", 0)
410
+ samples = summary.get("total_samples", 0)
411
+
412
  success_msg += f"""
413
  **🏁 {track_config['name']}**:
414
  - Rank: #{rank} out of {total_models} models
 
416
  - BLEU: {bleu_score:.2f}
417
  - Samples: {samples:,}
418
  """
419
+
420
  success_msg += f"""
421
 
422
  ### πŸ”¬ Scientific Adequacy:
 
426
 
427
  {report}
428
  """
429
+
430
  return success_msg, display_leaderboard, summary_plot, cross_track_plot
431
 
432
  except Exception as e:
433
  error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
434
  return error_msg, None, None, None
435
 
436
+
437
  def refresh_track_leaderboard(
438
  track: str,
439
  search_query: str = "",
440
  category_filter: str = "all",
441
  min_adequacy: float = 0.0,
442
+ show_ci: bool = True,
443
  ) -> Tuple[pd.DataFrame, object, object, str]:
444
  """Refresh leaderboard for a specific track with filters."""
445
+
446
  try:
447
  global current_leaderboard
448
  if current_leaderboard is None:
449
  current_leaderboard = load_scientific_leaderboard()
450
+
451
  # Get track-specific leaderboard
452
  track_leaderboard = get_track_leaderboard(
453
+ current_leaderboard,
454
+ track,
455
+ category_filter=category_filter,
456
+ min_adequacy=min_adequacy,
457
  )
458
+
459
  # Apply search filter
460
  if search_query:
461
  query_lower = search_query.lower()
462
+ mask = track_leaderboard["model_name"].str.lower().str.contains(
463
+ query_lower, na=False
464
+ ) | track_leaderboard["author"].str.lower().str.contains(
465
+ query_lower, na=False
466
  )
467
  track_leaderboard = track_leaderboard[mask]
468
+
469
  # Prepare for display
470
  display_df = prepare_track_leaderboard_display(track_leaderboard, track)
471
+
472
  # Create plots
473
  ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
474
  comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
475
+
476
  # Get track statistics
477
  track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
478
  track_config = EVALUATION_TRACKS[track]
479
+
480
  stats_text = f"""
481
  ### πŸ“Š {track_config['name']} Statistics
482
 
 
492
  - Statistical adequacy verified for reliable comparisons
493
  - {track_config['description']}
494
  """
495
+
496
  return display_df, ranking_plot, comparison_plot, stats_text
497
+
498
  except Exception as e:
499
  error_msg = f"Error loading {track} leaderboard: {str(e)}"
500
  empty_df = pd.DataFrame()
501
  return empty_df, None, None, error_msg
502
 
503
+
504
+ def get_scientific_model_details(
505
+ model_name: str, track: str
506
+ ) -> Tuple[str, object, object]:
507
  """Get detailed scientific analysis for a specific model."""
508
+
509
  try:
510
  global current_leaderboard
511
  if current_leaderboard is None:
512
  return "Leaderboard not loaded", None, None
513
+
514
  # Find model
515
+ model_row = current_leaderboard[current_leaderboard["model_name"] == model_name]
516
+
517
  if model_row.empty:
518
  return f"Model '{model_name}' not found", None, None
519
+
520
  model_info = model_row.iloc[0]
521
+
522
  # Parse detailed metrics for the requested track
523
  try:
524
+ detailed_results = json.loads(model_info[f"detailed_{track}"])
525
  except:
526
  detailed_results = {}
527
+
528
  # Create detailed plots
529
+ detail_plot = create_scientific_model_detail_plot(
530
+ detailed_results, model_name, track
531
+ )
532
+
533
  # Create language pair heatmap
534
  heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
535
+
536
  # Format model details with scientific information
537
  track_config = EVALUATION_TRACKS[track]
538
+ category_info = MODEL_CATEGORIES.get(model_info["model_category"], {})
539
+
540
  # Extract track-specific metrics
541
  quality_col = f"{track}_quality"
542
  bleu_col = f"{track}_bleu"
 
546
  samples_col = f"{track}_samples"
547
  pairs_col = f"{track}_pairs"
548
  adequate_col = f"{track}_adequate"
549
+
550
  details_text = f"""
551
  ## πŸ”¬ Scientific Model Analysis: {model_name}
552
 
 
573
 
574
  ### πŸ“ˆ Cross-Track Performance:
575
  """
576
+
577
  # Add other track performances for comparison
578
  for other_track in EVALUATION_TRACKS.keys():
579
  if other_track != track:
580
  other_quality_col = f"{other_track}_quality"
581
  other_adequate_col = f"{other_track}_adequate"
582
+
583
  if model_info.get(other_adequate_col, False):
584
  other_quality = model_info.get(other_quality_col, 0)
585
  details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
586
  else:
587
  details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
588
+
589
  details_text += f"""
590
 
591
  ### πŸ’‘ Scientific Interpretation:
 
594
  - Cross-track analysis reveals model strengths across different language sets
595
  - Category classification helps contextualize performance expectations
596
  """
597
+
598
  return details_text, detail_plot, heatmap_plot
599
+
600
  except Exception as e:
601
  error_msg = f"Error getting model details: {str(e)}"
602
  return error_msg, None, None
603
 
604
+
605
  def perform_model_comparison(
606
  model_names: List[str], track: str, comparison_type: str = "statistical"
607
  ) -> Tuple[str, object]:
608
  """Perform scientific comparison between selected models."""
609
+
610
  try:
611
  global current_leaderboard
612
  if current_leaderboard is None:
613
  return "Leaderboard not loaded", None
614
+
615
  if len(model_names) < 2:
616
  return "Please select at least 2 models for comparison", None
617
+
618
  # Get models
619
+ models = current_leaderboard[
620
+ current_leaderboard["model_name"].isin(model_names)
621
+ ]
622
+
623
  if len(models) < 2:
624
  return "Selected models not found in leaderboard", None
625
+
626
  # Perform fair comparison
627
  comparison_result = perform_fair_comparison(current_leaderboard, model_names)
628
+
629
+ if comparison_result.get("error"):
630
  return f"Comparison error: {comparison_result['error']}", None
631
+
632
  # Create comparison visualization
633
  if comparison_type == "statistical":
634
  comparison_plot = create_statistical_comparison_plot(models, track)
635
  else:
636
  comparison_plot = create_category_comparison_plot(models, track)
637
+
638
  # Format comparison report
639
  track_config = EVALUATION_TRACKS[track]
640
  comparison_text = f"""
 
642
 
643
  ### πŸ“Š Models Compared:
644
  """
645
+
646
  quality_col = f"{track}_quality"
647
  ci_lower_col = f"{track}_ci_lower"
648
  ci_upper_col = f"{track}_ci_upper"
649
+
650
  # Sort models by performance
651
  models_sorted = models.sort_values(quality_col, ascending=False)
652
+
653
  for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
654
+ category_info = MODEL_CATEGORIES.get(model["model_category"], {})
655
+
656
  comparison_text += f"""
657
  **#{i}. {model['model_name']}**
658
  - Category: {category_info.get('name', 'Unknown')}
659
  - Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
660
  - Author: {model['author']}
661
  """
662
+
663
  # Add statistical analysis
664
+ track_comparison = comparison_result.get("track_comparisons", {}).get(track, {})
665
  if track_comparison:
666
  comparison_text += f"""
667
 
 
670
  - **Confidence intervals available**: Yes (95% level)
671
  - **Fair comparison possible**: {'βœ… Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
672
  """
673
+
674
  # Check for statistical significance (simplified)
675
+ quality_scores = list(track_comparison.get("quality_scores", {}).values())
676
  if len(quality_scores) >= 2:
677
  score_range = max(quality_scores) - min(quality_scores)
678
  if score_range > 0.05: # 5% difference threshold
679
+ comparison_text += (
680
+ "- **Performance differences**: Potentially significant\n"
681
+ )
682
  else:
683
  comparison_text += "- **Performance differences**: Minimal\n"
684
+
685
  # Add recommendations
686
+ recommendations = comparison_result.get("recommendations", [])
687
  if recommendations:
688
  comparison_text += "\n### πŸ’‘ Recommendations:\n"
689
  for rec in recommendations:
690
  comparison_text += f"- {rec}\n"
691
+
692
  return comparison_text, comparison_plot
693
+
694
  except Exception as e:
695
  error_msg = f"Error performing comparison: {str(e)}"
696
  return error_msg, None
697
 
698
+
699
  # Initialize data on startup
700
  print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
701
  initialization_success = initialize_scientific_data()
 
754
  .adequacy-good { border-left-color: #eab308; }
755
  .adequacy-fair { border-left-color: #f97316; }
756
  .adequacy-insufficient { border-left-color: #ef4444; }
757
+ """,
758
  ) as demo:
759
+
760
  # Scientific Header
761
+ gr.HTML(
762
+ f"""
763
  <div class="scientific-header">
764
  <h1>πŸ† SALT Translation Leaderboard - Scientific Edition</h1>
765
  <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
766
  <p>Three-tier evaluation tracks β€’ 95% Confidence intervals β€’ Research-grade analysis</p>
767
  <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
768
  </div>
769
+ """
770
+ )
771
+
772
  # Status indicator
773
  if initialization_success:
774
  status_msg = "βœ… Scientific system initialized successfully"
775
+ adequacy_info = test_set_stats.get("scientific_adequacy", {}).get(
776
+ "overall_adequacy", "unknown"
777
+ )
778
  status_msg += f" | Test set adequacy: {adequacy_info.title()}"
779
  else:
780
  status_msg = "❌ System initialization failed - some features may not work"
781
+
782
  gr.Markdown(f"**System Status**: {status_msg}")
783
+
784
  # Add scientific overview
785
+ gr.Markdown(
786
+ """
787
  ## πŸ”¬ Scientific Evaluation Framework
788
 
789
  This leaderboard implements rigorous scientific methodology for translation model evaluation:
 
792
  - **Statistical Significance**: 95% confidence intervals and effect size analysis
793
  - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
794
  - **Cross-Track Consistency**: Validate model performance across language sets
795
+ """
796
+ )
797
 
798
  with gr.Tabs():
799
+
800
  # Tab 1: Download Test Set
801
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
802
+ gr.Markdown(
803
+ """
804
  ## πŸ“‹ Get the SALT Scientific Test Set
805
 
806
  Download our scientifically designed test set with stratified sampling and statistical weighting.
807
+ """
808
+ )
809
+
810
  with gr.Row():
811
+ download_btn = gr.Button(
812
+ "πŸ“₯ Download Scientific Test Set", variant="primary", size="lg"
813
+ )
814
+
815
  with gr.Row():
816
  with gr.Column():
817
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
818
  with gr.Column():
819
  download_info = gr.Markdown(label="ℹ️ Test Set Information")
820
+
821
+ # Tab 2: Submit Predictions
822
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
823
+ gr.Markdown(
824
+ """
825
  ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
826
 
827
  Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
828
+ """
829
+ )
830
+
831
  with gr.Row():
832
  with gr.Column(scale=1):
833
  gr.Markdown("### πŸ“ Model Information")
834
+
835
  model_name_input = gr.Textbox(
836
  label="πŸ€– Model Name",
837
  placeholder="e.g., MyTranslator-v2.0",
838
+ info="Unique name for your model",
839
  )
840
+
841
  author_input = gr.Textbox(
842
+ label="πŸ‘€ Author/Organization",
843
  placeholder="Your name or organization",
844
+ value="Anonymous",
845
  )
846
+
847
  description_input = gr.Textbox(
848
  label="πŸ“„ Model Description",
849
  placeholder="Architecture, training data, special features...",
850
  lines=4,
851
+ info="Detailed description helps with proper categorization",
852
  )
853
+
854
  gr.Markdown("### πŸ“€ Upload Predictions")
855
  predictions_file = gr.File(
856
  label="πŸ“‚ Predictions File",
857
+ file_types=[".csv", ".tsv", ".json"],
858
  )
859
+
860
+ validate_btn = gr.Button(
861
+ "βœ… Validate Submission", variant="secondary"
862
+ )
863
+ submit_btn = gr.Button(
864
+ "πŸš€ Submit for Scientific Evaluation",
865
+ variant="primary",
866
+ interactive=False,
867
+ )
868
+
869
  with gr.Column(scale=1):
870
  gr.Markdown("### πŸ“Š Validation Results")
871
  validation_output = gr.Markdown()
872
+
873
  # Results section
874
  gr.Markdown("### πŸ† Scientific Evaluation Results")
875
+
876
  with gr.Row():
877
  evaluation_output = gr.Markdown()
878
+
879
  with gr.Row():
880
  with gr.Column():
881
  submission_plot = gr.Plot(label="πŸ“ˆ Submission Analysis")
882
  with gr.Column():
883
  cross_track_plot = gr.Plot(label="πŸ”„ Cross-Track Analysis")
884
+
885
  with gr.Row():
886
+ results_table = gr.Dataframe(
887
+ label="πŸ“Š Updated Leaderboard (Google-Comparable Track)",
888
+ interactive=False,
889
+ )
890
+
891
  # Tab 3: Google-Comparable Track
892
+ with gr.Tab(
893
+ "πŸ€– Google-Comparable Track",
894
+ id="google_track",
895
+ elem_classes=["track-tab", "google-comparable"],
896
+ ):
897
+ gr.Markdown(
898
+ f"""
899
  ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
900
 
901
  **Fair comparison with commercial translation systems**
 
906
  - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
907
  - **Purpose**: Commercial system comparison and baseline establishment
908
  - **Statistical Power**: High (optimized sample sizes)
909
+ """
910
+ )
911
+
912
  with gr.Row():
913
  with gr.Column(scale=2):
914
+ google_search = gr.Textbox(
915
+ label="πŸ” Search Models",
916
+ placeholder="Search by model name, author...",
917
+ )
918
  with gr.Column(scale=1):
919
  google_category = gr.Dropdown(
920
  label="🏷️ Category Filter",
921
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
922
+ value="all",
923
  )
924
  with gr.Column(scale=1):
925
  google_adequacy = gr.Slider(
926
  label="πŸ“Š Min Adequacy",
927
+ minimum=0.0,
928
+ maximum=1.0,
929
+ value=0.0,
930
+ step=0.1,
931
  )
932
  with gr.Column(scale=1):
933
  google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
934
+
935
  with gr.Row():
936
  google_stats = gr.Markdown()
937
+
938
  with gr.Row():
939
  with gr.Column():
940
  google_ranking_plot = gr.Plot(label="πŸ† Google-Comparable Rankings")
941
  with gr.Column():
942
  google_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
943
+
944
  with gr.Row():
945
+ google_leaderboard = gr.Dataframe(
946
+ label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False
947
+ )
948
+
949
  # Tab 4: UG40-Complete Track
950
+ with gr.Tab(
951
+ "🌍 UG40-Complete Track",
952
+ id="ug40_track",
953
+ elem_classes=["track-tab", "ug40-complete"],
954
+ ):
955
+ gr.Markdown(
956
+ f"""
957
  ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
958
 
959
  **Comprehensive evaluation across all Ugandan languages**
 
964
  - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
965
  - **Purpose**: Comprehensive Ugandan language capability assessment
966
  - **Coverage**: Complete linguistic landscape of Uganda
967
+ """
968
+ )
969
+
970
  with gr.Row():
971
  with gr.Column(scale=2):
972
+ ug40_search = gr.Textbox(
973
+ label="πŸ” Search Models",
974
+ placeholder="Search by model name, author...",
975
+ )
976
  with gr.Column(scale=1):
977
  ug40_category = gr.Dropdown(
978
  label="🏷️ Category Filter",
979
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
980
+ value="all",
981
  )
982
  with gr.Column(scale=1):
983
  ug40_adequacy = gr.Slider(
984
  label="πŸ“Š Min Adequacy",
985
+ minimum=0.0,
986
+ maximum=1.0,
987
+ value=0.0,
988
+ step=0.1,
989
  )
990
  with gr.Column(scale=1):
991
  ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
992
+
993
  with gr.Row():
994
  ug40_stats = gr.Markdown()
995
+
996
  with gr.Row():
997
  with gr.Column():
998
  ug40_ranking_plot = gr.Plot(label="πŸ† UG40-Complete Rankings")
999
  with gr.Column():
1000
  ug40_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
1001
+
1002
  with gr.Row():
1003
+ ug40_leaderboard = gr.Dataframe(
1004
+ label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False
1005
+ )
1006
+
1007
  # Tab 5: Language-Pair Matrix
1008
+ with gr.Tab(
1009
+ "πŸ“Š Language-Pair Matrix",
1010
+ id="matrix_track",
1011
+ elem_classes=["track-tab", "language-pair-matrix"],
1012
+ ):
1013
+ gr.Markdown(
1014
+ f"""
1015
  ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
1016
 
1017
  **Detailed language pair analysis with statistical significance**
 
1022
  - **Resolution**: Individual language pair performance
1023
  - **Purpose**: Detailed linguistic analysis and model diagnostics
1024
  - **Statistics**: Pairwise significance testing available
1025
+ """
1026
+ )
1027
+
1028
  with gr.Row():
1029
  with gr.Column(scale=2):
1030
+ matrix_search = gr.Textbox(
1031
+ label="πŸ” Search Models",
1032
+ placeholder="Search by model name, author...",
1033
+ )
1034
  with gr.Column(scale=1):
1035
  matrix_category = gr.Dropdown(
1036
  label="🏷️ Category Filter",
1037
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
1038
+ value="all",
1039
  )
1040
  with gr.Column(scale=1):
1041
  matrix_adequacy = gr.Slider(
1042
  label="πŸ“Š Min Adequacy",
1043
+ minimum=0.0,
1044
+ maximum=1.0,
1045
+ value=0.0,
1046
+ step=0.1,
1047
  )
1048
  with gr.Column(scale=1):
1049
  matrix_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
1050
+
1051
  with gr.Row():
1052
  matrix_stats = gr.Markdown()
1053
+
1054
  with gr.Row():
1055
  with gr.Column():
1056
+ matrix_ranking_plot = gr.Plot(
1057
+ label="πŸ† Language-Pair Matrix Rankings"
1058
+ )
1059
  with gr.Column():
1060
  matrix_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
1061
+
1062
  with gr.Row():
1063
+ matrix_leaderboard = gr.Dataframe(
1064
+ label="πŸ“ˆ Language-Pair Matrix Leaderboard", interactive=False
1065
+ )
1066
+
1067
  # Tab 6: Model Analysis
1068
  with gr.Tab("πŸ” Scientific Model Analysis", id="analysis"):
1069
+ gr.Markdown(
1070
+ """
1071
  ## πŸ”¬ Detailed Scientific Model Analysis
1072
 
1073
  Comprehensive analysis of individual models with statistical confidence intervals,
1074
  cross-track performance, and detailed language pair breakdowns.
1075
+ """
1076
+ )
1077
+
1078
  with gr.Row():
1079
  with gr.Column(scale=2):
1080
  model_select = gr.Dropdown(
1081
  label="πŸ€– Select Model",
1082
  choices=[],
1083
  value=None,
1084
+ info="Choose a model for detailed scientific analysis",
1085
  )
1086
  with gr.Column(scale=1):
1087
  track_select = gr.Dropdown(
1088
  label="🏁 Analysis Track",
1089
  choices=list(EVALUATION_TRACKS.keys()),
1090
  value="google_comparable",
1091
+ info="Track for detailed analysis",
1092
  )
1093
  with gr.Column(scale=1):
1094
  analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
1095
+
1096
  with gr.Row():
1097
  model_details = gr.Markdown()
1098
+
1099
  with gr.Row():
1100
  with gr.Column():
1101
+ model_analysis_plot = gr.Plot(
1102
+ label="πŸ“Š Detailed Performance Analysis"
1103
+ )
1104
  with gr.Column():
1105
  model_heatmap_plot = gr.Plot(label="πŸ—ΊοΈ Language Pair Heatmap")
1106
+
1107
  # Tab 7: Model Comparison
1108
  with gr.Tab("βš–οΈ Scientific Model Comparison", id="comparison"):
1109
+ gr.Markdown(
1110
+ """
1111
  ## πŸ”¬ Scientific Model Comparison
1112
 
1113
  Compare multiple models with statistical significance testing and fair comparison analysis.
1114
  Only models evaluated on the same language pairs are compared for scientific validity.
1115
+ """
1116
+ )
1117
+
1118
  with gr.Row():
1119
  with gr.Column(scale=2):
1120
  comparison_models = gr.CheckboxGroup(
1121
  label="πŸ€– Select Models to Compare",
1122
  choices=[],
1123
  value=[],
1124
+ info="Select 2-6 models for comparison",
1125
  )
1126
  with gr.Column(scale=1):
1127
  comparison_track = gr.Dropdown(
1128
  label="🏁 Comparison Track",
1129
  choices=list(EVALUATION_TRACKS.keys()),
1130
+ value="google_comparable",
1131
  )
1132
  comparison_type = gr.Radio(
1133
  label="πŸ“Š Comparison Type",
1134
  choices=["statistical", "category"],
1135
+ value="statistical",
1136
  )
1137
  compare_btn = gr.Button("βš–οΈ Compare Models", variant="primary")
1138
+
1139
  with gr.Row():
1140
  comparison_output = gr.Markdown()
1141
+
1142
  with gr.Row():
1143
  comparison_plot = gr.Plot(label="πŸ“Š Model Comparison Analysis")
1144
+
1145
  # Tab 8: Documentation
1146
  with gr.Tab("πŸ“š Scientific Documentation", id="docs"):
1147
+ gr.Markdown(
1148
+ f"""
1149
  # πŸ“– SALT Translation Leaderboard - Scientific Edition Documentation
1150
 
1151
  ## 🎯 Overview
 
1310
  ---
1311
 
1312
  *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
1313
+ """
1314
+ )
1315
+
1316
  # Event handlers with enhanced scientific functionality
1317
  predictions_validated = gr.State(value=None)
1318
  validation_info_state = gr.State(value=None)
1319
  detected_category_state = gr.State(value="community")
1320
+
1321
  # Download test set
1322
  download_btn.click(
1323
+ fn=download_scientific_test_set, outputs=[download_file, download_info]
 
1324
  )
1325
+
1326
  # Validate predictions
1327
  def handle_scientific_validation(file, model_name, author, description):
1328
+ report, predictions, category = validate_scientific_submission(
1329
+ file, model_name, author, description
1330
+ )
1331
+
1332
+ # Enable button if predictions are available and format is valid
1333
+ # This allows "can be evaluated with limitations" cases
1334
+ can_evaluate = predictions is not None
1335
+
1336
+ # Additional check: ensure we have some basic validity
1337
+ if can_evaluate and "❌ **Final Verdict**: Please address issues" in report:
1338
+ can_evaluate = False
1339
+
1340
  return (
1341
  report,
1342
  predictions,
1343
+ {"category": category, "validation_passed": can_evaluate},
1344
  category,
1345
+ gr.update(interactive=can_evaluate),
1346
  )
1347
+
1348
  validate_btn.click(
1349
  fn=handle_scientific_validation,
1350
  inputs=[predictions_file, model_name_input, author_input, description_input],
1351
+ outputs=[
1352
+ validation_output,
1353
+ predictions_validated,
1354
+ validation_info_state,
1355
+ detected_category_state,
1356
+ submit_btn,
1357
+ ],
1358
  )
1359
+
1360
  # Submit for evaluation
1361
+ def handle_scientific_submission(
1362
+ predictions, model_name, author, description, category, validation_info
1363
+ ):
1364
  if predictions is None:
1365
  return "❌ Please validate your submission first", None, None, None
1366
+
1367
  return evaluate_scientific_submission(
1368
  predictions, model_name, author, description, category, validation_info
1369
  )
1370
+
1371
  submit_btn.click(
1372
  fn=handle_scientific_submission,
1373
+ inputs=[
1374
+ predictions_validated,
1375
+ model_name_input,
1376
+ author_input,
1377
+ description_input,
1378
+ detected_category_state,
1379
+ validation_info_state,
1380
+ ],
1381
+ outputs=[evaluation_output, results_table, submission_plot, cross_track_plot],
1382
  )
1383
+
1384
  # Track leaderboard refresh functions
1385
  def refresh_google_track(*args):
1386
  return refresh_track_leaderboard("google_comparable", *args)
1387
+
1388
  def refresh_ug40_track(*args):
1389
  return refresh_track_leaderboard("ug40_complete", *args)
1390
+
1391
  def refresh_matrix_track(*args):
1392
  return refresh_track_leaderboard("language_pair_matrix", *args)
1393
+
1394
  # Google-Comparable Track
1395
  google_refresh.click(
1396
  fn=refresh_google_track,
1397
  inputs=[google_search, google_category, google_adequacy],
1398
+ outputs=[
1399
+ google_leaderboard,
1400
+ google_ranking_plot,
1401
+ google_comparison_plot,
1402
+ google_stats,
1403
+ ],
1404
  )
1405
+
1406
  # UG40-Complete Track
1407
  ug40_refresh.click(
1408
  fn=refresh_ug40_track,
1409
  inputs=[ug40_search, ug40_category, ug40_adequacy],
1410
+ outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats],
1411
  )
1412
+
1413
  # Language-Pair Matrix Track
1414
  matrix_refresh.click(
1415
  fn=refresh_matrix_track,
1416
  inputs=[matrix_search, matrix_category, matrix_adequacy],
1417
+ outputs=[
1418
+ matrix_leaderboard,
1419
+ matrix_ranking_plot,
1420
+ matrix_comparison_plot,
1421
+ matrix_stats,
1422
+ ],
1423
  )
1424
+
1425
  # Model analysis
1426
  analyze_btn.click(
1427
  fn=get_scientific_model_details,
1428
  inputs=[model_select, track_select],
1429
+ outputs=[model_details, model_analysis_plot, model_heatmap_plot],
1430
  )
1431
+
1432
  # Model comparison
1433
  compare_btn.click(
1434
  fn=perform_model_comparison,
1435
  inputs=[comparison_models, comparison_track, comparison_type],
1436
+ outputs=[comparison_output, comparison_plot],
1437
  )
1438
+
1439
  # Load initial data and update dropdowns
1440
  def load_initial_data():
1441
  # Load initial Google track data
1442
  google_data = refresh_google_track("", "all", 0.0)
1443
+
1444
  # Update dropdown choices
1445
  if current_leaderboard is not None and not current_leaderboard.empty:
1446
+ model_choices = current_leaderboard["model_name"].tolist()
1447
  else:
1448
  model_choices = []
1449
+
1450
  return (
1451
  google_data[0], # google_leaderboard
1452
+ google_data[1], # google_ranking_plot
1453
  google_data[2], # google_comparison_plot
1454
  google_data[3], # google_stats
1455
  gr.Dropdown(choices=model_choices), # model_select
1456
+ gr.CheckboxGroup(choices=model_choices), # comparison_models
1457
  )
1458
+
1459
  demo.load(
1460
  fn=load_initial_data,
1461
  outputs=[
1462
+ google_leaderboard,
1463
+ google_ranking_plot,
1464
+ google_comparison_plot,
1465
+ google_stats,
1466
+ model_select,
1467
+ comparison_models,
1468
+ ],
1469
  )
1470
 
1471
  # Launch the scientific application
1472
  if __name__ == "__main__":
1473
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)