akera committed on
Commit
e32fdda
·
verified ·
1 Parent(s): cfbcff1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +591 -257
app.py CHANGED
@@ -1,200 +1,321 @@
1
  # app.py
2
  import gradio as gr
3
  import pandas as pd
4
- import matplotlib.pyplot as plt
5
- from datasets import load_dataset
6
- import yaml
7
  import json
8
- import torch
9
- from datetime import datetime
10
  import traceback
 
 
11
 
12
  # Import our modules
13
- from src.model_loader import load_model, get_model_info
14
- from src.evaluation import evaluate_model_full
15
- from src.leaderboard import load_leaderboard, add_model_results, get_leaderboard_summary, search_models
16
- from src.plotting import create_leaderboard_plot, create_detailed_comparison_plot, create_summary_metrics_plot
17
- from src.utils import validate_model_path, get_model_type, sanitize_input
 
 
 
 
 
 
 
 
 
18
  from config import *
19
 
20
  # Global variables for caching
21
  current_leaderboard = None
22
- test_data = None
 
23
 
24
def load_salt_data():
    """Return the SALT evaluation samples as a DataFrame, loading them once.

    The result is cached in the module-level ``test_data``; later calls
    return the cached frame without reloading.

    Returns:
        pandas.DataFrame: the frame built from ``salt.dataset.create``, or a
        two-row English->Luganda placeholder frame when loading raises.
    """
    global test_data

    if test_data is not None:
        return test_data

    try:
        print("Loading SALT dataset...")

        # Configuration for SALT dataset. Built as a dict instead of
        # templating values into YAML text and re-parsing it, so config
        # values reach the loader unchanged (YAML would re-interpret them).
        # NOTE(review): `source`/`target` are placed at the top level next
        # to `huggingface_load` — confirm against salt.dataset.create.
        config = {
            'huggingface_load': {
                'path': SALT_DATASET,
                'name': 'text-all',
                'split': f'dev[:{MAX_EVAL_SAMPLES}]',
            },
            'source': {
                'type': 'text',
                'language': SUPPORTED_LANGUAGES,
            },
            'target': {
                'type': 'text',
                'language': SUPPORTED_LANGUAGES,
            },
            'src_or_tgt_languages_must_contain': 'eng',
            'allow_same_src_and_tgt_language': False,
        }

        # Import salt dataset utilities
        import salt.dataset
        test_data = pd.DataFrame(salt.dataset.create(config))

        print(f"Loaded {len(test_data)} evaluation samples")
        return test_data

    except Exception as e:
        print(f"Error loading SALT dataset: {e}")
        # Fallback: create minimal test data so the app can still start.
        test_data = pd.DataFrame({
            'source': ['Hello world', 'How are you?'],
            'target': ['Amakuru', 'Oli otya?'],
            'source.language': ['eng', 'eng'],
            'target.language': ['lug', 'lug']
        })
        return test_data
69
 
70
def refresh_leaderboard():
    """Reload the leaderboard and store it in the module-level cache.

    Returns:
        Whatever ``load_leaderboard()`` returned; the same object is kept in
        ``current_leaderboard``.
    """
    global current_leaderboard
    reloaded = load_leaderboard()
    current_leaderboard = reloaded
    return reloaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
def evaluate_submission(model_path: str, author_name: str) -> tuple:
    """Main evaluation function.

    Validates the inputs, loads the model, evaluates it on the SALT data,
    records the result on the leaderboard and builds the plots.

    Args:
        model_path: Model identifier entered by the user.
        author_name: Submitter name; falls back to "Anonymous" when empty.

    Returns:
        tuple: ``(message, leaderboard, leaderboard_plot, detailed_plot)``.
        On any failure the message describes the error and the other three
        items are None.
    """
    global current_leaderboard

    try:
        # Validate inputs
        model_path = sanitize_input(model_path)
        author_name = sanitize_input(author_name)

        if not model_path:
            return "❌ Error: Model path is required", None, None, None

        if not author_name:
            author_name = "Anonymous"

        if not validate_model_path(model_path):
            return "❌ Error: Invalid model path format", None, None, None

        # Load test data
        test_data = load_salt_data()
        if test_data is None or len(test_data) == 0:
            return "❌ Error: Could not load evaluation data", None, None, None

        # Get model info
        print(f"Getting model info for: {model_path}")
        # The result is not used below; the call is kept because it may raise
        # for an unknown model — TODO confirm and drop if it has no effect.
        get_model_info(model_path)
        model_type = get_model_type(model_path)

        # Load model
        print(f"Loading model: {model_path}")
        try:
            model, tokenizer = load_model(model_path)
        except Exception as e:
            return f"❌ Error loading model: {str(e)}", None, None, None

        # Run evaluation
        print("Starting evaluation...")
        try:
            detailed_metrics = evaluate_model_full(model, tokenizer, model_path, test_data)
        except Exception as e:
            return f"❌ Error during evaluation: {str(e)}", None, None, None

        # Extract average metrics
        avg_metrics = detailed_metrics.get('averages', {})
        if not avg_metrics:
            return "❌ Error: No metrics calculated", None, None, None

        # Add results to leaderboard
        print("Adding results to leaderboard...")
        updated_leaderboard = add_model_results(
            model_path=model_path,
            author=author_name,
            metrics=avg_metrics,
            detailed_metrics=detailed_metrics,
            evaluation_samples=len(test_data),
            model_type=model_type
        )

        # Update global leaderboard
        current_leaderboard = updated_leaderboard

        # Create visualizations
        leaderboard_plot = create_leaderboard_plot(updated_leaderboard, 'quality_score')
        detailed_plot = create_detailed_comparison_plot({model_path: detailed_metrics}, [model_path])

        # Rank is the row's position in the table, not its index label, so a
        # leaderboard that was sorted without resetting its index still
        # reports the right place.
        # NOTE(review): assumes rows are ordered best-first — confirm in
        # add_model_results.
        position = next(
            (i for i, path in enumerate(updated_leaderboard['model_path']) if path == model_path),
            None,
        )
        # The results are already stored at this point, so a missing row
        # must not turn the whole run into an error.
        rank_text = f"#{position + 1}" if position is not None else "unranked"

        # Format results message
        results_msg = "\n".join([
            "",
            "✅ **Evaluation Complete!**",
            "",
            f"**Model:** {model_path}",
            f"**Author:** {author_name}",
            f"**Type:** {model_type}",
            "",
            "**Results:**",
            f"- Quality Score: {avg_metrics.get('quality_score', 0):.4f}",
            f"- BLEU: {avg_metrics.get('bleu', 0):.2f}",
            f"- ChrF: {avg_metrics.get('chrf', 0):.4f}",
            f"- ROUGE-L: {avg_metrics.get('rougeL', 0):.4f}",
            "",
            f"**Ranking:** {rank_text} out of {len(updated_leaderboard)} models",
            "",
        ])

        return results_msg, updated_leaderboard, leaderboard_plot, detailed_plot

    except Exception as e:
        error_msg = f"❌ Unexpected error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(error_msg)
        return error_msg, None, None, None
164
 
165
def update_leaderboard_display(search_query: str = "") -> tuple:
    """Update leaderboard display with optional search.

    Args:
        search_query: Text passed to ``search_models``; an empty value shows
            the full leaderboard.

    Returns:
        tuple: ``(filtered_df, leaderboard_plot, summary_plot, summary_text)``.
    """
    global current_leaderboard
    if current_leaderboard is None:
        current_leaderboard = refresh_leaderboard()

    # Apply search filter
    filtered_df = (
        search_models(current_leaderboard, search_query)
        if search_query
        else current_leaderboard
    )

    # Create plots
    leaderboard_plot = create_leaderboard_plot(filtered_df, 'quality_score')
    summary_plot = create_summary_metrics_plot(filtered_df)

    # Get summary stats
    summary = get_leaderboard_summary(filtered_df)

    # 'None' (the string) is the existing "no submissions" marker; a real
    # None is treated the same way instead of failing on the slice.
    latest = summary['latest_submission']
    latest_text = latest[:10] if latest and latest != 'None' else 'None'

    summary_text = "\n".join([
        "",
        "📊 **Leaderboard Summary**",
        f"- Total Models: {summary['total_models']}",
        f"- Average Quality Score: {summary['avg_quality_score']:.4f}",
        f"- Best Model: {summary['best_model']}",
        f"- Latest Submission: {latest_text}",
        "",
    ])

    return filtered_df, leaderboard_plot, summary_plot, summary_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- # Initialize data
195
- print("Initializing SALT Translation Leaderboard...")
196
- load_salt_data()
197
- refresh_leaderboard()
198
 
199
  # Create Gradio interface
200
  with gr.Blocks(
@@ -202,17 +323,37 @@ with gr.Blocks(
202
  theme=gr.themes.Soft(),
203
  css="""
204
  .gradio-container {
205
- max-width: 1200px !important;
 
206
  }
207
  .main-header {
208
  text-align: center;
209
  margin-bottom: 2rem;
 
 
 
 
210
  }
211
- .metric-display {
212
  background: #f8f9fa;
213
  padding: 1rem;
214
- border-radius: 0.5rem;
215
  margin: 0.5rem 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  }
217
  """
218
  ) as demo:
@@ -225,189 +366,382 @@ with gr.Blocks(
225
 
226
  {DESCRIPTION}
227
 
228
- **Supported Languages:** Luganda (lug), Acholi (ach), Swahili (swa), English (eng)
229
 
230
  </div>
231
  """)
232
 
 
 
 
 
 
 
 
 
233
  with gr.Tabs():
234
 
235
- # Tab 1: Submit Model
236
- with gr.Tab("πŸš€ Submit Model", id="submit"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  gr.Markdown("""
239
- ### Submit Your Translation Model
240
 
241
- Enter a HuggingFace model path (e.g., `microsoft/DialoGPT-medium`) or use `google-translate` to benchmark against Google Translate.
 
 
 
242
 
243
- **Supported Model Types:** Gemma, Qwen, Llama, NLLB, Google Translate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  """)
245
 
246
  with gr.Row():
247
- with gr.Column(scale=2):
248
- model_input = gr.Textbox(
249
- label="πŸ€— HuggingFace Model Path",
250
- placeholder="e.g., Sunbird/gemma3-12b-ug40-merged",
251
- info="Enter the full HuggingFace model path or 'google-translate'"
 
 
 
252
  )
253
 
254
  author_input = gr.Textbox(
255
- label="πŸ‘€ Author/Organization",
256
  placeholder="Your name or organization",
257
  value="Anonymous"
258
  )
259
 
260
- submit_btn = gr.Button(
261
- "πŸ”„ Evaluate Model",
262
- variant="primary",
263
- size="lg"
264
  )
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
  with gr.Column(scale=1):
267
- gr.Markdown("""
268
- **πŸ“‹ Evaluation Process:**
269
- 1. Model validation
270
- 2. Loading model weights
271
- 3. Generating translations
272
- 4. Calculating metrics
273
- 5. Updating leaderboard
274
-
275
- ⏱️ **Expected time:** 5-15 minutes
276
- """)
277
 
278
  # Results section
279
- with gr.Group():
280
- results_output = gr.Markdown(label="πŸ“Š Results")
281
-
282
- with gr.Row():
283
- with gr.Column():
284
- results_leaderboard = gr.Dataframe(
285
- label="πŸ“ˆ Updated Leaderboard",
286
- interactive=False
287
- )
288
-
289
- with gr.Row():
290
- results_plot = gr.Plot(label="πŸ“Š Leaderboard Ranking")
291
- detailed_plot = gr.Plot(label="πŸ” Detailed Performance")
292
 
293
- # Tab 2: Leaderboard
294
  with gr.Tab("πŸ† Leaderboard", id="leaderboard"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  with gr.Row():
297
- search_input = gr.Textbox(
298
- label="πŸ” Search Models",
299
- placeholder="Search by model name, author, or path...",
300
- scale=3
301
- )
302
- refresh_btn = gr.Button("πŸ”„ Refresh", scale=1)
303
 
304
- summary_stats = gr.Markdown(label="πŸ“Š Summary")
 
 
 
 
 
 
 
305
 
306
  with gr.Row():
307
  leaderboard_table = gr.Dataframe(
308
- label="πŸ† Model Rankings",
309
  interactive=False,
310
  wrap=True
311
  )
 
 
 
 
 
 
 
 
 
 
 
312
 
313
  with gr.Row():
314
- leaderboard_viz = gr.Plot(label="πŸ“Š Performance Comparison")
315
- summary_viz = gr.Plot(label="πŸ“ˆ Top Models Summary")
 
 
316
 
317
- # Tab 3: Documentation
318
  with gr.Tab("πŸ“š Documentation", id="docs"):
 
 
319
 
320
- gr.Markdown("""
321
- ## πŸ“– How to Use the SALT Translation Leaderboard
 
 
 
 
322
 
323
- ### πŸš€ Submitting Your Model
 
324
 
325
- 1. **Prepare your model**: Ensure your model is uploaded to HuggingFace Hub
326
- 2. **Enter model path**: Use the format `username/model-name`
327
- 3. **Add your details**: Provide your name or organization
328
- 4. **Submit**: Click "Evaluate Model" and wait for results
329
 
330
- ### πŸ“Š Metrics Explained
331
 
332
- - **Quality Score**: Combined metric (0-1, higher is better)
333
- - **BLEU**: Translation quality (0-100, higher is better)
334
- - **ChrF**: Character-level F-score (0-1, higher is better)
335
- - **ROUGE-L**: Longest common subsequence (0-1, higher is better)
336
- - **CER/WER**: Character/Word Error Rate (0-1, lower is better)
337
 
338
- ### 🎯 Supported Models
 
 
 
339
 
340
- - **Gemma**: Google's Gemma models fine-tuned for translation
341
- - **Qwen**: Alibaba's Qwen models
342
- - **Llama**: Meta's Llama models
343
- - **NLLB**: Facebook's No Language Left Behind models
344
- - **Google Translate**: Baseline comparison
345
 
346
- ### πŸ“‹ Dataset Information
 
 
 
347
 
348
- **SALT Dataset**: Sunbird AI's comprehensive translation dataset
349
- - **Languages**: Luganda, Acholi, Swahili, English
350
- - **Evaluation Size**: {MAX_EVAL_SAMPLES} samples
351
- - **Domains**: Multiple domains including news, literature, and conversations
352
 
353
- ### πŸ”„ API Access
 
 
 
 
354
 
355
- The leaderboard data is available via HuggingFace Datasets:
356
- ```python
357
- from datasets import load_dataset
358
- leaderboard = load_dataset("{LEADERBOARD_DATASET}")
 
 
 
 
 
 
 
 
 
 
359
  ```
360
 
361
- ### 🀝 Contributing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
  This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
364
- For issues or suggestions, please contact us or submit a GitHub issue.
365
 
366
- ### πŸ“œ License & Citation
 
 
 
367
 
368
  If you use this leaderboard in your research, please cite:
369
- ```
 
370
  @misc{{salt_leaderboard_2024,
371
- title={{SALT Translation Leaderboard}},
372
  author={{Sunbird AI}},
373
  year={{2024}},
374
  url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard}}
375
  }}
376
  ```
 
 
 
 
 
 
377
  """)
378
 
379
- # Event handlers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  submit_btn.click(
381
- fn=evaluate_submission,
382
- inputs=[model_input, author_input],
383
- outputs=[results_output, results_leaderboard, results_plot, detailed_plot],
384
- show_progress=True
385
  )
386
 
 
 
 
 
 
 
 
 
 
387
  refresh_btn.click(
388
- fn=update_leaderboard_display,
389
- inputs=[search_input],
390
- outputs=[leaderboard_table, leaderboard_viz, summary_viz, summary_stats]
391
  )
392
 
393
- search_input.change(
394
- fn=update_leaderboard_display,
395
- inputs=[search_input],
396
- outputs=[leaderboard_table, leaderboard_viz, summary_viz, summary_stats]
 
 
 
 
 
 
 
 
 
397
  )
398
 
399
- # Load initial leaderboard data
400
  demo.load(
401
- fn=update_leaderboard_display,
402
- inputs=[],
403
- outputs=[leaderboard_table, leaderboard_viz, summary_viz, summary_stats]
404
  )
405
 
406
- # Launch the app
407
  if __name__ == "__main__":
408
  demo.launch(
409
  server_name="0.0.0.0",
410
  server_port=7860,
411
  share=False,
412
- show_error=True
 
413
  )
 
1
  # app.py
2
  import gradio as gr
3
  import pandas as pd
 
 
 
4
  import json
 
 
5
  import traceback
6
+ from datetime import datetime
7
+ from typing import Optional, Dict, Tuple
8
 
9
  # Import our modules
10
+ from src.test_set import get_public_test_set, get_complete_test_set, create_test_set_download, validate_test_set_integrity
11
+ from src.validation import validate_submission_complete
12
+ from src.evaluation import evaluate_predictions, generate_evaluation_report, get_google_translate_baseline
13
+ from src.leaderboard import (
14
+ load_leaderboard, add_model_to_leaderboard, get_leaderboard_stats,
15
+ filter_leaderboard, export_leaderboard, get_model_comparison
16
+ )
17
+ from src.plotting import (
18
+ create_leaderboard_ranking_plot, create_metrics_comparison_plot,
19
+ create_language_pair_heatmap, create_coverage_analysis_plot,
20
+ create_model_performance_timeline, create_google_comparison_plot,
21
+ create_detailed_model_analysis, create_submission_summary_plot
22
+ )
23
+ from src.utils import sanitize_model_name, get_all_language_pairs, get_google_comparable_pairs
24
  from config import *
25
 
26
  # Global variables for caching
27
  current_leaderboard = None
28
+ public_test_set = None
29
+ complete_test_set = None
30
 
31
def initialize_data():
    """Initialize test sets and leaderboard data.

    Fills the module-level ``public_test_set``, ``complete_test_set`` and
    ``current_leaderboard`` caches and prints a short summary.

    Returns:
        bool: True when every step succeeded, False when any step raised.
        On failure the traceback is printed; globals assigned before the
        failing step keep their new values.
    """
    global public_test_set, complete_test_set, current_leaderboard

    try:
        print("🔄 Initializing SALT Translation Leaderboard...")

        # Load test sets
        print("📥 Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        # Load leaderboard
        print("🏆 Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        print("✅ Initialization complete!")
        print(f" - Test set: {len(public_test_set):,} samples")
        print(f" - Language pairs: {len(get_all_language_pairs())}")
        print(f" - Current models: {len(current_leaderboard)}")

        return True

    except Exception as e:
        # Startup boundary: report the problem and let the caller decide how
        # to surface a degraded state instead of aborting the import.
        print(f"❌ Initialization failed: {e}")
        traceback.print_exc()
        return False
 
 
 
 
 
 
58
 
59
def download_test_set() -> Tuple[Optional[str], str]:
    """Create downloadable test set and return file path and info.

    Returns:
        A ``(download_path, message)`` pair. ``download_path`` is the first
        value returned by ``create_test_set_download()``, or None when
        anything fails; ``message`` is Markdown describing the file or the
        error.
    """
    global public_test_set

    try:
        # NOTE(review): create_test_set_download() takes no arguments, so
        # this only refreshes the module cache — confirm it is still needed.
        if public_test_set is None:
            public_test_set = get_public_test_set()

        # Create download file
        download_path, stats = create_test_set_download()

        # Create info message
        info_msg = "\n".join([
            "",
            "📥 **SALT Test Set Downloaded Successfully!**",
            "",
            "**Dataset Statistics:**",
            f"- **Total Samples**: {stats['total_samples']:,}",
            f"- **Language Pairs**: {stats['language_pairs']}",
            f"- **Google Comparable**: {stats['google_comparable_samples']:,} samples",
            f"- **Languages**: {', '.join(stats['languages'])}",
            "",
            "**File Format:**",
            "- `sample_id`: Unique identifier for each sample",
            "- `source_text`: Text to be translated",
            "- `source_language`: Source language code",
            "- `target_language`: Target language code",
            "- `domain`: Content domain (if available)",
            "- `google_comparable`: Whether this pair can be compared with Google Translate",
            "",
            "**Next Steps:**",
            "1. Run your model on the source texts",
            "2. Create a CSV/JSON file with columns: `sample_id`, `prediction`",
            "3. Upload your predictions using the \"Submit Predictions\" tab",
            "",
        ])

        return download_path, info_msg

    except Exception as e:
        error_msg = f"❌ Error creating test set download: {str(e)}"
        return None, error_msg
99
 
100
def _read_uploaded_file(file) -> Tuple[object, str]:
    """Return ``(content, filename)`` for an upload given as a path or a file object."""
    import os

    # Depending on the Gradio version/configuration the callback receives a
    # path string rather than an open file object.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
        with open(path, "rb") as handle:
            return handle.read(), path

    filename = getattr(file, "name", "")
    if hasattr(file, "read"):
        return file.read(), filename

    # Wrapper objects that only carry the temp-file path in ``name``.
    with open(filename, "rb") as handle:
        return handle.read(), filename


def validate_submission(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame]]:
    """Validate uploaded prediction file.

    Args:
        file: Upload from the file component; a path or a file-like object.
        model_name: Name of the submitted model; must not be blank.
        author: Accepted for interface parity; not used here.
        description: Accepted for interface parity; not used here.

    Returns:
        ``(report, predictions)``. ``predictions`` is the value stored under
        ``'predictions'`` in the validation result when it is marked valid,
        otherwise None. Any exception is converted into an error report.
    """
    global complete_test_set

    try:
        if file is None:
            return "❌ Please upload a predictions file", None

        # A cleared textbox may arrive as None; treat it like an empty name
        # instead of failing on ``.strip()``.
        if not (model_name or "").strip():
            return "❌ Please provide a model name", None

        # Read file content
        file_content, filename = _read_uploaded_file(file)

        # Get test set for validation
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        # Validate submission
        validation_result = validate_submission_complete(
            file_content, filename, complete_test_set, model_name
        )

        if validation_result['valid']:
            return validation_result['report'], validation_result['predictions']
        return validation_result['report'], None

    except Exception as e:
        error_msg = f"❌ Validation error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None
133
+
134
def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    validation_info: Dict
) -> Tuple[str, Optional[pd.DataFrame], object, object]:
    """Evaluate validated predictions and update leaderboard.

    Args:
        predictions_df: Predictions produced by validation; None means there
            is nothing to evaluate.
        model_name: Name entered by the submitter.
        author: Submitter name; "Anonymous" is stored when empty.
        description: Optional free text; an empty string is stored when
            missing.
        validation_info: Passed through to the leaderboard entry and the
            submission summary plot.

    Returns:
        ``(message, leaderboard, summary_plot, ranking_plot)``. On failure
        the message describes the error and the other three items are None.
    """
    global complete_test_set, current_leaderboard

    try:
        if predictions_df is None:
            return "❌ No valid predictions to evaluate", None, None, None

        # Get complete test set with targets
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        # Run evaluation
        print(f"🔄 Evaluating {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set)

        if evaluation_results.get('error'):
            return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None

        # Add to leaderboard
        print("🏆 Adding to leaderboard...")
        model_type = "user_submission"  # Could be enhanced to detect model type
        # Sanitize once so the stored name and the rank lookup cannot diverge.
        stored_name = sanitize_model_name(model_name)

        updated_leaderboard = add_model_to_leaderboard(
            model_name=stored_name,
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            validation_info=validation_info,
            model_type=model_type,
            description=description or ""
        )

        # Update global leaderboard
        current_leaderboard = updated_leaderboard

        # Generate evaluation report
        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualization plots
        summary_plot = create_submission_summary_plot(validation_info, evaluation_results)
        ranking_plot = create_leaderboard_ranking_plot(updated_leaderboard)

        # Rank is the row's position in the table, not its index label, so a
        # leaderboard that was sorted without resetting its index still
        # reports the right place.
        # NOTE(review): assumes rows are ordered best-first — confirm in
        # add_model_to_leaderboard.
        position = next(
            (i for i, name in enumerate(updated_leaderboard['model_name']) if name == stored_name),
            None,
        )
        # The entry is already stored at this point, so a missing row must
        # not turn the whole run into an error.
        rank_text = f"#{position + 1}" if position is not None else "unranked"
        total_models = len(updated_leaderboard)

        averages = evaluation_results['averages']
        coverage = evaluation_results['summary']

        # Format success message
        success_msg = "\n".join([
            "",
            "🎉 **Evaluation Complete!**",
            "",
            "**Your Results:**",
            f"- **Model**: {model_name}",
            f"- **Rank**: {rank_text} out of {total_models} models",
            f"- **Quality Score**: {averages.get('quality_score', 0):.4f}",
            f"- **BLEU**: {averages.get('bleu', 0):.2f}",
            f"- **ChrF**: {averages.get('chrf', 0):.4f}",
            "",
            "**Coverage:**",
            f"- **Samples Evaluated**: {evaluation_results['evaluated_samples']:,}",
            f"- **Language Pairs**: {coverage['language_pairs_covered']}",
            f"- **Google Comparable**: {coverage['google_comparable_pairs']} pairs",
            "",
            f"{report}",
            "",
        ])

        return success_msg, updated_leaderboard, summary_plot, ranking_plot

    except Exception as e:
        error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None
209
 
210
def refresh_leaderboard_display(
    search_query: str = "",
    model_type_filter: str = "all",
    min_coverage: float = 0.0,
    google_only: bool = False
) -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh and filter leaderboard display.

    Args:
        search_query: Forwarded to ``filter_leaderboard`` as ``search_query``.
        model_type_filter: Forwarded as ``model_type``.
        min_coverage: Forwarded as ``min_coverage``.
        google_only: Forwarded as ``google_comparable_only``.

    Returns:
        ``(filtered_df, ranking_plot, comparison_plot, stats_text)``. When
        anything raises, an empty DataFrame, two None plots and an error
        message are returned instead.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            current_leaderboard = load_leaderboard()

        # Apply filters
        filtered_df = filter_leaderboard(
            current_leaderboard,
            search_query=search_query,
            model_type=model_type_filter,
            min_coverage=min_coverage,
            google_comparable_only=google_only
        )

        # Create plots
        ranking_plot = create_leaderboard_ranking_plot(filtered_df)
        comparison_plot = create_metrics_comparison_plot(filtered_df)

        # Get stats
        stats = get_leaderboard_stats(filtered_df)

        best_model = stats['best_model']
        best_name = best_model['name'] if best_model else 'None'
        latest = stats['latest_submission']
        # str() keeps the date slice working if the value is not plain text.
        latest_text = str(latest)[:10] if latest else 'None'

        stats_text = "\n".join([
            "",
            "📊 **Leaderboard Statistics**",
            "",
            f"- **Total Models**: {stats['total_models']}",
            f"- **Average Quality Score**: {stats['avg_quality_score']:.4f}",
            f"- **Google Comparable Models**: {stats['google_comparable_models']}",
            "",
            f"**Best Model**: {best_name}",
            f"**Latest Submission**: {latest_text}",
            "",
        ])

        return filtered_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        # The UI only shows the short message; keep the full traceback in
        # the server log, as initialize_data does.
        traceback.print_exc()
        error_msg = f"Error loading leaderboard: {str(e)}"
        empty_df = pd.DataFrame()
        return empty_df, None, None, error_msg
255
+
256
def get_model_details(model_name: str) -> Tuple[str, object]:
    """Get detailed analysis for a specific model.

    Args:
        model_name: Value matched against the leaderboard's ``model_name``
            column.

    Returns:
        ``(details_text, detail_plot)``. The plot is None when the
        leaderboard is not loaded, the model is not found, or an error
        occurs; the text then explains why.
    """
    global current_leaderboard

    try:
        if current_leaderboard is None:
            return "Leaderboard not loaded", None

        # Find model
        model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]

        if model_row.empty:
            return f"Model '{model_name}' not found", None

        model_info = model_row.iloc[0]

        # Parse detailed metrics; a missing column, a non-string cell or
        # malformed JSON all fall back to an empty result.
        try:
            detailed_results = json.loads(model_info['detailed_metrics'])
        except (KeyError, TypeError, ValueError):
            detailed_results = {}

        # Create detailed plot
        detail_plot = create_detailed_model_analysis(detailed_results, model_name)

        # A missing description can be NaN, which is truthy and would
        # otherwise be shown as "nan".
        description = model_info['description']
        if pd.isna(description) or not description:
            description = 'No description provided'

        # str() keeps the date slice working if the cell is not plain text.
        submission_date = str(model_info['submission_date'])[:10]

        # Format model details
        details_text = "\n".join([
            "",
            f"# 🔍 Model Details: {model_name}",
            "",
            "**Basic Information:**",
            f"- **Author**: {model_info['author']}",
            f"- **Submission Date**: {submission_date}",
            f"- **Model Type**: {model_info['model_type']}",
            f"- **Description**: {description}",
            "",
            "**Performance Metrics:**",
            f"- **Quality Score**: {model_info['quality_score']:.4f}",
            f"- **BLEU**: {model_info['bleu']:.2f}",
            f"- **ChrF**: {model_info['chrf']:.4f}",
            f"- **ROUGE-1**: {model_info['rouge1']:.4f}",
            f"- **ROUGE-L**: {model_info['rougeL']:.4f}",
            "",
            "**Coverage Information:**",
            f"- **Total Samples**: {model_info['total_samples']:,}",
            f"- **Language Pairs Covered**: {model_info['language_pairs_covered']}",
            f"- **Google Comparable Pairs**: {model_info['google_pairs_covered']}",
            f"- **Coverage Rate**: {model_info['coverage_rate']:.1%}",
            "",
            "**Google Translate Comparison:**",
            f"- **Google Quality Score**: {model_info['google_quality_score']:.4f}",
            f"- **Google BLEU**: {model_info['google_bleu']:.2f}",
            f"- **Google ChrF**: {model_info['google_chrf']:.4f}",
            "",
        ])

        return details_text, detail_plot

    except Exception as e:
        error_msg = f"Error getting model details: {str(e)}"
        return error_msg, None
315
 
316
+ # Initialize data on startup
317
+ print("πŸš€ Starting SALT Translation Leaderboard...")
318
+ initialization_success = initialize_data()
 
319
 
320
  # Create Gradio interface
321
  with gr.Blocks(
 
323
  theme=gr.themes.Soft(),
324
  css="""
325
  .gradio-container {
326
+ max-width: 1400px !important;
327
+ margin: 0 auto;
328
  }
329
  .main-header {
330
  text-align: center;
331
  margin-bottom: 2rem;
332
+ padding: 2rem;
333
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
334
+ color: white;
335
+ border-radius: 10px;
336
  }
337
+ .metric-box {
338
  background: #f8f9fa;
339
  padding: 1rem;
340
+ border-radius: 8px;
341
  margin: 0.5rem 0;
342
+ border-left: 4px solid #007bff;
343
+ }
344
+ .error-box {
345
+ background: #f8d7da;
346
+ color: #721c24;
347
+ padding: 1rem;
348
+ border-radius: 8px;
349
+ border-left: 4px solid #dc3545;
350
+ }
351
+ .success-box {
352
+ background: #d4edda;
353
+ color: #155724;
354
+ padding: 1rem;
355
+ border-radius: 8px;
356
+ border-left: 4px solid #28a745;
357
  }
358
  """
359
  ) as demo:
 
366
 
367
  {DESCRIPTION}
368
 
369
+ **Supported Languages**: {len(ALL_UG40_LANGUAGES)} Ugandan languages | **Google Comparable**: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages
370
 
371
  </div>
372
  """)
373
 
374
+ # Status indicator
375
+ if initialization_success:
376
+ status_msg = "βœ… System initialized successfully"
377
+ else:
378
+ status_msg = "❌ System initialization failed - some features may not work"
379
+
380
+ gr.Markdown(f"**Status**: {status_msg}")
381
+
382
  with gr.Tabs():
383
 
384
+ # Tab 1: Get Test Set
385
+ with gr.Tab("πŸ“₯ Download Test Set", id="download"):
386
+ gr.Markdown("""
387
+ ## πŸ“‹ Get the SALT Translation Test Set
388
+
389
+ Download the standardized test set to evaluate your translation model.
390
+ The test set contains source texts in multiple Ugandan languages that you need to translate.
391
+ """)
392
+
393
+ with gr.Row():
394
+ download_btn = gr.Button("πŸ“₯ Download Test Set", variant="primary", size="lg")
395
+
396
+ with gr.Row():
397
+ with gr.Column():
398
+ download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
399
+ with gr.Column():
400
+ download_info = gr.Markdown(label="ℹ️ Test Set Information")
401
 
402
  gr.Markdown("""
403
+ ### πŸ“– Instructions
404
 
405
+ 1. **Download** the test set using the button above
406
+ 2. **Run your model** on the source texts to generate translations
407
+ 3. **Create a predictions file** with your model's outputs
408
+ 4. **Submit** your predictions using the "Submit Predictions" tab
409
 
410
+ ### πŸ“‹ Required Prediction Format
411
+
412
+ Your predictions file must be a CSV/TSV/JSON with these columns:
413
+ - `sample_id`: The unique identifier from the test set
414
+ - `prediction`: Your model's translation for that sample
415
+
416
+ **Example CSV:**
417
+ ```
418
+ sample_id,prediction
419
+ salt_000001,Oli otya mukwano gwange?
420
+ salt_000002,Webale nyo olukya
421
+ ...
422
+ ```
423
+ """)
424
+
425
+ # Tab 2: Submit Predictions
426
+ with gr.Tab("πŸš€ Submit Predictions", id="submit"):
427
+ gr.Markdown("""
428
+ ## 🎯 Submit Your Model's Predictions
429
+
430
+ Upload your model's predictions on the SALT test set for evaluation.
431
  """)
432
 
433
  with gr.Row():
434
+ with gr.Column(scale=1):
435
+ # Model information
436
+ gr.Markdown("### πŸ“ Model Information")
437
+
438
+ model_name_input = gr.Textbox(
439
+ label="πŸ€– Model Name",
440
+ placeholder="e.g., MyTranslator-v1.0",
441
+ info="Unique name for your model"
442
  )
443
 
444
  author_input = gr.Textbox(
445
+ label="πŸ‘€ Author/Organization",
446
  placeholder="Your name or organization",
447
  value="Anonymous"
448
  )
449
 
450
+ description_input = gr.Textbox(
451
+ label="πŸ“„ Description (Optional)",
452
+ placeholder="Brief description of your model",
453
+ lines=3
454
  )
455
+
456
+ # File upload
457
+ gr.Markdown("### πŸ“€ Upload Predictions")
458
+
459
+ predictions_file = gr.File(
460
+ label="πŸ“‚ Predictions File",
461
+ file_types=[".csv", ".tsv", ".json"],
462
+ info="CSV/TSV/JSON file with your model's predictions"
463
+ )
464
+
465
+ validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
466
+ submit_btn = gr.Button("πŸš€ Submit for Evaluation", variant="primary", interactive=False)
467
 
468
  with gr.Column(scale=1):
469
+ gr.Markdown("### πŸ“Š Validation Results")
470
+ validation_output = gr.Markdown()
 
 
 
 
 
 
 
 
471
 
472
  # Results section
473
+ gr.Markdown("### πŸ† Evaluation Results")
474
+
475
+ with gr.Row():
476
+ evaluation_output = gr.Markdown()
477
+
478
+ with gr.Row():
479
+ with gr.Column():
480
+ submission_plot = gr.Plot(label="πŸ“ˆ Your Submission Analysis")
481
+ with gr.Column():
482
+ updated_leaderboard_plot = gr.Plot(label="πŸ† Updated Leaderboard")
483
+
484
+ with gr.Row():
485
+ results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard", interactive=False)
486
 
487
+ # Tab 3: Leaderboard
488
  with gr.Tab("πŸ† Leaderboard", id="leaderboard"):
489
+ with gr.Row():
490
+ with gr.Column(scale=3):
491
+ search_input = gr.Textbox(
492
+ label="πŸ” Search Models",
493
+ placeholder="Search by model name, author...",
494
+ )
495
+ with gr.Column(scale=1):
496
+ model_type_dropdown = gr.Dropdown(
497
+ label="πŸ”§ Model Type",
498
+ choices=["all", "user_submission", "baseline"],
499
+ value="all"
500
+ )
501
+ with gr.Column(scale=1):
502
+ min_coverage_slider = gr.Slider(
503
+ label="πŸ“Š Min Coverage",
504
+ minimum=0.0,
505
+ maximum=1.0,
506
+ value=0.0,
507
+ step=0.1
508
+ )
509
+ with gr.Column(scale=1):
510
+ google_only_checkbox = gr.Checkbox(
511
+ label="πŸ€– Google Comparable Only",
512
+ value=False
513
+ )
514
 
515
  with gr.Row():
516
+ refresh_btn = gr.Button("πŸ”„ Refresh", variant="secondary")
 
 
 
 
 
517
 
518
+ with gr.Row():
519
+ leaderboard_stats = gr.Markdown()
520
+
521
+ with gr.Row():
522
+ with gr.Column():
523
+ leaderboard_plot = gr.Plot(label="πŸ† Rankings")
524
+ with gr.Column():
525
+ comparison_plot = gr.Plot(label="πŸ“Š Multi-Metric Comparison")
526
 
527
  with gr.Row():
528
  leaderboard_table = gr.Dataframe(
529
+ label="πŸ“ˆ Full Leaderboard",
530
  interactive=False,
531
  wrap=True
532
  )
533
+
534
+ # Tab 4: Model Analysis
535
+ with gr.Tab("πŸ” Model Analysis", id="analysis"):
536
+ with gr.Row():
537
+ model_select = gr.Dropdown(
538
+ label="πŸ€– Select Model",
539
+ choices=[],
540
+ value=None,
541
+ info="Choose a model for detailed analysis"
542
+ )
543
+ analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
544
 
545
  with gr.Row():
546
+ model_details = gr.Markdown()
547
+
548
+ with gr.Row():
549
+ model_analysis_plot = gr.Plot(label="πŸ“Š Detailed Performance Analysis")
550
 
551
+ # Tab 5: Documentation
552
  with gr.Tab("πŸ“š Documentation", id="docs"):
553
+ gr.Markdown(f"""
554
+ # 📖 SALT Translation Leaderboard Documentation
555
 
556
+ ## 🎯 Overview
557
+
558
+ The SALT Translation Leaderboard is a scientific evaluation platform for translation models on Ugandan languages.
559
+ Submit your model's predictions on our standardized test set to see how it compares with other models.
560
+
561
+ ## 🗣️ Supported Languages
562
 
563
+ **All UG40 Languages ({len(ALL_UG40_LANGUAGES)} total):**
564
+ {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in ALL_UG40_LANGUAGES])}
565
 
566
+ **Google Translate Comparable ({len(GOOGLE_SUPPORTED_LANGUAGES)} languages):**
567
+ {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in GOOGLE_SUPPORTED_LANGUAGES])}
 
 
568
 
569
+ ## 📊 Evaluation Metrics
570
 
571
+ ### Primary Metrics
572
+ - **Quality Score**: Composite metric (0-1, higher better) combining multiple metrics
573
+ - **BLEU**: Translation quality score (0-100, higher better)
574
+ - **ChrF**: Character-level F-score (0-1, higher better)
 
575
 
576
+ ### Secondary Metrics
577
+ - **ROUGE-1/ROUGE-L**: Recall-oriented metrics (0-1, higher better)
578
+ - **CER/WER**: Character/Word Error Rate (0-1, lower better)
579
+ - **Length Ratio**: Prediction/reference length ratio
580
 
581
+ ## 🔄 Submission Process
 
 
 
 
582
 
583
+ ### Step 1: Download Test Set
584
+ 1. Go to "Download Test Set" tab
585
+ 2. Click "Download Test Set" button
586
+ 3. Save the `salt_test_set.csv` file
587
 
588
+ ### Step 2: Generate Predictions
589
+ 1. Load the test set in your code
590
+ 2. For each row, translate `source_text` from `source_language` to `target_language`
591
+ 3. Save results as CSV with columns: `sample_id`, `prediction`
592
 
593
+ ### Step 3: Submit & Evaluate
594
+ 1. Go to "Submit Predictions" tab
595
+ 2. Fill in model information
596
+ 3. Upload your predictions file
597
+ 4. Validate and submit for evaluation
598
 
599
+ ## 📋 File Formats
600
+
601
+ ### Test Set Format
602
+ ```csv
603
+ sample_id,source_text,source_language,target_language,domain,google_comparable
604
+ salt_000001,"Hello world",eng,lug,general,true
605
+ salt_000002,"How are you?",eng,ach,conversation,true
606
+ ```
607
+
608
+ ### Predictions Format
609
+ ```csv
610
+ sample_id,prediction
611
+ salt_000001,"Amakuru ensi"
612
+ salt_000002,"Ibino nining?"
613
  ```
614
 
615
+ ## 🏆 Leaderboard Types
616
+
617
+ ### 1. Full UG40 Leaderboard
618
+ - Includes all {len(get_all_language_pairs())} language pairs
619
+ - Complete evaluation across all Ugandan languages
620
+ - Primary ranking system
621
+
622
+ ### 2. Google Translate Comparable
623
+ - Limited to {len(get_google_comparable_pairs())} pairs
624
+ - Only languages supported by Google Translate
625
+ - Allows direct comparison with Google Translate baseline
626
+
627
+ ## 🔬 Scientific Rigor
628
+
629
+ - **Standardized Evaluation**: Same test set for all models
630
+ - **Multiple Metrics**: Comprehensive evaluation beyond just BLEU
631
+ - **Coverage Tracking**: Transparency about what each model covers
632
+ - **Reproducible**: All evaluation code and data available
633
+
634
+ ## 🤝 Contributing
635
 
636
  This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
 
637
 
638
+ **Contact**: [[email protected]](mailto:[email protected])
639
+ **GitHub**: [Sunbird AI GitHub](https://github.com/sunbirdai)
640
+
641
+ ## 📄 Citation
642
 
643
  If you use this leaderboard in your research, please cite:
644
+
645
+ ```bibtex
646
  @misc{{salt_leaderboard_2024,
647
+ title={{SALT Translation Leaderboard: Evaluation of Translation Models on Ugandan Languages}},
648
  author={{Sunbird AI}},
649
  year={{2024}},
650
  url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard}}
651
  }}
652
  ```
653
+
654
+ ## 🔗 Related Resources
655
+
656
+ - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
657
+ - **Sunbird AI Models**: [Sunbird Organization](https://huggingface.co/Sunbird)
658
+ - **Research Papers**: [Sunbird AI Publications](https://sunbird.ai/research)
659
  """)
660
 
661
+ # Event handlers with state management
662
+ predictions_validated = gr.State(value=None)
663
+ validation_info_state = gr.State(value=None)
664
+
665
+ # Download test set
666
+ download_btn.click(
667
+ fn=download_test_set,
668
+ outputs=[download_file, download_info]
669
+ )
670
+
671
+ # Validate predictions
672
def handle_validation(file, model_name, author, description):
    """Validate an uploaded predictions file and feed the result to the UI.

    Delegates to ``validate_submission`` and maps its result onto the four
    outputs wired to ``validate_btn``: the validation report, the two state
    holders, and the submit button.

    Args:
        file: Value of the predictions file input.
        model_name: Value of the model name input.
        author: Value of the author input.
        description: Value of the description input.

    Returns:
        A 4-tuple ``(report, predictions, predictions, button_update)``.
        The submission counts as valid only when ``predictions`` is not
        ``None``; ``button_update`` enables or disables the submit button
        accordingly.
    """
    report, predictions = validate_submission(file, model_name, author, description)
    is_valid = predictions is not None

    # A bare bool sent to a Button output would be taken as the button's
    # value, not its enabled state, so send an explicit property update.
    submit_update = gr.update(interactive=is_valid)

    # NOTE(review): the same object is stored in both state outputs. If
    # validate_submission can expose separate validation info (e.g. coverage),
    # that should be returned as the third item instead -- confirm.
    return report, predictions, predictions, submit_update
676
+
677
+ validate_btn.click(
678
+ fn=handle_validation,
679
+ inputs=[predictions_file, model_name_input, author_input, description_input],
680
+ outputs=[validation_output, predictions_validated, validation_info_state, submit_btn]
681
+ )
682
+
683
+ # Submit for evaluation
684
def handle_submission(predictions, model_name, author, description, validation_info):
    """Evaluate a previously validated submission.

    Args:
        predictions: Validated predictions held in UI state, or ``None`` when
            no successful validation has happened yet.
        model_name: Value of the model name input.
        author: Value of the author input.
        description: Value of the description input.
        validation_info: Object stored by the validation step. Coverage is
            read from its ``coverage`` key (if it is a dict) or ``coverage``
            attribute; 0.8 is used when neither is present.

    Returns:
        Four values for the outputs wired to ``submit_btn``. When
        ``predictions`` is ``None`` this is an error message followed by three
        ``None`` placeholders; otherwise it is whatever
        ``evaluate_submission`` returns.
    """
    if predictions is None:
        return "❌ Please validate your submission first", None, None, None

    default_coverage = 0.8
    if isinstance(validation_info, dict):
        # Dict-shaped validation info has no attributes to probe; read the key.
        coverage = validation_info.get('coverage', default_coverage)
    else:
        # getattr with a default already covers the "attribute missing" case.
        coverage = getattr(validation_info, 'coverage', default_coverage)

    validation_dict = {
        'coverage': coverage,
        'report': 'Validation passed',
    }

    return evaluate_submission(predictions, model_name, author, description, validation_dict)
695
+
696
  submit_btn.click(
697
+ fn=handle_submission,
698
+ inputs=[predictions_validated, model_name_input, author_input, description_input, validation_info_state],
699
+ outputs=[evaluation_output, results_table, submission_plot, updated_leaderboard_plot]
 
700
  )
701
 
702
+ # Refresh leaderboard
703
def update_leaderboard_and_dropdown(*args):
    """Refresh the leaderboard view and resync the model-selection dropdown.

    Forwards the given filter values to ``refresh_leaderboard_display`` and
    offers the ``model_name`` column of the resulting table as the dropdown
    choices.

    Returns:
        The table, both plots and the stats text from
        ``refresh_leaderboard_display``, followed by a dropdown update
        carrying the new choices.
    """
    board, first_plot, second_plot, summary = refresh_leaderboard_display(*args)

    # An empty table offers no models to pick from.
    if board.empty:
        names = []
    else:
        names = board['model_name'].tolist()

    return board, first_plot, second_plot, summary, gr.Dropdown(choices=names)
710
+
711
  refresh_btn.click(
712
+ fn=update_leaderboard_and_dropdown,
713
+ inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
714
+ outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
715
  )
716
 
717
+ # Auto-refresh on filter changes
718
+ for input_component in [search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox]:
719
+ input_component.change(
720
+ fn=update_leaderboard_and_dropdown,
721
+ inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
722
+ outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
723
+ )
724
+
725
+ # Model analysis
726
+ analyze_btn.click(
727
+ fn=get_model_details,
728
+ inputs=[model_select],
729
+ outputs=[model_details, model_analysis_plot]
730
  )
731
 
732
+ # Load initial data
733
  demo.load(
734
+ fn=update_leaderboard_and_dropdown,
735
+ inputs=[search_input, model_type_dropdown, min_coverage_slider, google_only_checkbox],
736
+ outputs=[leaderboard_table, leaderboard_plot, comparison_plot, leaderboard_stats, model_select]
737
  )
738
 
739
+ # Launch the application
740
if __name__ == "__main__":
    # `launch()` no longer accepts `enable_queue` in current Gradio releases;
    # request queuing is switched on through the Blocks object instead.
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=False,
        show_error=True,
    )