sudoping01 committed
Commit dce5a82 · verified · 1 Parent(s): 6e99c6f

Update app.py

Files changed (1)
  1. app.py +1349 -179
app.py CHANGED
@@ -1,3 +1,412 @@
 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
@@ -5,15 +414,337 @@ from jiwer import wer, cer
 import os
 from datetime import datetime
 import re
-
 from huggingface_hub import login
 
 # Login to Hugging Face Hub (if token is available)
 token = os.environ.get("HG_TOKEN")
 if token:
 login(token)
 
-
 try:
 dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
 references = {row["id"]: row["text"] for row in dataset}
@@ -22,25 +753,30 @@ except Exception as e:
 print(f"Error loading dataset: {str(e)}")
 references = {}
 
-
 leaderboard_file = "leaderboard.csv"
 if not os.path.exists(leaderboard_file):
-
 sample_data = [
- ["test_1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
- ["test_2", 0.3264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
- ]
 pd.DataFrame(sample_data,
- columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
 print(f"Created new leaderboard file with sample data")
 else:
 leaderboard_df = pd.read_csv(leaderboard_file)
 
-
- if "Combined_Score" not in leaderboard_df.columns:
- leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
- leaderboard_df.to_csv(leaderboard_file, index=False)
- print(f"Added Combined_Score column to existing leaderboard")
 print(f"Loaded leaderboard with {len(leaderboard_df)} entries")
 
 def normalize_text(text):
@@ -103,7 +839,6 @@ def calculate_metrics(predictions_df):
 avg_wer = sum(item["wer"] for item in results) / len(results)
 avg_cer = sum(item["cer"] for item in results) / len(results)
 
-
 weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
 weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
 
@@ -113,26 +848,209 @@ def format_as_percentage(value):
 """Convert decimal to percentage with 2 decimal places"""
 return f"{value * 100:.2f}%"
 
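For context, the two aggregates in the hunk above differ only in weighting: the first is a plain mean over samples, the second weights each sample by its reference length. A toy check with hypothetical numbers, using the same formulas:

```python
# Two hypothetical samples: per-sample WER and reference word count
results = [{"wer": 0.10, "ref_word_count": 20}, {"wer": 0.40, "ref_word_count": 5}]

avg_wer = sum(r["wer"] for r in results) / len(results)              # (0.10 + 0.40) / 2 = 0.25
total_ref_words = sum(r["ref_word_count"] for r in results)          # 25
weighted_wer = sum(r["wer"] * r["ref_word_count"] for r in results) / total_ref_words  # 4.0 / 25 = 0.16
```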
 def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
 """Format leaderboard for display with ranking and percentages"""
 if df is None or len(df) == 0:
- return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
 
-
 display_df = df.copy()
 
-
- display_df = display_df.sort_values(sort_by)
 
 display_df.insert(0, "Rank", range(1, len(display_df) + 1))
 
 for col in ["WER", "CER", "Combined_Score"]:
 if col in display_df.columns:
- display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
 
-
 
- return display_df
 
 def update_ranking(method):
 """Update leaderboard ranking based on selected method"""
@@ -152,52 +1070,91 @@ def update_ranking(method):
 
 except Exception as e:
 print(f"Error updating ranking: {str(e)}")
- return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
 
- def process_submission(model_name, csv_file):
- """Process a new model submission"""
159
  if not model_name or not model_name.strip():
160
- return "Error: Please provide a model name.", None
161
 
162
  if not csv_file:
163
- return "Error: Please upload a CSV file.", None
164
 
165
  try:
166
  df = pd.read_csv(csv_file)
167
 
168
  if len(df) == 0:
169
- return "Error: Uploaded CSV is empty.", None
170
 
171
  if set(df.columns) != {"id", "text"}:
172
- return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None
173
 
174
  if df["id"].duplicated().any():
175
  dup_ids = df[df["id"].duplicated()]["id"].unique()
176
- return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None
177
 
178
  missing_ids = set(references.keys()) - set(df["id"])
179
  extra_ids = set(df["id"]) - set(references.keys())
180
 
181
  if missing_ids:
182
- return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None
183
 
184
  if extra_ids:
185
- return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None
186
 
187
  try:
188
  avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
189
 
190
- # Check for suspiciously low values
191
  if avg_wer < 0.001:
192
- return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None
193
 
194
  except Exception as e:
195
- return f"Error calculating metrics: {str(e)}", None
196
 
197
-
198
  leaderboard = pd.read_csv(leaderboard_file)
199
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
200
-
201
  combined_score = avg_wer * 0.7 + avg_cer * 0.3
202
 
203
  if model_name in leaderboard["Model_Name"].values:
@@ -206,11 +1163,13 @@ def process_submission(model_name, csv_file):
206
  leaderboard.loc[idx, "CER"] = avg_cer
207
  leaderboard.loc[idx, "Combined_Score"] = combined_score
208
  leaderboard.loc[idx, "timestamp"] = timestamp
 
 
209
  updated_leaderboard = leaderboard
210
  else:
211
  new_entry = pd.DataFrame(
212
- [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
213
- columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
214
  )
215
  updated_leaderboard = pd.concat([leaderboard, new_entry])
216
 
@@ -218,11 +1177,24 @@ def process_submission(model_name, csv_file):
218
  updated_leaderboard.to_csv(leaderboard_file, index=False)
219
 
220
  display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
 
 
 
221
 
222
- return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
223
 
224
  except Exception as e:
225
- return f"Error processing submission: {str(e)}", None
226
 
227
  def get_current_leaderboard():
228
  """Get the current leaderboard data for display"""
@@ -230,178 +1202,376 @@ def get_current_leaderboard():
230
  if os.path.exists(leaderboard_file):
231
  current_leaderboard = pd.read_csv(leaderboard_file)
232
 
233
- if "Combined_Score" not in current_leaderboard.columns:
234
- current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
235
- current_leaderboard.to_csv(leaderboard_file, index=False)
236
-
 
 
 
 
 
 
237
  return current_leaderboard
238
  else:
239
- return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
240
  except Exception as e:
241
  print(f"Error getting leaderboard: {str(e)}")
242
- return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
243
 
244
  def create_leaderboard_table():
245
  """Create and format the leaderboard table for display"""
246
  leaderboard_data = get_current_leaderboard()
247
  return prepare_leaderboard_for_display(leaderboard_data)
248
 
249
- with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
250
- gr.Markdown(
251
- """
252
- # πŸ‡²πŸ‡± Bambara ASR Leaderboard
253
-
254
- This leaderboard tracks and evaluates speech recognition models for the Bambara language.
255
- Models are ranked based on Word Error Rate (WER), Character Error Rate (CER), and a combined score.
256
-
257
- ## Current Models Performance
258
- """
259
- )
260
 
261
- current_data = get_current_leaderboard()
 
262
 
263
-
264
- if len(current_data) > 0:
265
- best_model = current_data.sort_values("Combined_Score").iloc[0]
266
- gr.Markdown(f"""
267
- ### πŸ† Current Best Model: **{best_model['Model_Name']}**
268
- * WER: **{best_model['WER']*100:.2f}%**
269
- * CER: **{best_model['CER']*100:.2f}%**
270
- * Combined Score: **{best_model['Combined_Score']*100:.2f}%**
271
- """)
272
 
273
- with gr.Tabs() as tabs:
274
- with gr.TabItem("πŸ… Model Rankings"):
275
 
276
- initial_leaderboard = create_leaderboard_table()
277
-
278
- ranking_method = gr.Radio(
279
- ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
280
- label="Ranking Method",
281
- value="Combined Score (WER 70%, CER 30%)"
282
- )
283
-
284
- leaderboard_view = gr.DataFrame(
285
- value=initial_leaderboard,
286
- interactive=False,
287
- label="Models are ranked by selected metric - lower is better"
288
- )
289
-
290
- ranking_method.change(
291
- fn=update_ranking,
292
- inputs=[ranking_method],
293
- outputs=[leaderboard_view]
294
- )
295
-
296
- with gr.Accordion("Metrics Explanation", open=False):
297
- gr.Markdown(
298
- """
299
- ## Understanding ASR Metrics
300
 
301
  ### Word Error Rate (WER)
302
- WER measures how accurately the ASR system recognizes whole words:
303
- * Lower values indicate better performance
304
- * Calculated as: (Substitutions + Insertions + Deletions) / Total Words
305
- * A WER of 0% means perfect transcription
306
- * A WER of 20% means approximately 1 in 5 words contains an error
 
 
 
307
 
308
  ### Character Error Rate (CER)
309
- CER measures accuracy at the character level:
310
- * More fine-grained than WER
311
- * Better at capturing partial word matches
312
- * Particularly useful for agglutinative languages like Bambara
313
 
314
- ### Combined Score
315
- * Weighted average: 70% WER + 30% CER
316
- * Provides a balanced evaluation of model performance
317
- * Used as the primary ranking metric
- """
319
  )
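The per-sample numbers behind these metrics come from jiwer's `wer` and `cer` helpers, exactly as `calculate_metrics` uses them. A minimal standalone check with made-up strings:

```python
from jiwer import wer, cer

reference = "benchmark reference sentence here"   # hypothetical ground-truth transcript
hypothesis = "benchmark reference sentence"       # hypothetical model output (one word dropped)

print(wer(reference, hypothesis))  # 0.25 -> one deletion out of four reference words
print(cer(reference, hypothesis))  # character-level rate, computed over characters instead of words
```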
320
-
321
- with gr.TabItem("πŸ“Š Submit New Results"):
322
- gr.Markdown(
323
- """
324
- ### Submit a new model for evaluation
325
 
326
- Upload a CSV file with the following format:
327
- * Must contain exactly two columns: 'id' and 'text'
328
- * The 'id' column should match the reference dataset IDs
329
- * The 'text' column should contain your model's transcriptions
330
- """
331
- )
332
-
333
- with gr.Row():
334
- model_name_input = gr.Textbox(
335
- label="Model Name",
336
- placeholder="e.g., MALIBA-AI/bambara-asr"
337
  )
338
- gr.Markdown("*Use a descriptive name to identify your model*")
339
 
340
- with gr.Row():
341
- csv_upload = gr.File(
342
- label="Upload CSV File",
343
- file_types=[".csv"]
344
  )
345
- gr.Markdown("*CSV with columns: id, text*")
346
-
347
- submit_btn = gr.Button("Submit", variant="primary")
348
- output_msg = gr.Textbox(label="Status", interactive=False)
349
- leaderboard_display = gr.DataFrame(
350
- label="Updated Leaderboard",
351
- value=initial_leaderboard,
352
- interactive=False
353
- )
354
-
355
- submit_btn.click(
356
- fn=process_submission,
357
- inputs=[model_name_input, csv_upload],
358
- outputs=[output_msg, leaderboard_display]
359
- )
360
 
361
- with gr.TabItem("πŸ“ Benchmark Dataset"):
362
- gr.Markdown(
363
- """
364
- ## About the Benchmark Dataset
365
 
366
- This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset:
 
367
 
368
- * Contains diverse Bambara speech samples
369
- * Includes various speakers, accents, and dialects
370
- * Covers different speech styles and recording conditions
371
- * Transcribed and validated
372
 
373
- ### How to Generate Predictions
374
 
375
- To submit results to this leaderboard:
 
 
 
 
376
 
377
- 1. Download the audio files from the benchmark dataset
378
- 2. Run your ASR model on the audio files
379
- 3. Generate a CSV file with 'id' and 'text' columns
380
- 4. Submit your results using the form in the "Submit New Results" tab
381
 
382
- ### Evaluation Guidelines
 
 
 
 
383
 
384
- * Text is normalized (lowercase, punctuation removed) before metrics calculation
385
- * Extreme outliers are capped to prevent skewing results
386
- * All submissions are validated for format and completeness
387
-
388
- NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia
389
- """
390
- )
391
-
392
- gr.Markdown(
393
- """
394
- ---
395
- ### About MALIBA-AI
396
-
397
- **MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation**
398
-
399
- *"No Malian Language Left Behind"*
400
 
401
- This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
402
- For more information, visit [MALIBA-AI on Hugging Face](https://huggingface.co/MALIBA-AI).
403
- """
404
- )
405
 
406
  if __name__ == "__main__":
407
  demo.launch()
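Both the removed version above and the new version below validate the same submission format: a CSV with exactly `id` and `text` columns whose ids match the benchmark's eval split. A minimal sketch of producing such a file; the `transcribe` call and the `audio` field are placeholders for your own setup:

```python
import pandas as pd
from datasets import load_dataset

# Same benchmark split the leaderboard app loads
eval_set = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]

rows = [{"id": sample["id"], "text": transcribe(sample["audio"])}  # transcribe(): your model's inference function (hypothetical)
        for sample in eval_set]
pd.DataFrame(rows, columns=["id", "text"]).to_csv("predictions.csv", index=False)
```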
 
1
+ # import gradio as gr
2
+ # import pandas as pd
3
+ # from datasets import load_dataset
4
+ # from jiwer import wer, cer
5
+ # import os
6
+ # from datetime import datetime
7
+ # import re
8
+
9
+ # from huggingface_hub import login
10
+
11
+ # # Login to Hugging Face Hub (if token is available)
12
+ # token = os.environ.get("HG_TOKEN")
13
+ # if token:
14
+ # login(token)
15
+
16
+
17
+ # try:
18
+ # dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
19
+ # references = {row["id"]: row["text"] for row in dataset}
20
+ # print(f"Loaded {len(references)} reference transcriptions")
21
+ # except Exception as e:
22
+ # print(f"Error loading dataset: {str(e)}")
23
+ # references = {}
24
+
25
+
26
+ # leaderboard_file = "leaderboard.csv"
27
+ # if not os.path.exists(leaderboard_file):
28
+
29
+ # sample_data = [
30
+ # ["test_1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
31
+ # ["test_2", 0.3264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
32
+ # ]
33
+ # pd.DataFrame(sample_data,
34
+ # columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
35
+ # print(f"Created new leaderboard file with sample data")
36
+ # else:
37
+ # leaderboard_df = pd.read_csv(leaderboard_file)
38
+
39
+
40
+ # if "Combined_Score" not in leaderboard_df.columns:
41
+ # leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
42
+ # leaderboard_df.to_csv(leaderboard_file, index=False)
43
+ # print(f"Added Combined_Score column to existing leaderboard")
44
+ # print(f"Loaded leaderboard with {len(leaderboard_df)} entries")
45
+
46
+ # def normalize_text(text):
47
+ # """Normalize text for WER/CER calculation"""
48
+ # if not isinstance(text, str):
49
+ # text = str(text)
50
+
51
+ # text = text.lower()
52
+ # text = re.sub(r'[^\w\s]', '', text)
53
+ # text = re.sub(r'\s+', ' ', text).strip()
54
+ # return text
55
+
56
+ # def calculate_metrics(predictions_df):
57
+ # """Calculate WER and CER for predictions."""
58
+ # results = []
59
+ # total_ref_words = 0
60
+ # total_ref_chars = 0
61
+
62
+ # for _, row in predictions_df.iterrows():
63
+ # id_val = row["id"]
64
+ # if id_val not in references:
65
+ # continue
66
+
67
+ # reference = normalize_text(references[id_val])
68
+ # hypothesis = normalize_text(row["text"])
69
+
70
+ # if not reference or not hypothesis:
71
+ # continue
72
+
73
+ # reference_words = reference.split()
74
+ # hypothesis_words = hypothesis.split()
75
+ # reference_chars = list(reference)
76
+
77
+ # try:
78
+ # sample_wer = wer(reference, hypothesis)
79
+ # sample_cer = cer(reference, hypothesis)
80
+
81
+ # sample_wer = min(sample_wer, 2.0)
82
+ # sample_cer = min(sample_cer, 2.0)
83
+
84
+ # total_ref_words += len(reference_words)
85
+ # total_ref_chars += len(reference_chars)
86
+
87
+ # results.append({
88
+ # "id": id_val,
89
+ # "reference": reference,
90
+ # "hypothesis": hypothesis,
91
+ # "ref_word_count": len(reference_words),
92
+ # "ref_char_count": len(reference_chars),
93
+ # "wer": sample_wer,
94
+ # "cer": sample_cer
95
+ # })
96
+ # except Exception as e:
97
+ # print(f"Error processing sample {id_val}: {str(e)}")
98
+ # pass
99
+
100
+ # if not results:
101
+ # raise ValueError("No valid samples for WER/CER calculation")
102
+
103
+ # avg_wer = sum(item["wer"] for item in results) / len(results)
104
+ # avg_cer = sum(item["cer"] for item in results) / len(results)
105
+
106
+
107
+ # weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
108
+ # weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
109
+
110
+ # return avg_wer, avg_cer, weighted_wer, weighted_cer, results
111
+
112
+ # def format_as_percentage(value):
113
+ # """Convert decimal to percentage with 2 decimal places"""
114
+ # return f"{value * 100:.2f}%"
115
+
116
+ # def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
117
+ # """Format leaderboard for display with ranking and percentages"""
118
+ # if df is None or len(df) == 0:
119
+ # return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
120
+
121
+
122
+ # display_df = df.copy()
123
+
124
+
125
+ # display_df = display_df.sort_values(sort_by)
126
+
127
+ # display_df.insert(0, "Rank", range(1, len(display_df) + 1))
128
+
129
+ # for col in ["WER", "CER", "Combined_Score"]:
130
+ # if col in display_df.columns:
131
+ # display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
132
+
133
+
134
+
135
+ # return display_df
136
+
137
+ # def update_ranking(method):
138
+ # """Update leaderboard ranking based on selected method"""
139
+ # try:
140
+ # current_lb = pd.read_csv(leaderboard_file)
141
+
142
+ # if "Combined_Score" not in current_lb.columns:
143
+ # current_lb["Combined_Score"] = current_lb["WER"] * 0.7 + current_lb["CER"] * 0.3
144
+
145
+ # sort_column = "Combined_Score"
146
+ # if method == "WER Only":
147
+ # sort_column = "WER"
148
+ # elif method == "CER Only":
149
+ # sort_column = "CER"
150
+
151
+ # return prepare_leaderboard_for_display(current_lb, sort_column)
152
+
153
+ # except Exception as e:
154
+ # print(f"Error updating ranking: {str(e)}")
155
+ # return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
156
+
157
+ # def process_submission(model_name, csv_file):
158
+ # """Process a new model submission"""
159
+ # if not model_name or not model_name.strip():
160
+ # return "Error: Please provide a model name.", None
161
+
162
+ # if not csv_file:
163
+ # return "Error: Please upload a CSV file.", None
164
+
165
+ # try:
166
+ # df = pd.read_csv(csv_file)
167
+
168
+ # if len(df) == 0:
169
+ # return "Error: Uploaded CSV is empty.", None
170
+
171
+ # if set(df.columns) != {"id", "text"}:
172
+ # return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None
173
+
174
+ # if df["id"].duplicated().any():
175
+ # dup_ids = df[df["id"].duplicated()]["id"].unique()
176
+ # return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None
177
+
178
+ # missing_ids = set(references.keys()) - set(df["id"])
179
+ # extra_ids = set(df["id"]) - set(references.keys())
180
+
181
+ # if missing_ids:
182
+ # return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None
183
+
184
+ # if extra_ids:
185
+ # return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None
186
+
187
+ # try:
188
+ # avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
189
+
190
+ # # Check for suspiciously low values
191
+ # if avg_wer < 0.001:
192
+ # return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None
193
+
194
+ # except Exception as e:
195
+ # return f"Error calculating metrics: {str(e)}", None
196
+
197
+
198
+ # leaderboard = pd.read_csv(leaderboard_file)
199
+ # timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
200
+
201
+ # combined_score = avg_wer * 0.7 + avg_cer * 0.3
202
+
203
+ # if model_name in leaderboard["Model_Name"].values:
204
+ # idx = leaderboard[leaderboard["Model_Name"] == model_name].index
205
+ # leaderboard.loc[idx, "WER"] = avg_wer
206
+ # leaderboard.loc[idx, "CER"] = avg_cer
207
+ # leaderboard.loc[idx, "Combined_Score"] = combined_score
208
+ # leaderboard.loc[idx, "timestamp"] = timestamp
209
+ # updated_leaderboard = leaderboard
210
+ # else:
211
+ # new_entry = pd.DataFrame(
212
+ # [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
213
+ # columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
214
+ # )
215
+ # updated_leaderboard = pd.concat([leaderboard, new_entry])
216
+
217
+ # updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
218
+ # updated_leaderboard.to_csv(leaderboard_file, index=False)
219
+
220
+ # display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
221
+
222
+ # return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
223
+
224
+ # except Exception as e:
225
+ # return f"Error processing submission: {str(e)}", None
226
+
227
+ # def get_current_leaderboard():
228
+ # """Get the current leaderboard data for display"""
229
+ # try:
230
+ # if os.path.exists(leaderboard_file):
231
+ # current_leaderboard = pd.read_csv(leaderboard_file)
232
+
233
+ # if "Combined_Score" not in current_leaderboard.columns:
234
+ # current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
235
+ # current_leaderboard.to_csv(leaderboard_file, index=False)
236
+
237
+ # return current_leaderboard
238
+ # else:
239
+ # return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
240
+ # except Exception as e:
241
+ # print(f"Error getting leaderboard: {str(e)}")
242
+ # return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
243
+
244
+ # def create_leaderboard_table():
245
+ # """Create and format the leaderboard table for display"""
246
+ # leaderboard_data = get_current_leaderboard()
247
+ # return prepare_leaderboard_for_display(leaderboard_data)
248
+
249
+ # with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
250
+ # gr.Markdown(
251
+ # """
252
+ # # πŸ‡²πŸ‡± Bambara ASR Leaderboard
253
+
254
+ # This leaderboard tracks and evaluates speech recognition models for the Bambara language.
255
+ # Models are ranked based on Word Error Rate (WER), Character Error Rate (CER), and a combined score.
256
+
257
+ # ## Current Models Performance
258
+ # """
259
+ # )
260
+
261
+ # current_data = get_current_leaderboard()
262
+
263
+
264
+ # if len(current_data) > 0:
265
+ # best_model = current_data.sort_values("Combined_Score").iloc[0]
266
+ # gr.Markdown(f"""
267
+ # ### πŸ† Current Best Model: **{best_model['Model_Name']}**
268
+ # * WER: **{best_model['WER']*100:.2f}%**
269
+ # * CER: **{best_model['CER']*100:.2f}%**
270
+ # * Combined Score: **{best_model['Combined_Score']*100:.2f}%**
271
+ # """)
272
+
273
+ # with gr.Tabs() as tabs:
274
+ # with gr.TabItem("πŸ… Model Rankings"):
275
+
276
+ # initial_leaderboard = create_leaderboard_table()
277
+
278
+ # ranking_method = gr.Radio(
279
+ # ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
280
+ # label="Ranking Method",
281
+ # value="Combined Score (WER 70%, CER 30%)"
282
+ # )
283
+
284
+ # leaderboard_view = gr.DataFrame(
285
+ # value=initial_leaderboard,
286
+ # interactive=False,
287
+ # label="Models are ranked by selected metric - lower is better"
288
+ # )
289
+
290
+ # ranking_method.change(
291
+ # fn=update_ranking,
292
+ # inputs=[ranking_method],
293
+ # outputs=[leaderboard_view]
294
+ # )
295
+
296
+ # with gr.Accordion("Metrics Explanation", open=False):
297
+ # gr.Markdown(
298
+ # """
299
+ # ## Understanding ASR Metrics
300
+
301
+ # ### Word Error Rate (WER)
302
+ # WER measures how accurately the ASR system recognizes whole words:
303
+ # * Lower values indicate better performance
304
+ # * Calculated as: (Substitutions + Insertions + Deletions) / Total Words
305
+ # * A WER of 0% means perfect transcription
306
+ # * A WER of 20% means approximately 1 in 5 words contains an error
307
+
308
+ # ### Character Error Rate (CER)
309
+ # CER measures accuracy at the character level:
310
+ # * More fine-grained than WER
311
+ # * Better at capturing partial word matches
312
+ # * Particularly useful for agglutinative languages like Bambara
313
+
314
+ # ### Combined Score
315
+ # * Weighted average: 70% WER + 30% CER
316
+ # * Provides a balanced evaluation of model performance
317
+ # * Used as the primary ranking metric
318
+ # """
319
+ # )
320
+
321
+ # with gr.TabItem("πŸ“Š Submit New Results"):
322
+ # gr.Markdown(
323
+ # """
324
+ # ### Submit a new model for evaluation
325
+
326
+ # Upload a CSV file with the following format:
327
+ # * Must contain exactly two columns: 'id' and 'text'
328
+ # * The 'id' column should match the reference dataset IDs
329
+ # * The 'text' column should contain your model's transcriptions
330
+ # """
331
+ # )
332
+
333
+ # with gr.Row():
334
+ # model_name_input = gr.Textbox(
335
+ # label="Model Name",
336
+ # placeholder="e.g., MALIBA-AI/bambara-asr"
337
+ # )
338
+ # gr.Markdown("*Use a descriptive name to identify your model*")
339
+
340
+ # with gr.Row():
341
+ # csv_upload = gr.File(
342
+ # label="Upload CSV File",
343
+ # file_types=[".csv"]
344
+ # )
345
+ # gr.Markdown("*CSV with columns: id, text*")
346
+
347
+ # submit_btn = gr.Button("Submit", variant="primary")
348
+ # output_msg = gr.Textbox(label="Status", interactive=False)
349
+ # leaderboard_display = gr.DataFrame(
350
+ # label="Updated Leaderboard",
351
+ # value=initial_leaderboard,
352
+ # interactive=False
353
+ # )
354
+
355
+ # submit_btn.click(
356
+ # fn=process_submission,
357
+ # inputs=[model_name_input, csv_upload],
358
+ # outputs=[output_msg, leaderboard_display]
359
+ # )
360
+
361
+ # with gr.TabItem("πŸ“ Benchmark Dataset"):
362
+ # gr.Markdown(
363
+ # """
364
+ # ## About the Benchmark Dataset
365
+
366
+ # This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset:
367
+
368
+ # * Contains diverse Bambara speech samples
369
+ # * Includes various speakers, accents, and dialects
370
+ # * Covers different speech styles and recording conditions
371
+ # * Transcribed and validated
372
+
373
+ # ### How to Generate Predictions
374
+
375
+ # To submit results to this leaderboard:
376
+
377
+ # 1. Download the audio files from the benchmark dataset
378
+ # 2. Run your ASR model on the audio files
379
+ # 3. Generate a CSV file with 'id' and 'text' columns
380
+ # 4. Submit your results using the form in the "Submit New Results" tab
381
+
382
+ # ### Evaluation Guidelines
383
+
384
+ # * Text is normalized (lowercase, punctuation removed) before metrics calculation
385
+ # * Extreme outliers are capped to prevent skewing results
386
+ # * All submissions are validated for format and completeness
387
+
388
+ # NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia
389
+ # """
390
+ # )
391
+
392
+ # gr.Markdown(
393
+ # """
394
+ # ---
395
+ # ### About MALIBA-AI
396
+
397
+ # **MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation**
398
+
399
+ # *"No Malian Language Left Behind"*
400
+
401
+ # This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
402
+ # For more information, visit [MALIBA-AI on Hugging Face](https://huggingface.co/MALIBA-AI).
403
+ # """
404
+ # )
405
+
406
+ # if __name__ == "__main__":
407
+ # demo.launch()
408
+
409
+
410
  import gradio as gr
411
  import pandas as pd
412
  from datasets import load_dataset
 
414
  import os
415
  from datetime import datetime
416
  import re
417
+ import plotly.express as px
418
+ import plotly.graph_objects as go
419
  from huggingface_hub import login
420
+ import numpy as np
421
+
422
+ # Custom CSS inspired by Sahara leaderboard
423
+ custom_head_html = """
424
+ <link rel="preconnect" href="https://fonts.googleapis.com">
425
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
426
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Rubik:wght@400;600&display=swap" rel="stylesheet">
427
+ """
428
+
429
+ # Header with MALIBA-AI branding
430
+ new_header_html = """
431
+ <center>
432
+ <br><br>
433
+ <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-bottom: 20px;">
434
+ <div style="font-size: 4em;">πŸ‡²πŸ‡±</div>
435
+ <div>
436
+ <h1 style="margin: 0; font-family: 'Rubik', sans-serif; color: #2f3b7d; font-size: 2.5em; font-weight: 700;">
437
+ Bambara ASR Leaderboard
438
+ </h1>
439
+ <p style="margin: 5px 0 0 0; font-size: 1.2em; color: #7d3561; font-weight: 600;">
440
+ Powered by MALIBA-AI β€’ "No Malian Language Left Behind"
441
+ </p>
442
+ </div>
443
+ <div style="font-size: 4em;">πŸŽ™οΈ</div>
444
+ </div>
445
+ </center>
446
+ """
447
+
448
+ # Advanced CSS styling inspired by Sahara
449
+ sahara_style_css = """
450
+ /* Global Styles */
451
+ div[class*="gradio-container"] {
452
+ background: #FFFBF5 !important;
453
+ color: #000 !important;
454
+ font-family: 'Inter', sans-serif !important;
455
+ }
456
+
457
+ div.svelte-1nguped {
458
+ background: white !important;
459
+ }
460
+
461
+ .fillable.svelte-15jxnnn.svelte-15jxnnn:not(.fill_width) {
462
+ max-width: 1580px !important;
463
+ }
464
+
465
+ /* Navigation Buttons */
466
+ .nav-button {
467
+ background-color: #117b75 !important;
468
+ color: #fff !important;
469
+ font-weight: bold !important;
470
+ border-radius: 8px !important;
471
+ border: none !important;
472
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
473
+ transition: all 0.3s ease !important;
474
+ }
475
+
476
+ .nav-button:hover {
477
+ background-color: #0f6b66 !important;
478
+ color: #e8850e !important;
479
+ transform: translateY(-1px) !important;
480
+ box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
481
+ }
482
+
483
+ /* Content Cards */
484
+ .content-section {
485
+ padding: 40px 0;
486
+ }
487
+
488
+ .content-card {
489
+ background-color: #fff !important;
490
+ border-radius: 16px !important;
491
+ box-shadow: 0 10px 25px -5px rgba(0,0,0,0.1), 0 8px 10px -6px rgba(0,0,0,0.1) !important;
492
+ padding: 40px !important;
493
+ margin-bottom: 30px !important;
494
+ border: 1px solid rgba(0,0,0,0.05) !important;
495
+ }
496
+
497
+ /* Typography */
498
+ .content-card h2 {
499
+ font-family: "Rubik", sans-serif !important;
500
+ font-size: 32px !important;
501
+ font-weight: 700 !important;
502
+ line-height: 1.25 !important;
503
+ letter-spacing: -1px !important;
504
+ color: #2f3b7d !important;
505
+ margin-bottom: 20px !important;
506
+ text-align: center !important;
507
+ }
508
+
509
+ .content-card h3 {
510
+ font-size: 22px !important;
511
+ color: #2f3b7d !important;
512
+ font-weight: 600 !important;
513
+ margin-bottom: 15px !important;
514
+ }
515
+
516
+ .content-card h4 {
517
+ font-family: "Rubik", sans-serif !important;
518
+ color: #7d3561 !important;
519
+ font-weight: 600 !important;
520
+ margin-bottom: 10px !important;
521
+ }
522
+
523
+ .title {
524
+ color: #7d3561 !important;
525
+ font-weight: 600 !important;
526
+ }
527
+
528
+ /* Tab Styling */
529
+ .tab-wrapper.svelte-1tcem6n.svelte-1tcem6n {
530
+ display: flex;
531
+ align-items: center;
532
+ justify-content: space-between;
533
+ position: relative;
534
+ height: auto !important;
535
+ padding-bottom: 0 !important;
536
+ }
537
+
538
+ .selected.svelte-1tcem6n.svelte-1tcem6n {
539
+ background-color: #7d3561 !important;
540
+ color: #fff !important;
541
+ border-radius: 8px 8px 0 0 !important;
542
+ }
543
+
544
+ button.svelte-1tcem6n.svelte-1tcem6n {
545
+ color: #7d3561 !important;
546
+ font-weight: 600 !important;
547
+ font-size: 16px !important;
548
+ padding: 12px 20px !important;
549
+ background-color: #fff !important;
550
+ border-radius: 8px 8px 0 0 !important;
551
+ border: 2px solid #e9ecef !important;
552
+ border-bottom: none !important;
553
+ transition: all 0.3s ease !important;
554
+ }
555
+
556
+ button.svelte-1tcem6n.svelte-1tcem6n:hover {
557
+ background-color: #f8f9fa !important;
558
+ border-color: #7d3561 !important;
559
+ }
560
+
561
+ .tab-container.svelte-1tcem6n.svelte-1tcem6n:after {
562
+ content: "";
563
+ position: absolute;
564
+ bottom: 0;
565
+ left: 0;
566
+ right: 0;
567
+ height: 3px;
568
+ background: linear-gradient(90deg, #7d3561 0%, #2f3b7d 100%) !important;
569
+ }
570
+
571
+ /* Table Styling */
572
+ div[class*="gradio-container"] .prose table {
573
+ color: #000 !important;
574
+ border: 2px solid #dca02a !important;
575
+ border-radius: 12px !important;
576
+ margin-bottom: 20px !important;
577
+ margin-left: auto !important;
578
+ margin-right: auto !important;
579
+ width: 100% !important;
580
+ border-collapse: separate !important;
581
+ border-spacing: 0 !important;
582
+ overflow: hidden !important;
583
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
584
+ }
585
+
586
+ div[class*="gradio-container"] .prose thead tr {
587
+ background: linear-gradient(90deg, #7d3561 0%, #2f3b7d 100%) !important;
588
+ }
589
+
590
+ div[class*="gradio-container"] .prose th {
591
+ color: #fff !important;
592
+ font-weight: 700 !important;
593
+ font-size: 14px !important;
594
+ padding: 15px 10px !important;
595
+ text-align: center !important;
596
+ border: none !important;
597
+ }
598
+
599
+ div[class*="gradio-container"] .prose td {
600
+ font-size: 14px !important;
601
+ padding: 12px 10px !important;
602
+ border: none !important;
603
+ text-align: center !important;
604
+ color: #000 !important;
605
+ border-bottom: 1px solid #f8f9fa !important;
606
+ }
607
+
608
+ div[class*="gradio-container"] .prose tbody tr:nth-child(even) {
609
+ background-color: #f8f9fa !important;
610
+ }
611
+
612
+ div[class*="gradio-container"] .prose tbody tr:hover {
613
+ background-color: #e3f2fd !important;
614
+ transition: background-color 0.2s ease !important;
615
+ }
616
+
617
+ /* First column (model names) styling */
618
+ div[class*="gradio-container"] .prose th:first-child,
619
+ div[class*="gradio-container"] .prose td:first-child {
620
+ text-align: left !important;
621
+ min-width: 250px !important;
622
+ font-weight: 600 !important;
623
+ }
624
+
625
+ /* Performance badges */
626
+ .performance-badge {
627
+ display: inline-block;
628
+ padding: 4px 8px;
629
+ border-radius: 12px;
630
+ font-size: 12px;
631
+ font-weight: 600;
632
+ margin-left: 8px;
633
+ }
634
+
635
+ .badge-excellent {
636
+ background: #d4edda;
637
+ color: #155724;
638
+ }
639
+
640
+ .badge-good {
641
+ background: #fff3cd;
642
+ color: #856404;
643
+ }
644
+
645
+ .badge-fair {
646
+ background: #f8d7da;
647
+ color: #721c24;
648
+ }
649
+
650
+ /* Stats cards */
651
+ .stats-grid {
652
+ display: grid;
653
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
654
+ gap: 20px;
655
+ margin: 20px 0;
656
+ }
657
+
658
+ .stat-card {
659
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
660
+ color: white;
661
+ padding: 20px;
662
+ border-radius: 12px;
663
+ text-align: center;
664
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1);
665
+ }
666
+
667
+ .stat-number {
668
+ font-size: 2em;
669
+ font-weight: 700;
670
+ margin-bottom: 5px;
671
+ }
672
+
673
+ .stat-label {
674
+ font-size: 0.9em;
675
+ opacity: 0.9;
676
+ }
677
+
678
+ /* Form styling */
679
+ .form-section {
680
+ background: #f8f9fa;
681
+ border-radius: 12px;
682
+ padding: 25px;
683
+ margin: 20px 0;
684
+ border-left: 4px solid #7d3561;
685
+ }
686
+
687
+ /* Citation block */
688
+ .citation-block {
689
+ background-color: #FDF6E3 !important;
690
+ border-radius: 12px !important;
691
+ padding: 25px !important;
692
+ border-left: 4px solid #D97706 !important;
693
+ margin: 20px 0 !important;
694
+ }
695
+
696
+ /* Dropdown styling */
697
+ .gradio-dropdown {
698
+ border-radius: 8px !important;
699
+ border: 2px solid #e9ecef !important;
700
+ }
701
+
702
+ .gradio-dropdown:focus {
703
+ border-color: #7d3561 !important;
704
+ box-shadow: 0 0 0 3px rgba(125, 53, 97, 0.1) !important;
705
+ }
706
+
707
+ /* Button styling */
708
+ .gradio-button {
709
+ border-radius: 8px !important;
710
+ font-weight: 600 !important;
711
+ transition: all 0.3s ease !important;
712
+ }
713
+
714
+ .gradio-button.primary {
715
+ background: linear-gradient(135deg, #7d3561 0%, #2f3b7d 100%) !important;
716
+ border: none !important;
717
+ color: white !important;
718
+ }
719
+
720
+ .gradio-button.primary:hover {
721
+ transform: translateY(-2px) !important;
722
+ box-shadow: 0 4px 12px rgba(125, 53, 97, 0.3) !important;
723
+ }
724
+
725
+ /* Responsive design */
726
+ @media (max-width: 768px) {
727
+ .content-card {
728
+ padding: 20px !important;
729
+ margin-bottom: 20px !important;
730
+ }
731
+
732
+ .content-card h2 {
733
+ font-size: 24px !important;
734
+ }
735
+
736
+ .stats-grid {
737
+ grid-template-columns: 1fr !important;
738
+ }
739
+ }
740
+ """
741
 
742
  # Login to Hugging Face Hub (if token is available)
743
  token = os.environ.get("HG_TOKEN")
744
  if token:
745
  login(token)
746
 
747
+ # Load dataset
748
  try:
749
  dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
750
  references = {row["id"]: row["text"] for row in dataset}
 
753
  print(f"Error loading dataset: {str(e)}")
754
  references = {}
755
 
756
+ # Initialize leaderboard
757
  leaderboard_file = "leaderboard.csv"
758
  if not os.path.exists(leaderboard_file):
 
759
  sample_data = [
760
+ ["MALIBA-AI/bambara-whisper-small", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45", "Whisper-based", "Mali", "ASR"],
761
+ ["OpenAI/whisper-base", 0.3264, 0.1094, 0.1922, "2025-03-15 10:30:45", "Foundation", "USA", "ASR"],
762
+ ]
763
  pd.DataFrame(sample_data,
764
+ columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]).to_csv(leaderboard_file, index=False)
765
  print(f"Created new leaderboard file with sample data")
766
  else:
767
  leaderboard_df = pd.read_csv(leaderboard_file)
768
 
769
+ # Add new columns if they don't exist
770
+ required_columns = ["Combined_Score", "Type", "Origin", "Task"]
771
+ for col in required_columns:
772
+ if col not in leaderboard_df.columns:
773
+ if col == "Combined_Score":
774
+ leaderboard_df[col] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
775
+ else:
776
+ default_val = "Unknown" if col != "Task" else "ASR"
777
+ leaderboard_df[col] = default_val
778
+
779
+ leaderboard_df.to_csv(leaderboard_file, index=False)
780
  print(f"Loaded leaderboard with {len(leaderboard_df)} entries")
781
 
782
  def normalize_text(text):
 
839
  avg_wer = sum(item["wer"] for item in results) / len(results)
840
  avg_cer = sum(item["cer"] for item in results) / len(results)
841
 
 
842
  weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
843
  weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
844
 
 
848
  """Convert decimal to percentage with 2 decimal places"""
849
  return f"{value * 100:.2f}%"
850
 
851
+ def get_performance_badge(score):
852
+ """Get performance badge based on score"""
853
+ if score < 0.15:
854
+ return "πŸ† Excellent"
855
+ elif score < 0.30:
856
+ return "πŸ₯‰ Good"
857
+ else:
858
+ return "πŸ“ˆ Fair"
859
+
860
+ def add_medals_to_models(df, score_col="Combined_Score"):
861
+ """Add medals to top-performing models"""
862
+ if df.empty or score_col not in df.columns:
863
+ return df
864
+
865
+ df_copy = df.copy()
866
+
867
+ # Convert score to float for sorting
868
+ df_copy[f"{score_col}_float"] = pd.to_numeric(df_copy[score_col], errors='coerce')
869
+
870
+ # Sort by score (ascending - lower is better for error rates)
871
+ df_copy = df_copy.sort_values(by=f"{score_col}_float", ascending=True, na_position='last').reset_index(drop=True)
872
+
873
+ # Get unique scores for ranking
874
+ valid_scores = df_copy[f"{score_col}_float"].dropna().unique()
875
+ valid_scores.sort()
876
+
877
+ # Assign medals
878
+ medals = ["πŸ†", "πŸ₯ˆ", "πŸ₯‰"]
879
+
880
+ def get_medal(score):
881
+ if pd.isna(score):
882
+ return ""
883
+ rank = np.where(valid_scores == score)[0]
884
+ if len(rank) > 0 and rank[0] < len(medals):
885
+ return medals[rank[0]] + " "
886
+ return ""
887
+
888
+ df_copy["Medal"] = df_copy[f"{score_col}_float"].apply(get_medal)
889
+ df_copy["Model_Name"] = df_copy["Medal"] + df_copy["Model_Name"].astype(str)
890
+
891
+ # Clean up temporary columns
892
+ df_copy = df_copy.drop(columns=[f"{score_col}_float", "Medal"])
893
+
894
+ return df_copy
895
+
896
  def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
897
  """Format leaderboard for display with ranking and percentages"""
898
  if df is None or len(df) == 0:
899
+ return pd.DataFrame(columns=["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"])
900
 
 
901
  display_df = df.copy()
902
 
903
+ # Add medals first
904
+ display_df = add_medals_to_models(display_df, sort_by)
905
+
906
+ # Sort by the specified column
907
+ display_df[f"{sort_by}_float"] = pd.to_numeric(display_df[sort_by], errors='coerce')
908
+ display_df = display_df.sort_values(f"{sort_by}_float", ascending=True, na_position='last')
909
 
910
+ # Add rank
911
  display_df.insert(0, "Rank", range(1, len(display_df) + 1))
912
 
913
+ # Format percentages
914
  for col in ["WER", "CER", "Combined_Score"]:
915
  if col in display_df.columns:
916
+ display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}" if pd.notna(x) else "---")
917
 
918
+ # Add performance badges
919
+ display_df["Performance"] = display_df["Combined_Score"].apply(lambda x: get_performance_badge(x) if pd.notna(x) else "---")
920
+
921
+ # Shorten model names for display
922
+ display_df["Model"] = display_df["Model_Name"].apply(lambda x: x.split("/")[-1] if "/" in str(x) else str(x))
923
+
924
+ # Format date
925
+ if "timestamp" in display_df.columns:
926
+ display_df["Date"] = pd.to_datetime(display_df["timestamp"], errors='coerce').dt.strftime("%Y-%m-%d")
927
+ else:
928
+ display_df["Date"] = "---"
929
+
930
+ # Select and reorder columns
931
+ display_columns = ["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"]
932
+ available_columns = [col for col in display_columns if col in display_df.columns]
933
 
934
+ # Clean up temporary columns
935
+ temp_cols = [col for col in display_df.columns if col.endswith("_float")]
936
+ display_df = display_df.drop(columns=temp_cols, errors='ignore')
937
+
938
+ return display_df[available_columns]
939
+
940
+ def create_performance_chart():
941
+ """Create performance visualization chart"""
942
+ try:
943
+ df = pd.read_csv(leaderboard_file)
944
+ if len(df) == 0:
945
+ return None
946
+
947
+ # Sort by Combined_Score
948
+ df = df.sort_values("Combined_Score")
949
+
950
+ fig = go.Figure()
951
+
952
+ # Add WER bars
953
+ fig.add_trace(go.Bar(
954
+ name="WER",
955
+ x=df["Model_Name"].apply(lambda x: x.split("/")[-1] if "/" in x else x),
956
+ y=df["WER"] * 100,
957
+ marker_color='#ff7f0e',
958
+ hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<extra></extra>'
959
+ ))
960
+
961
+ # Add CER bars
962
+ fig.add_trace(go.Bar(
963
+ name="CER",
964
+ x=df["Model_Name"].apply(lambda x: x.split("/")[-1] if "/" in x else x),
965
+ y=df["CER"] * 100,
966
+ marker_color='#2ca02c',
967
+ hovertemplate='<b>%{x}</b><br>CER: %{y:.2f}%<extra></extra>'
968
+ ))
969
+
970
+ # Add Combined Score line
971
+ fig.add_trace(go.Scatter(
972
+ name="Combined Score",
973
+ x=df["Model_Name"].apply(lambda x: x.split("/")[-1] if "/" in x else x),
974
+ y=df["Combined_Score"] * 100,
975
+ mode='lines+markers',
976
+ line=dict(color='#d62728', width=3),
977
+ marker=dict(size=8),
978
+ hovertemplate='<b>%{x}</b><br>Combined Score: %{y:.2f}%<extra></extra>'
979
+ ))
980
+
981
+ fig.update_layout(
982
+ title={
983
+ 'text': "πŸ“Š Model Performance Comparison",
984
+ 'x': 0.5,
985
+ 'font': {'size': 18, 'family': 'Rubik'}
986
+ },
987
+ xaxis_title="Model",
988
+ yaxis_title="Error Rate (%)",
989
+ hovermode='x unified',
990
+ height=500,
991
+ showlegend=True,
992
+ plot_bgcolor='rgba(0,0,0,0)',
993
+ paper_bgcolor='rgba(0,0,0,0)',
994
+ font=dict(family="Inter", size=12),
995
+ legend=dict(
996
+ orientation="h",
997
+ yanchor="bottom",
998
+ y=1.02,
999
+ xanchor="right",
1000
+ x=1
1001
+ )
1002
+ )
1003
+
1004
+ return fig
1005
+ except Exception as e:
1006
+ print(f"Error creating chart: {str(e)}")
1007
+ return None
1008
+
1009
+ def get_leaderboard_stats():
1010
+ """Get summary statistics for the leaderboard"""
1011
+ try:
1012
+ df = pd.read_csv(leaderboard_file)
1013
+ if len(df) == 0:
1014
+ return """
1015
+ <div class="stats-grid">
1016
+ <div class="stat-card">
1017
+ <div class="stat-number">0</div>
1018
+ <div class="stat-label">Models Submitted</div>
1019
+ </div>
1020
+ </div>
1021
+ """
1022
+
1023
+ best_model = df.loc[df["Combined_Score"].idxmin()]
1024
+ total_models = len(df)
1025
+ avg_wer = df["WER"].mean()
1026
+ avg_cer = df["CER"].mean()
1027
+
1028
+ return f"""
1029
+ <div class="stats-grid">
1030
+ <div class="stat-card">
1031
+ <div class="stat-number">{total_models}</div>
1032
+ <div class="stat-label">Models Evaluated</div>
1033
+ </div>
1034
+ <div class="stat-card">
1035
+ <div class="stat-number">{format_as_percentage(best_model['Combined_Score'])}</div>
1036
+ <div class="stat-label">Best Combined Score</div>
1037
+ </div>
1038
+ <div class="stat-card">
1039
+ <div class="stat-number">{format_as_percentage(avg_wer)}</div>
1040
+ <div class="stat-label">Average WER</div>
1041
+ </div>
1042
+ <div class="stat-card">
1043
+ <div class="stat-number">{format_as_percentage(avg_cer)}</div>
1044
+ <div class="stat-label">Average CER</div>
1045
+ </div>
1046
+ </div>
1047
+
1048
+ <div style="text-align: center; margin-top: 20px;">
1049
+ <h4>πŸ† Current Champion: {best_model['Model_Name']}</h4>
1050
+ </div>
1051
+ """
1052
+ except Exception as e:
1053
+ return f"<p>Error loading stats: {str(e)}</p>"
1054
 
1055
  def update_ranking(method):
1056
  """Update leaderboard ranking based on selected method"""
 
1070
 
1071
  except Exception as e:
1072
  print(f"Error updating ranking: {str(e)}")
1073
+ return pd.DataFrame(columns=["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"])
1074
 
1075
+ def compare_models(model_1_name, model_2_name):
1076
+ """Compare two models performance"""
1077
+ try:
1078
+ df = pd.read_csv(leaderboard_file)
1079
+
1080
+ if model_1_name == model_2_name:
1081
+ return pd.DataFrame([{"Info": "Please select two different models to compare."}])
1082
+
1083
+ model_1 = df[df["Model_Name"] == model_1_name]
1084
+ model_2 = df[df["Model_Name"] == model_2_name]
1085
+
1086
+ if model_1.empty or model_2.empty:
1087
+ return pd.DataFrame([{"Info": "One or both models not found in leaderboard."}])
1088
+
1089
+ m1 = model_1.iloc[0]
1090
+ m2 = model_2.iloc[0]
1091
+
1092
+ comparison_data = {
1093
+ "Metric": ["WER", "CER", "Combined Score"],
1094
+ model_1_name.split("/")[-1]: [
1095
+ f"{m1['WER']*100:.2f}%",
1096
+ f"{m1['CER']*100:.2f}%",
1097
+ f"{m1['Combined_Score']*100:.2f}%"
1098
+ ],
1099
+ model_2_name.split("/")[-1]: [
1100
+ f"{m2['WER']*100:.2f}%",
1101
+ f"{m2['CER']*100:.2f}%",
1102
+ f"{m2['Combined_Score']*100:.2f}%"
1103
+ ],
1104
+ "Difference": [
1105
+ f"{(m1['WER'] - m2['WER'])*100:+.2f}%",
1106
+ f"{(m1['CER'] - m2['CER'])*100:+.2f}%",
1107
+ f"{(m1['Combined_Score'] - m2['Combined_Score'])*100:+.2f}%"
1108
+ ]
1109
+ }
1110
+
1111
+ return pd.DataFrame(comparison_data)
1112
+
1113
+ except Exception as e:
1114
+ return pd.DataFrame([{"Error": f"Error comparing models: {str(e)}"}])
1115
+
1116
+ def process_submission(model_name, csv_file, model_type, origin_country):
1117
+ """Process a new model submission with enhanced metadata"""
1118
  if not model_name or not model_name.strip():
1119
+ return "❌ **Error:** Please provide a model name.", None, None
1120
 
1121
  if not csv_file:
1122
+ return "❌ **Error:** Please upload a CSV file.", None, None
1123
 
1124
  try:
1125
  df = pd.read_csv(csv_file)
1126
 
1127
  if len(df) == 0:
1128
+ return "❌ **Error:** Uploaded CSV is empty.", None, None
1129
 
1130
  if set(df.columns) != {"id", "text"}:
1131
+ return f"❌ **Error:** CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None, None
1132
 
1133
  if df["id"].duplicated().any():
1134
  dup_ids = df[df["id"].duplicated()]["id"].unique()
1135
+ return f"❌ **Error:** Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None, None
1136
 
1137
  missing_ids = set(references.keys()) - set(df["id"])
1138
  extra_ids = set(df["id"]) - set(references.keys())
1139
 
1140
  if missing_ids:
1141
+ return f"❌ **Error:** Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None, None
1142
 
1143
  if extra_ids:
1144
+ return f"❌ **Error:** Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None, None
1145
 
1146
  try:
1147
  avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
1148
 
 
1149
  if avg_wer < 0.001:
1150
+ return "❌ **Error:** WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None, None
1151
 
1152
  except Exception as e:
1153
+ return f"❌ **Error calculating metrics:** {str(e)}", None, None
1154
 
1155
+ # Update leaderboard
1156
  leaderboard = pd.read_csv(leaderboard_file)
1157
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
1158
  combined_score = avg_wer * 0.7 + avg_cer * 0.3
1159
 
1160
  if model_name in leaderboard["Model_Name"].values:
 
1163
  leaderboard.loc[idx, "CER"] = avg_cer
1164
  leaderboard.loc[idx, "Combined_Score"] = combined_score
1165
  leaderboard.loc[idx, "timestamp"] = timestamp
1166
+ leaderboard.loc[idx, "Type"] = model_type
1167
+ leaderboard.loc[idx, "Origin"] = origin_country
1168
  updated_leaderboard = leaderboard
1169
  else:
1170
  new_entry = pd.DataFrame(
1171
+ [[model_name, avg_wer, avg_cer, combined_score, timestamp, model_type, origin_country, "ASR"]],
1172
+ columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]
1173
  )
1174
  updated_leaderboard = pd.concat([leaderboard, new_entry])
1175
 
 
1177
  updated_leaderboard.to_csv(leaderboard_file, index=False)
1178
 
1179
  display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
1180
+ chart = create_performance_chart()
1181
+
1182
+ badge = get_performance_badge(combined_score)
1183
 
1184
+ success_msg = f"""
1185
+ βœ… **Submission processed successfully!**
1186
+
1187
+ **{model_name}** ({model_type} from {origin_country})
1188
+ - **WER:** {format_as_percentage(avg_wer)}
1189
+ - **CER:** {format_as_percentage(avg_cer)}
1190
+ - **Combined Score:** {format_as_percentage(combined_score)}
1191
+ - **Performance:** {badge}
1192
+ """
1193
+
1194
+ return success_msg, display_leaderboard, chart
1195
 
1196
  except Exception as e:
1197
+ return f"❌ **Error processing submission:** {str(e)}", None, None
1198
 
1199
  def get_current_leaderboard():
1200
  """Get the current leaderboard data for display"""
 
1202
  if os.path.exists(leaderboard_file):
1203
  current_leaderboard = pd.read_csv(leaderboard_file)
1204
 
1205
+ # Ensure all required columns exist
1206
+ required_columns = ["Combined_Score", "Type", "Origin", "Task"]
1207
+ for col in required_columns:
1208
+ if col not in current_leaderboard.columns:
1209
+ if col == "Combined_Score":
1210
+ current_leaderboard[col] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
1211
+ else:
1212
+ current_leaderboard[col] = "Unknown" if col != "Task" else "ASR"
1213
+
1214
+ current_leaderboard.to_csv(leaderboard_file, index=False)
1215
  return current_leaderboard
1216
  else:
1217
+ return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"])
1218
  except Exception as e:
1219
  print(f"Error getting leaderboard: {str(e)}")
1220
+ return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"])
1221
 
1222
  def create_leaderboard_table():
1223
  """Create and format the leaderboard table for display"""
1224
  leaderboard_data = get_current_leaderboard()
1225
  return prepare_leaderboard_for_display(leaderboard_data)
1226
 
1227
+ def df_to_html(df):
1228
+ """Convert DataFrame to HTML with custom styling"""
1229
+ if df.empty:
1230
+ return "<p style='text-align: center; color: #666;'>No data available</p>"
1231
 
1232
+ # Convert DataFrame to HTML
1233
+ html = df.to_html(index=False, escape=False, classes="leaderboard-table")
1234
 
1235
+ # Add custom styling
1236
+ html = html.replace('<table class="leaderboard-table"',
1237
+ '<table class="leaderboard-table" style="width: 100%; margin: 0 auto;"')
1238
 
1239
+ return html
 
1240
 
+ # Main Gradio Interface
+ with gr.Blocks(
+ title="🇲🇱 Bambara ASR Leaderboard | MALIBA-AI",
+ css=sahara_style_css,
+ head=custom_head_html,
+ theme=gr.themes.Soft()
+ ) as demo:
+
+ # Header Section
+ gr.HTML(new_header_html)
+
+ # Navigation Buttons
+ with gr.Row():
+ gr.Button("🌐 MALIBA-AI Website", link="https://maliba-ai.org/", elem_classes=['nav-button'])
+ gr.Button("📊 HF Dataset Repo", link="https://huggingface.co/datasets/sudoping01/bambara-speech-recognition-benchmark", elem_classes=['nav-button'])
+ gr.Button("🤗 MALIBA-AI Hub", link="https://huggingface.co/MALIBA-AI", elem_classes=['nav-button'])
+ gr.Button("📚 Documentation", link="https://huggingface.co/spaces/MALIBA-AI/bambara-asr-leaderboard", elem_classes=['nav-button'])
+
+ with gr.Group(elem_classes="content-card"):
+ # Stats display
+ stats_html = gr.HTML(get_leaderboard_stats())
+
+ with gr.Tabs() as tabs:
+ with gr.TabItem("🏅 Main Leaderboard", id="main"):
+ gr.HTML("<h2>Main Leaderboard</h2>")
+
+ initial_leaderboard = create_leaderboard_table()
+
+ with gr.Row():
+ ranking_method = gr.Radio(
+ ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"],
+ label="🔄 Ranking Method",
+ value="Combined Score (WER 70%, CER 30%)",
+ info="Choose how to rank the models"
+ )
+
+ leaderboard_view = gr.DataFrame(
+ value=initial_leaderboard,
+ interactive=False,
+ label="📋 Leaderboard Rankings - Lower scores indicate better performance",
+ wrap=True,
+ height=400
+ )
+
+ # Performance chart
+ gr.Markdown("### 📊 Visual Performance Comparison")
+ performance_chart = gr.Plot(
+ value=create_performance_chart(),
+ label="Model Performance Visualization"
+ )
+
+ ranking_method.change(
+ fn=update_ranking,
+ inputs=[ranking_method],
+ outputs=[leaderboard_view]
+ )
+
+ with gr.Accordion("📖 Understanding ASR Metrics", open=False):
+ gr.Markdown("""
+ ## 🎯 Automatic Speech Recognition Evaluation Metrics
 
  ### Word Error Rate (WER)
+ **WER** measures transcription accuracy at the word level:
+ - **Formula:** `(Substitutions + Insertions + Deletions) / Total Reference Words`
+ - **Range:** 0% (perfect) to 100%+ (very poor)
+ - **Interpretation:**
+ - 0-5%: 🏆 Excellent performance
+ - 5-15%: 🥉 Good performance
+ - 15-30%: 📈 Fair performance
+ - 30%+: Poor performance
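+
+ For example (illustrative numbers): a hypothesis with 2 substitutions, 1 insertion and 1 deletion against a 20-word reference gives WER = (2 + 1 + 1) / 20 = 20%.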
 
  ### Character Error Rate (CER)
+ **CER** measures transcription accuracy at the character level:
+ - **Advantage:** More granular than WER, captures partial matches
+ - **Benefit for Bambara:** Particularly valuable for agglutinative languages
+ - **Typical Range:** Usually lower than WER values
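+
+ For example (illustrative): a single wrong character against a 25-character reference gives CER = 1 / 25 = 4%.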
 
+ ### Combined Score (Primary Ranking Metric)
+ **Formula:** `Combined Score = 0.7 × WER + 0.3 × CER`
+ - **Rationale:** Balanced evaluation emphasizing word-level accuracy
+ - **Usage:** Primary metric for model ranking
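+
+ A quick illustrative calculation (the error rates below are made-up examples, not leaderboard values):
+
+ ```python
+ # Illustrative only: combine example error rates with the leaderboard weights
+ wer, cer = 0.20, 0.08            # 20% WER, 8% CER
+ combined = 0.7 * wer + 0.3 * cer
+ print(f"{combined:.1%}")         # 16.4%
+ ```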
+
+ ### 🎯 Performance Categories
+ - 🏆 **Excellent**: < 15% Combined Score
+ - 🥉 **Good**: 15-30% Combined Score
+ - 📈 **Fair**: > 30% Combined Score
+ """)
+
+ with gr.TabItem("📤 Submit New Model", id="submit"):
+ gr.HTML("<h2>Submit Your Bambara ASR Model</h2>")
+
+ gr.Markdown("""
+ ### 🚀 Ready to benchmark your model? Submit your results and join the leaderboard!
+
+ Follow these steps to submit your Bambara ASR model for evaluation.
+ """)
+
+ with gr.Group(elem_classes="form-section"):
+ with gr.Row():
+ with gr.Column(scale=2):
+ model_name_input = gr.Textbox(
+ label="🤖 Model Name",
+ placeholder="e.g., MALIBA-AI/bambara-whisper-large",
+ info="Use a descriptive name (organization/model format preferred)"
+ )
+
+ model_type = gr.Dropdown(
+ label="🏷️ Model Type",
+ choices=["Whisper-based", "Wav2Vec2", "Foundation", "Custom", "Fine-tuned", "Multilingual", "Other"],
+ value="Custom",
+ info="Select the type/architecture of your model"
+ )
+
+ origin_country = gr.Dropdown(
+ label="🌍 Origin/Institution",
+ choices=["Mali", "Senegal", "Burkina Faso", "Niger", "Guinea", "Ivory Coast", "USA", "France", "Canada", "UK", "Other"],
+ value="Mali",
+ info="Country or region of the developing institution"
+ )
+
+ with gr.Column(scale=1):
+ gr.Markdown("""
+ #### 📋 Submission Requirements
+
+ **CSV Format:**
+ - Columns: `id`, `text`
+ - Match all reference dataset IDs
+ - No duplicate IDs
+ - Text transcriptions in Bambara
+
+ **Data Quality:**
+ - Clean, normalized text
+ - Consistent formatting
+ - Complete coverage of test set
+ """)
+
+ csv_upload = gr.File(
+ label="📁 Upload Predictions CSV",
+ file_types=[".csv"],
+ info="Upload your model's transcriptions in the required CSV format"
  )
 
+ submit_btn = gr.Button("🚀 Submit Model", variant="primary", size="lg", elem_classes=['gradio-button', 'primary'])
+
+ output_msg = gr.Markdown(label="📢 Submission Status")
+
+ with gr.Row():
+ leaderboard_display = gr.DataFrame(
+ label="📊 Updated Leaderboard",
+ value=initial_leaderboard,
+ interactive=False,
+ wrap=True,
+ height=400
+ )
+
+ updated_chart = gr.Plot(
+ label="📈 Updated Performance Chart"
+ )
+
+ submit_btn.click(
+ fn=process_submission,
+ inputs=[model_name_input, csv_upload, model_type, origin_country],
+ outputs=[output_msg, leaderboard_display, updated_chart]
  )
 
+ with gr.TabItem("🔍 Compare Models", id="compare"):
+ gr.HTML("<h2>Compare Two Models</h2>")
+
+ gr.Markdown("### Select two models to compare their performance side-by-side")
+
+ with gr.Row():
+ current_data = get_current_leaderboard()
+ model_names = current_data["Model_Name"].tolist() if not current_data.empty else []
+
+ model_1_dropdown = gr.Dropdown(
+ choices=model_names,
+ label="🤖 Model 1",
+ info="Select the first model for comparison"
+ )
+ model_2_dropdown = gr.Dropdown(
+ choices=model_names,
+ label="🤖 Model 2",
+ info="Select the second model for comparison"
+ )
+
+ compare_btn = gr.Button("⚡ Compare Models", variant="primary", elem_classes=['gradio-button', 'primary'])
+
+ comparison_note = gr.Markdown("""
+ **Note on Comparison Results:**
+ - Positive difference values (🟢) indicate Model 1 performed better
+ - Negative difference values (🔴) indicate Model 2 performed better
+ - Lower error rates indicate better performance
+ """, visible=False)
+
+ comparison_output = gr.DataFrame(
+ label="📊 Model Comparison Results",
+ value=pd.DataFrame([{"Info": "Select two models and click Compare to see the results."}]),
+ interactive=False
+ )
+
+ def update_comparison_table(m1, m2):
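+ """Validate the dropdown selections, then return a visibility update for the note plus the comparison table."""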
+ if not m1 or not m2:
+ return gr.update(visible=False), pd.DataFrame([{"Info": "Please select both models before clicking Compare."}])
+
+ if m1 == m2:
+ return gr.update(visible=False), pd.DataFrame([{"Info": "Please select two different models to compare."}])
+
+ df = compare_models(m1, m2)
+ return gr.update(visible=True), df
+
+ compare_btn.click(
+ fn=update_comparison_table,
+ inputs=[model_1_dropdown, model_2_dropdown],
+ outputs=[comparison_note, comparison_output]
  )
 
+ with gr.TabItem("📊 Dataset & Methodology", id="dataset"):
+ gr.HTML("<h2>Dataset & Methodology</h2>")
 
+ gr.Markdown("""
+ ## 🎯 About the Bambara Speech Recognition Benchmark
 
+ ### 📈 Dataset Overview
 
+ Our benchmark is built on the **`sudoping01/bambara-speech-recognition-benchmark`** dataset, featuring:
 
+ - **🎙️ Diverse Audio Samples:** Various speakers, dialects, and recording conditions
+ - **🗣️ Speaker Variety:** Multiple native Bambara speakers from different regions
+ - **🎵 Acoustic Diversity:** Different recording environments and quality levels
+ - **✅ Quality Assurance:** Manually validated transcriptions
+ - **📚 Content Variety:** Multiple domains and speaking styles
 
+ ### 🔬 Evaluation Methodology
 
+ #### Text Normalization Process
+ 1. **Lowercase conversion** for consistency
+ 2. **Punctuation removal** to focus on linguistic content
+ 3. **Whitespace normalization** for standardized formatting
+ 4. **Unicode normalization** for proper character handling
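+
+ For example, under these steps a hypothetical reference such as `Aw ni ce!  I ka kene wa?` would be scored as `aw ni ce i ka kene wa`.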
 
+ #### Quality Controls
+ - **Outlier Detection:** Extreme error rates are capped to prevent skewing
+ - **Data Validation:** Comprehensive format and completeness checks
+ - **Duplicate Prevention:** Automatic detection of duplicate submissions
+ - **Missing Data Handling:** Identification of incomplete submissions
+
+ ### 🚀 How to Participate
+
+ #### Step 1: Access the Dataset
+ ```python
+ from datasets import load_dataset
+ dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark")
+ ```
+
+ #### Step 2: Generate Predictions
+ - Process the audio files with your ASR model
+ - Generate transcriptions for each audio sample
+ - Ensure your model outputs text in Bambara
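+
+ A minimal sketch of this step, continuing from the Step 1 snippet (illustrative only: `transcribe()` is a placeholder for your own model's inference call, and the `id`/`audio` field names assume the dataset's default schema):
+
+ ```python
+ import pandas as pd
+
+ rows = []
+ for sample in dataset["eval"]:  # adjust the split name if your copy differs
+     rows.append({"id": sample["id"], "text": transcribe(sample["audio"])})
+
+ pd.DataFrame(rows).to_csv("predictions.csv", index=False)
+ ```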
+
+ #### Step 3: Format Results
+ Create a CSV file with exactly these columns:
+ - **`id`**: Sample identifier (must match dataset IDs)
+ - **`text`**: Your model's transcription
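+
+ For illustration, the first few rows could look like this (the IDs and text are made up):
+
+ ```csv
+ id,text
+ sample_0001,aw ni ce
+ sample_0002,ne be taa sugu la
+ ```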
+
+ #### Step 4: Submit & Evaluate
+ - Upload your CSV using the submission form
+ - Your model will be automatically evaluated
+ - Results appear on the leaderboard immediately
+
+ ### 🏆 Recognition & Impact
+
+ **Top-performing models will be:**
+ - Featured prominently on our leaderboard
+ - Highlighted in MALIBA-AI communications
+ - Considered for inclusion in production systems
+ - Invited to present at community events
+
+ ### 🤝 Community Guidelines
+
+ - **Reproducibility:** Please provide model details and methodology
+ - **Fair Play:** No data leakage or unfair advantages
+ - **Collaboration:** Share insights and learnings with the community
+ - **Attribution:** Properly cite the benchmark in publications
+
+ ### 📚 Technical Specifications
+
+ | Aspect | Details |
+ |--------|---------|
+ | **Audio Format** | WAV, various sample rates |
+ | **Language** | Bambara (bam) |
+ | **Evaluation Metrics** | WER, CER, Combined Score |
+ | **Text Encoding** | UTF-8 |
+ | **Submission Format** | CSV with id, text columns |
+ """)
+
+ # Citation and Footer
+ with gr.Group(elem_classes="content-card"):
+ gr.HTML("""
+ <div class="citation-block">
+ <h2>📚 Citation</h2>
+ <p>If you use the Bambara ASR Leaderboard in a scientific publication, or if you find these resources useful, please cite our work:</p>
+ <pre>
+ @misc{bambara_asr_leaderboard_2025,
+ title={Bambara Speech Recognition Leaderboard},
+ author={MALIBA-AI Team},
+ year={2025},
+ url={https://huggingface.co/spaces/MALIBA-AI/bambara-asr-leaderboard},
+ note={A community initiative for advancing Bambara speech recognition technology}
+ }
+ </pre>
+ </div>
+ """)
 
+ gr.HTML("""
+ <div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 2px solid #e9ecef;">
+ <h3 style="color: #7d3561; margin-bottom: 15px;">About MALIBA-AI</h3>
+ <p style="font-size: 16px; line-height: 1.6; max-width: 800px; margin: 0 auto;">
+ <strong>MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation</strong><br>
+ <em>"No Malian Language Left Behind"</em>
+ </p>
+ <p style="margin-top: 15px;">
+ This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
+ For more information, visit <a href="https://maliba-ai.org/" style="color: #7d3561; font-weight: 600;">MALIBA-AI</a> or
+ <a href="https://huggingface.co/MALIBA-AI" style="color: #7d3561; font-weight: 600;">our Hugging Face page</a>.
+ </p>
+ <div style="margin-top: 20px;">
+ <span style="font-size: 2em;">🇲🇱</span>
+ <span style="margin: 0 20px; color: #7d3561; font-weight: 600;">•</span>
+ <span style="font-size: 2em;">🤝</span>
+ <span style="margin: 0 20px; color: #7d3561; font-weight: 600;">•</span>
+ <span style="font-size: 2em;">🚀</span>
+ </div>
+ </div>
+ """)
 
  if __name__ == "__main__":
  demo.launch()