FarmerlineML committed on
Commit
57b796f
·
verified ·
1 Parent(s): 81083e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +232 -425
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py (MP3-robust loader + Robust HF Dataset Appending)
2
 
3
  import os
4
  import json
@@ -13,11 +13,6 @@ from transformers import pipeline
13
  import numpy as np
14
  import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
  import librosa # fallback / resampling
16
- import pandas as pd
17
- import pyarrow.parquet as pq
18
- import pyarrow as pa
19
- from huggingface_hub import HfApi
20
- from typing import Optional, Tuple, Dict, Any
21
 
22
  # Optional: modest thread hints for CPU Spaces
23
  try:
@@ -27,27 +22,95 @@ try:
27
  except Exception:
28
  pass
29
 
30
- # Setup logging with more detail
31
- logging.basicConfig(
32
- level=logging.INFO,
33
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
34
- )
35
- logger = logging.getLogger(__name__)
36
 
37
- # -------- CONFIG: Hub dataset target --------
 
 
 
38
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
39
  HF_TOKEN = os.environ.get("HF_TOKEN")
40
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
41
 
42
- # Initialize HF API client once
43
- hf_api = HfApi() if PUSH_TO_HF else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # --- Map display names to your HF Hub model IDs ---
46
  language_models = {
47
  "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
48
  "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
49
  "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
50
- "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",
 
51
  "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
52
  "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
53
  "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
@@ -65,180 +128,61 @@ language_models = {
65
  "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
66
  "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
67
  "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
 
 
 
68
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
69
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
70
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
71
  "Pidgin": "FarmerlineML/pidgin_nigerian",
72
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
73
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
74
- "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
75
  }
76
 
77
- # -------- Robust Dataset Push Function --------
78
- def _push_row_to_hf_dataset(row: Dict[str, Any], audio_file_path: Optional[str]) -> str:
79
- """
80
- Append a single example to the HF dataset repo using Parquet files.
81
- Each submission creates a new Parquet file to avoid overwrites.
82
- """
83
- if not PUSH_TO_HF:
84
- return "HF push disabled (missing HF_TOKEN or repo)."
85
-
86
- if not hf_api:
87
- return "HF API client not initialized."
88
-
89
- # Create a copy of the row to avoid modifying the original
90
- example = dict(row)
91
-
92
- # Generate unique identifiers for this submission
93
- timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
94
- unique_id = str(uuid.uuid4())[:8]
95
-
96
- # Handle audio file if provided and user consented
97
- audio_uploaded = False
98
- if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly", False):
99
- try:
100
- # Store reference to audio file in the dataset
101
- audio_filename = f"audio_{timestamp}_{unique_id}{os.path.splitext(audio_file_path)[1]}"
102
- example["audio_filename"] = audio_filename
103
-
104
- # Upload audio file separately
105
- logger.info(f"Uploading audio file: {audio_filename}")
106
- hf_api.upload_file(
107
- path_or_fileobj=audio_file_path,
108
- path_in_repo=f"audio/{audio_filename}",
109
- repo_id=HF_DATASET_REPO,
110
- repo_type="dataset",
111
- token=HF_TOKEN,
112
- commit_message=f"Add audio for feedback {timestamp}"
113
- )
114
- audio_uploaded = True
115
- logger.info("Audio file uploaded successfully")
116
- except Exception as e:
117
- logger.error(f"Failed to upload audio: {e}")
118
- example["audio_filename"] = None
119
- else:
120
- example["audio_filename"] = None
121
-
122
- # Normalize data types for Parquet storage
123
- def _safe_cast(value, cast_func, default=None):
124
- """Safely cast a value to a type, returning default on failure."""
125
- try:
126
- return cast_func(value) if value is not None else default
127
- except (ValueError, TypeError):
128
- return default
129
-
130
- # Type normalization
131
- example["latency_ms"] = _safe_cast(example.get("latency_ms"), int)
132
- example["score_out_of_10"] = _safe_cast(example.get("score_out_of_10"), int)
133
- example["sample_rate"] = _safe_cast(example.get("sample_rate"), int)
134
- example["rtf"] = _safe_cast(example.get("rtf"), float)
135
- example["audio_duration_s"] = _safe_cast(example.get("audio_duration_s"), float)
136
- example["share_publicly"] = bool(example.get("share_publicly", False))
137
-
138
- # Ensure all string fields are properly handled
139
- string_fields = ["timestamp", "session_id", "language_display", "model_id",
140
- "model_revision", "source", "decode_params", "transcript_hyp",
141
- "corrected_text"]
142
- for field in string_fields:
143
- if field in example and example[field] is not None:
144
- example[field] = str(example[field])
145
-
146
- # Create DataFrame and save as Parquet
147
- df = pd.DataFrame([example])
148
-
149
- # Generate Parquet filename
150
- parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
151
-
152
- # Create temporary Parquet file
153
- temp_parquet = None
154
- try:
155
- with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
156
- temp_parquet = tmp_file.name
157
- df.to_parquet(temp_parquet, engine='pyarrow', compression='snappy')
158
-
159
- # Upload Parquet file to dataset repo
160
- logger.info(f"Uploading feedback data: {parquet_filename}")
161
- hf_api.upload_file(
162
- path_or_fileobj=temp_parquet,
163
- path_in_repo=f"data/{parquet_filename}",
164
- repo_id=HF_DATASET_REPO,
165
- repo_type="dataset",
166
- token=HF_TOKEN,
167
- commit_message=f"Add feedback row {timestamp}"
168
- )
169
- logger.info("Feedback data uploaded successfully")
170
-
171
- status_msg = f"Successfully pushed to HF Dataset as {parquet_filename}"
172
- if audio_uploaded:
173
- status_msg += " (with audio)"
174
- return status_msg
175
-
176
- except Exception as e:
177
- logger.error(f"Failed to push to HF Dataset: {e}")
178
- return f"Failed to push to HF Dataset: {str(e)}"
179
- finally:
180
- # Clean up temporary file
181
- if temp_parquet and os.path.exists(temp_parquet):
182
- try:
183
- os.remove(temp_parquet)
184
- except Exception as e:
185
- logger.warning(f"Failed to remove temp file: {e}")
186
-
187
  # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
188
  TARGET_SR = 16000
189
 
190
- def _has_ffmpeg() -> bool:
191
- """Check if ffmpeg is available in the system."""
192
  return shutil.which("ffmpeg") is not None
193
 
194
- def _load_with_soundfile(path: str) -> Tuple[np.ndarray, int]:
195
- """Load audio using soundfile (for wav/flac/ogg)."""
196
  data, sr = sf.read(path, always_2d=False)
197
  if isinstance(data, np.ndarray) and data.ndim > 1:
198
  data = data.mean(axis=1)
199
  return data.astype(np.float32), sr
200
 
201
- def _load_with_ffmpeg(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
202
- """Convert audio to mono wav using ffmpeg."""
203
  if not _has_ffmpeg():
204
  raise RuntimeError("ffmpeg not available")
205
-
206
  tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
207
  tmp_wav.close()
208
-
 
 
 
 
 
 
 
209
  try:
210
- cmd = [
211
- "ffmpeg", "-hide_banner", "-loglevel", "error",
212
- "-y", "-i", path,
213
- "-ac", "1", "-ar", str(target_sr),
214
- tmp_wav.name,
215
- ]
216
- subprocess.run(cmd, check=True)
217
- data, sr = sf.read(tmp_wav.name, always_2d=False)
218
-
219
- if isinstance(data, np.ndarray) and data.ndim > 1:
220
- data = data.mean(axis=1)
221
- return data.astype(np.float32), sr
222
- finally:
223
- try:
224
- os.remove(tmp_wav.name)
225
- except Exception:
226
- pass
227
 
228
- def _resample_if_needed(y: np.ndarray, sr: int, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
229
- """Resample audio if needed."""
230
  if sr == target_sr:
231
  return y.astype(np.float32), sr
232
  y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
233
  return y_rs.astype(np.float32), target_sr
234
 
235
- def load_audio_any(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
236
  """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
237
- if not os.path.exists(path):
238
- raise FileNotFoundError(f"Audio file not found: {path}")
239
-
240
  ext = os.path.splitext(path)[1].lower()
241
-
242
  try:
243
  if ext in {".wav", ".flac", ".ogg", ".opus"}:
244
  y, sr = _load_with_soundfile(path)
@@ -248,11 +192,10 @@ def load_audio_any(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, i
248
  else:
249
  # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
250
  y, sr = librosa.load(path, sr=None, mono=True)
251
-
252
  y, sr = _resample_if_needed(y, sr, target_sr)
253
  return y, sr
254
  except Exception as e:
255
- logger.warning(f"Primary load failed for {path} ({e}). Falling back to librosa.")
256
  y, sr = librosa.load(path, sr=target_sr, mono=True)
257
  return y.astype(np.float32), sr
258
 
@@ -261,23 +204,20 @@ _PIPELINE_CACHE = {}
261
  _CACHE_ORDER = [] # usage order
262
  _CACHE_MAX_SIZE = 3 # tune for RAM
263
 
264
- def _touch_cache(key: str):
265
- """Update cache access order."""
266
  if key in _CACHE_ORDER:
267
  _CACHE_ORDER.remove(key)
268
  _CACHE_ORDER.insert(0, key)
269
 
270
  def _evict_if_needed():
271
- """Evict least recently used pipelines if cache is full."""
272
  while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
273
- if _CACHE_ORDER:
274
- oldest = _CACHE_ORDER.pop()
275
- if oldest in _PIPELINE_CACHE:
276
- logger.info(f"Evicting pipeline from cache: {oldest}")
277
- del _PIPELINE_CACHE[oldest]
278
 
279
  def get_asr_pipeline(language_display: str):
280
- """Get or create ASR pipeline for the specified language."""
281
  if language_display not in language_models:
282
  raise ValueError(f"Unknown language selection: {language_display}")
283
 
@@ -286,15 +226,13 @@ def get_asr_pipeline(language_display: str):
286
  return _PIPELINE_CACHE[language_display]
287
 
288
  model_id = language_models[language_display]
289
- logger.info(f"Loading pipeline for '{language_display}' -> {model_id}")
290
-
291
  pipe = pipeline(
292
  task="automatic-speech-recognition",
293
  model=model_id,
294
- device=-1, # CPU on Spaces
295
  chunk_length_s=30
296
  )
297
-
298
  _PIPELINE_CACHE[language_display] = pipe
299
  _touch_cache(language_display)
300
  _evict_if_needed()
@@ -302,7 +240,7 @@ def get_asr_pipeline(language_display: str):
302
 
303
  # -------- Helpers --------
304
  def _model_revision_from_pipeline(pipe) -> str:
305
- """Best-effort capture of revision/hash for reproducibility."""
306
  for attr in ("hub_revision", "revision", "_commit_hash"):
307
  val = getattr(getattr(pipe, "model", None), attr, None)
308
  if val:
@@ -313,7 +251,7 @@ def _model_revision_from_pipeline(pipe) -> str:
313
  return "unknown"
314
 
315
  # -------- Inference --------
316
- def transcribe(audio_path: str, language: str) -> Tuple[str, Optional[Dict[str, Any]]]:
317
  """
318
  Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
319
  then run it through the chosen ASR pipeline.
@@ -321,269 +259,138 @@ def transcribe(audio_path: str, language: str) -> Tuple[str, Optional[Dict[str,
321
  """
322
  if not audio_path:
323
  return "⚠️ Please upload or record an audio clip.", None
324
-
325
- try:
326
- # Load and process audio
327
- speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
328
- duration_s = float(len(speech) / float(sr))
329
-
330
- # Get ASR pipeline
331
- pipe = get_asr_pipeline(language)
332
- decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
333
-
334
- # Run inference
335
- logger.info(f"Running ASR inference for {language} on {duration_s:.2f}s audio")
336
- t0 = time.time()
337
- result = pipe({"sampling_rate": sr, "raw": speech})
338
- latency_ms = int((time.time() - t0) * 1000.0)
339
- hyp_text = result.get("text", "")
340
-
341
- # Calculate real-time factor
342
- rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
343
-
344
- # Prepare metadata
345
- meta = {
346
- "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
347
- "session_id": f"anon-{uuid.uuid4()}",
348
- "language_display": language,
349
- "model_id": language_models.get(language, "unknown"),
350
- "model_revision": _model_revision_from_pipeline(pipe),
351
- "audio_duration_s": duration_s,
352
- "sample_rate": sr,
353
- "source": "upload",
354
- "decode_params": json.dumps(decode_params),
355
- "transcript_hyp": hyp_text,
356
- "latency_ms": latency_ms,
357
- "rtf": rtf,
358
- }
359
-
360
- logger.info(f"Transcription complete. RTF: {rtf:.3f}")
361
- return hyp_text, meta
362
-
363
- except Exception as e:
364
- logger.error(f"Transcription failed: {e}")
365
- return f"❌ Transcription failed: {str(e)}", None
366
-
367
- # -------- Feedback submit --------
368
- def submit_feedback(
369
- meta: Optional[Dict[str, Any]],
370
- corrected_text: str,
371
- score: int,
372
- store_audio: bool,
373
- share_publicly: bool,
374
- audio_file_path: Optional[str]
375
- ) -> Dict[str, Any]:
376
  """
377
- Submit feedback to HF Dataset with improved error handling.
378
  """
379
  if not meta:
380
- return {
381
- "status": "❌ No transcription metadata available. Please transcribe first.",
382
- "success": False
383
- }
384
-
385
- # Prepare row data
386
  row = dict(meta)
387
  row.update({
388
  "corrected_text": (corrected_text or "").strip(),
389
  "score_out_of_10": int(score) if score is not None else None,
390
  "share_publicly": bool(share_publicly),
391
  })
392
-
393
- # Push to HF Dataset
394
  try:
395
  audio_to_push = audio_file_path if store_audio else None
396
  hf_status = _push_row_to_hf_dataset(row, audio_to_push)
397
-
398
- return {
399
- "status": f"✅ {hf_status}",
400
- "success": True,
401
- "latency_ms": row["latency_ms"],
402
- "rtf": f"{row['rtf']:.3f}",
403
- "model_id": row["model_id"],
404
- "model_revision": row["model_revision"],
405
- "language": row["language_display"],
406
- }
407
  except Exception as e:
408
- logger.error(f"Failed to submit feedback: {e}")
409
- return {
410
- "status": f"❌ Failed to submit feedback: {str(e)}",
411
- "success": False
412
- }
413
-
414
- # -------- Gradio UI --------
415
- def create_demo():
416
- """Create the Gradio demo interface."""
417
-
418
- with gr.Blocks(
419
- title="🌐 Multilingual ASR Demo",
420
- theme=gr.themes.Soft()
421
- ) as demo:
422
- gr.Markdown(
423
- """
424
- # 🎙️ Multilingual Speech-to-Text Demo
425
-
426
- Upload an audio file (MP3, WAV, FLAC, M4A, OGG, etc.) or record via your microphone.
427
- Then choose the language/model and hit **Transcribe**.
428
-
429
- ---
430
- """
431
- )
432
-
433
- with gr.Row():
434
- with gr.Column(scale=1):
435
- lang = gr.Dropdown(
436
- choices=list(language_models.keys()),
437
- value=list(language_models.keys())[0],
438
- label="Select Language / Model",
439
- info="Choose the language of your audio"
440
- )
441
-
442
- audio = gr.Audio(
443
- sources=["upload", "microphone"],
444
- type="filepath",
445
- label="Upload or Record Audio",
446
- elem_id="audio-input"
447
- )
448
-
449
- btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
450
-
451
- with gr.Column(scale=1):
452
- output = gr.Textbox(
453
- label="Transcription",
454
- placeholder="Transcription will appear here...",
455
- lines=5
456
- )
457
-
458
- # Status indicators
459
- with gr.Row():
460
- status_box = gr.Textbox(
461
- label="Status",
462
- interactive=False,
463
- placeholder="Ready",
464
- max_lines=1
465
- )
466
-
467
- # Hidden state to carry metadata from transcribe -> feedback
468
- meta_state = gr.State(value=None)
469
-
470
- # Evaluation section
471
- with gr.Accordion("📝 Evaluation & Feedback", open=False):
472
- gr.Markdown(
473
- """
474
- Help us improve! Please provide feedback on the transcription quality.
475
- """
476
- )
477
-
478
- with gr.Row():
479
- corrected_tb = gr.Textbox(
480
- label="Corrected transcript (optional)",
481
- placeholder="If there are errors, type the correct transcription here...",
482
- lines=4,
483
- value=""
484
- )
485
-
486
- with gr.Row():
487
- score_slider = gr.Slider(
488
- minimum=0,
489
- maximum=10,
490
- step=1,
491
- label="Quality Score (0 = terrible, 10 = perfect)",
492
- value=7,
493
- info="Rate the transcription quality"
494
- )
495
-
496
- with gr.Row():
497
- store_audio_cb = gr.Checkbox(
498
- label="Allow storing my audio for research/evaluation",
499
- value=False,
500
- info="Audio will be stored securely and used only for improving the models"
501
- )
502
- share_cb = gr.Checkbox(
503
- label="Allow sharing this example publicly",
504
- value=False,
505
- info="Your example may be used in public datasets or demos"
506
- )
507
-
508
- submit_btn = gr.Button("📤 Submit Feedback", variant="secondary")
509
-
510
- results_json = gr.JSON(
511
- label="Submission Result",
512
- visible=True
513
- )
514
-
515
- # Examples section
516
- with gr.Accordion("📚 Example Usage", open=False):
517
- gr.Markdown(
518
- """
519
- ### Tips for best results:
520
- - Speak clearly and at a normal pace
521
- - Minimize background noise
522
- - Keep recordings under 30 seconds for optimal performance
523
- - Select the correct language before transcribing
524
-
525
- ### Supported formats:
526
- WAV, MP3, FLAC, M4A, OGG, OPUS, and more!
527
- """
528
- )
529
-
530
- # Wire up events
531
- def _transcribe_and_update(audio_path, language):
532
- """Transcribe and update UI components."""
533
- if not audio_path:
534
- return "", None, "", "⚠️ Please provide audio"
535
-
536
- status_box_val = f"🔄 Processing {language}..."
537
- hyp, meta = transcribe(audio_path, language)
538
-
539
- if meta:
540
- status_msg = f"✅ Done! (RTF: {meta['rtf']:.3f})"
541
- # Pre-fill corrected with hypothesis for easy edits
542
- return hyp, meta, hyp, status_msg
543
- else:
544
- return hyp, None, "", "❌ Transcription failed"
545
-
546
- btn.click(
547
- fn=_transcribe_and_update,
548
- inputs=[audio, lang],
549
- outputs=[output, meta_state, corrected_tb, status_box]
550
- )
551
-
552
- submit_btn.click(
553
- fn=submit_feedback,
554
- inputs=[
555
- meta_state,
556
- corrected_tb,
557
- score_slider,
558
- store_audio_cb,
559
- share_cb,
560
- audio
561
- ],
562
- outputs=results_json
563
  )
564
-
565
- # Auto-focus on audio input when page loads
566
- demo.load(
567
- fn=lambda: "Ready",
568
- inputs=[],
569
- outputs=[status_box]
570
  )
571
-
572
- return demo
573
 
574
- # -------- Main --------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  if __name__ == "__main__":
576
- # Log startup info
577
- logger.info(f"Starting ASR Demo")
578
- logger.info(f"HF Dataset Repo: {HF_DATASET_REPO}")
579
- logger.info(f"Push to HF enabled: {PUSH_TO_HF}")
580
- logger.info(f"Available languages: {len(language_models)}")
581
-
582
- # Create and launch demo
583
- demo = create_demo()
584
- demo.queue(max_size=10) # Limit queue size for stability
585
- demo.launch(
586
- server_name="0.0.0.0",
587
- server_port=7860,
588
- share=False # Set to True if you want a public link
589
- )
 
1
+ # app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)
2
 
3
  import os
4
  import json
 
13
  import numpy as np
14
  import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
  import librosa # fallback / resampling
 
 
 
 
 
16
 
17
  # Optional: modest thread hints for CPU Spaces
18
  try:
 
22
  except Exception:
23
  pass
24
 
25
+ # Basic logging so we can verify which model is loaded per inference
26
+ logging.basicConfig(level=logging.INFO)
 
 
 
 
27
 
28
+ # --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
29
+ from datasets import Dataset, Features, Value, Audio, load_dataset
30
+
31
+ # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
32
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
33
  HF_TOKEN = os.environ.get("HF_TOKEN")
34
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
35
 
36
+ HF_FEATURES = Features({
37
+ "timestamp": Value("string"),
38
+ "session_id": Value("string"),
39
+ "language_display": Value("string"),
40
+ "model_id": Value("string"),
41
+ "model_revision": Value("string"),
42
+
43
+ "audio": Audio(sampling_rate=None), # uploaded only if user consents
44
+ "audio_duration_s": Value("float32"),
45
+ "sample_rate": Value("int32"),
46
+ "source": Value("string"),
47
+ "decode_params": Value("string"),
48
+
49
+ "transcript_hyp": Value("string"),
50
+ "corrected_text": Value("string"),
51
+
52
+ "latency_ms": Value("int32"),
53
+ "rtf": Value("float32"),
54
+
55
+ "score_out_of_10": Value("int32"),
56
+ "share_publicly": Value("bool"),
57
+ })
58
+
59
+ def _push_row_to_hf_dataset(row, audio_file_path):
60
+ """
61
+ Append a single example to the HF dataset repo (train split).
62
+ If user didn't consent or no audio path, 'audio' field is None.
63
+ """
64
+ if not PUSH_TO_HF:
65
+ return "HF push disabled (missing HF_TOKEN or repo)."
66
+
67
+ example = dict(row)
68
+
69
+ # Audio: only include if user consented and file exists
70
+ example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
71
+
72
+ # Normalize types
73
+ def _to_int(v):
74
+ try:
75
+ return int(v)
76
+ except Exception:
77
+ return None
78
+ def _to_float(v):
79
+ try:
80
+ return float(v)
81
+ except Exception:
82
+ return None
83
+
84
+ for k in ["latency_ms", "score_out_of_10", "sample_rate"]:
85
+ example[k] = _to_int(example.get(k))
86
+ for k in ["rtf", "audio_duration_s"]:
87
+ example[k] = _to_float(example.get(k))
88
+
89
+ ds = Dataset.from_list([example], features=HF_FEATURES)
90
+
91
+ # Load existing split if present, then append
92
+ try:
93
+ existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
94
+ merged = existing.concatenate(ds)
95
+ except Exception:
96
+ merged = ds
97
+
98
+ merged.push_to_hub(
99
+ HF_DATASET_REPO,
100
+ split="train",
101
+ private=True,
102
+ token=HF_TOKEN,
103
+ commit_message="append feedback row"
104
+ )
105
+ return "Pushed to HF Dataset."
106
 
107
  # --- Map display names to your HF Hub model IDs ---
108
  language_models = {
109
  "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
110
  "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
111
  "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
112
+ "Luganda": "FarmerlineML/w2v-bert-2.0_luganda", # active
113
+ # "Luganda (FKD)": "FarmerlineML/luganda_fkd", # commented out per request
114
  "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
115
  "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
116
  "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
 
128
  "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
129
  "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
130
  "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
131
+ # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
132
+ # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
133
+ # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
134
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
135
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
136
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
137
  "Pidgin": "FarmerlineML/pidgin_nigerian",
138
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
139
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
140
+ "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
141
  }
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
144
  TARGET_SR = 16000
145
 
146
+ def _has_ffmpeg():
 
147
  return shutil.which("ffmpeg") is not None
148
 
149
+ def _load_with_soundfile(path):
 
150
  data, sr = sf.read(path, always_2d=False)
151
  if isinstance(data, np.ndarray) and data.ndim > 1:
152
  data = data.mean(axis=1)
153
  return data.astype(np.float32), sr
154
 
155
+ def _load_with_ffmpeg(path, target_sr=TARGET_SR):
156
+ # Convert to mono 16k wav in a temp file using ffmpeg
157
  if not _has_ffmpeg():
158
  raise RuntimeError("ffmpeg not available")
 
159
  tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
160
  tmp_wav.close()
161
+ cmd = [
162
+ "ffmpeg", "-hide_banner", "-loglevel", "error",
163
+ "-y", "-i", path,
164
+ "-ac", "1", "-ar", str(target_sr),
165
+ tmp_wav.name,
166
+ ]
167
+ subprocess.run(cmd, check=True)
168
+ data, sr = sf.read(tmp_wav.name, always_2d=False)
169
  try:
170
+ os.remove(tmp_wav.name)
171
+ except Exception:
172
+ pass
173
+ if isinstance(data, np.ndarray) and data.ndim > 1:
174
+ data = data.mean(axis=1)
175
+ return data.astype(np.float32), sr
 
 
 
 
 
 
 
 
 
 
 
176
 
177
+ def _resample_if_needed(y, sr, target_sr=TARGET_SR):
 
178
  if sr == target_sr:
179
  return y.astype(np.float32), sr
180
  y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
181
  return y_rs.astype(np.float32), target_sr
182
 
183
+ def load_audio_any(path, target_sr=TARGET_SR):
184
  """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
 
 
 
185
  ext = os.path.splitext(path)[1].lower()
 
186
  try:
187
  if ext in {".wav", ".flac", ".ogg", ".opus"}:
188
  y, sr = _load_with_soundfile(path)
 
192
  else:
193
  # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
194
  y, sr = librosa.load(path, sr=None, mono=True)
 
195
  y, sr = _resample_if_needed(y, sr, target_sr)
196
  return y, sr
197
  except Exception as e:
198
+ logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
199
  y, sr = librosa.load(path, sr=target_sr, mono=True)
200
  return y.astype(np.float32), sr
201
 
 
204
  _CACHE_ORDER = [] # usage order
205
  _CACHE_MAX_SIZE = 3 # tune for RAM
206
 
207
+ def _touch_cache(key):
 
208
  if key in _CACHE_ORDER:
209
  _CACHE_ORDER.remove(key)
210
  _CACHE_ORDER.insert(0, key)
211
 
212
  def _evict_if_needed():
 
213
  while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
214
+ oldest = _CACHE_ORDER.pop()
215
+ try:
216
+ del _PIPELINE_CACHE[oldest]
217
+ except KeyError:
218
+ pass
219
 
220
  def get_asr_pipeline(language_display: str):
 
221
  if language_display not in language_models:
222
  raise ValueError(f"Unknown language selection: {language_display}")
223
 
 
226
  return _PIPELINE_CACHE[language_display]
227
 
228
  model_id = language_models[language_display]
229
+ logging.info(f"[ASR] Loading pipeline for '{language_display}' -> {model_id}")
 
230
  pipe = pipeline(
231
  task="automatic-speech-recognition",
232
  model=model_id,
233
+ device=-1, # CPU on Spaces (explicit)
234
  chunk_length_s=30
235
  )
 
236
  _PIPELINE_CACHE[language_display] = pipe
237
  _touch_cache(language_display)
238
  _evict_if_needed()
 
240
 
241
  # -------- Helpers --------
242
  def _model_revision_from_pipeline(pipe) -> str:
243
+ # Best-effort capture of revision/hash for reproducibility
244
  for attr in ("hub_revision", "revision", "_commit_hash"):
245
  val = getattr(getattr(pipe, "model", None), attr, None)
246
  if val:
 
251
  return "unknown"
252
 
253
  # -------- Inference --------
254
+ def transcribe(audio_path: str, language: str):
255
  """
256
  Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
257
  then run it through the chosen ASR pipeline.
 
259
  """
260
  if not audio_path:
261
  return "⚠️ Please upload or record an audio clip.", None
262
+
263
+ speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
264
+ duration_s = float(len(speech) / float(sr))
265
+
266
+ pipe = get_asr_pipeline(language)
267
+ decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
268
+
269
+ t0 = time.time()
270
+ result = pipe({"sampling_rate": sr, "raw": speech})
271
+ latency_ms = int((time.time() - t0) * 1000.0)
272
+ hyp_text = result.get("text", "")
273
+
274
+ rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
275
+
276
+ meta = {
277
+ "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
278
+ "session_id": f"anon-{uuid.uuid4()}",
279
+ "language_display": language,
280
+ "model_id": language_models.get(language, "unknown"),
281
+ "model_revision": _model_revision_from_pipeline(pipe),
282
+ "audio_duration_s": duration_s,
283
+ "sample_rate": sr,
284
+ "source": "upload",
285
+ "decode_params": json.dumps(decode_params),
286
+ "transcript_hyp": hyp_text,
287
+ "latency_ms": latency_ms,
288
+ "rtf": rtf,
289
+ }
290
+ return hyp_text, meta
291
+
292
+ # -------- Feedback submit (minimal) --------
293
+ def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  """
295
+ Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
296
  """
297
  if not meta:
298
+ return {"status": "No transcription metadata available. Please transcribe first."}
299
+
 
 
 
 
300
  row = dict(meta)
301
  row.update({
302
  "corrected_text": (corrected_text or "").strip(),
303
  "score_out_of_10": int(score) if score is not None else None,
304
  "share_publicly": bool(share_publicly),
305
  })
306
+
 
307
  try:
308
  audio_to_push = audio_file_path if store_audio else None
309
  hf_status = _push_row_to_hf_dataset(row, audio_to_push)
310
+ status = f"Feedback saved. {hf_status}"
 
 
 
 
 
 
 
 
 
311
  except Exception as e:
312
+ status = f"Failed to push to HF Dataset: {e}"
313
+
314
+ return {
315
+ "status": status,
316
+ "latency_ms": row["latency_ms"],
317
+ "rtf": row["rtf"],
318
+ "model_id": row["model_id"],
319
+ "model_revision": row["model_revision"],
320
+ "language": row["language_display"],
321
+ }
322
+
323
+ # -------- UI (original preserved; additions appended) --------
324
+ with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
325
+ gr.Markdown(
326
+ """
327
+ ## 🎙️ Multilingual Speech-to-Text
328
+ Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
329
+ Then choose the language/model and hit **Transcribe**.
330
+ """
331
+ )
332
+
333
+ with gr.Row():
334
+ lang = gr.Dropdown(
335
+ choices=list(language_models.keys()),
336
+ value=list(language_models.keys())[0],
337
+ label="Select Language / Model"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  )
339
+
340
+ with gr.Row():
341
+ audio = gr.Audio(
342
+ sources=["upload", "microphone"],
343
+ type="filepath",
344
+ label="Upload or Record Audio"
345
  )
 
 
346
 
347
+ btn = gr.Button("Transcribe")
348
+ output = gr.Textbox(label="Transcription")
349
+
350
+ # Hidden state to carry metadata from transcribe -> feedback
351
+ meta_state = gr.State(value=None)
352
+
353
+ # Keep original behavior: output shows transcript
354
+ # Also capture meta into the hidden state
355
+ def _transcribe_and_store(audio_path, language):
356
+ hyp, meta = transcribe(audio_path, language)
357
+ # Pre-fill corrected with hypothesis for easy edits
358
+ return hyp, meta, hyp
359
+
360
+ # --- Minimal Evaluation (score + optional corrected text) ---
361
+ with gr.Accordion("Evaluation", open=False):
362
+ with gr.Row():
363
+ corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
364
+ with gr.Row():
365
+ score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
366
+ with gr.Row():
367
+ store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
368
+ share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)
369
+
370
+ submit_btn = gr.Button("Submit")
371
+ results_json = gr.JSON(label="Status")
372
+
373
+ # Wire events
374
+ btn.click(
375
+ fn=_transcribe_and_store,
376
+ inputs=[audio, lang],
377
+ outputs=[output, meta_state, corrected_tb]
378
+ )
379
+
380
+ submit_btn.click(
381
+ fn=submit_feedback,
382
+ inputs=[
383
+ meta_state,
384
+ corrected_tb,
385
+ score_slider,
386
+ store_audio_cb,
387
+ share_cb,
388
+ audio # raw file path from gr.Audio
389
+ ],
390
+ outputs=results_json
391
+ )
392
+
393
+ # Keep Spaces stable under load
394
  if __name__ == "__main__":
395
+ demo.queue()
396
+ demo.launch()