FarmerlineML committed on
Commit 81083e5 · verified · 1 Parent(s): f4f8ccf

Update app.py

Files changed (1)
  1. app.py +425 -232
app.py CHANGED
@@ -1,4 +1,4 @@
- # app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)

  import os
  import json
@@ -13,6 +13,11 @@ from transformers import pipeline
  import numpy as np
  import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
  import librosa          # fallback / resampling

  # Optional: modest thread hints for CPU Spaces
  try:
@@ -22,95 +27,27 @@ try:
  except Exception:
      pass

- # Basic logging so we can verify which model is loaded per inference
- logging.basicConfig(level=logging.INFO)

- # --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
- from datasets import Dataset, Features, Value, Audio, load_dataset, concatenate_datasets
-
- # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
  HF_TOKEN = os.environ.get("HF_TOKEN")
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)

- HF_FEATURES = Features({
-     "timestamp": Value("string"),
-     "session_id": Value("string"),
-     "language_display": Value("string"),
-     "model_id": Value("string"),
-     "model_revision": Value("string"),
-
-     "audio": Audio(sampling_rate=None),  # uploaded only if user consents
-     "audio_duration_s": Value("float32"),
-     "sample_rate": Value("int32"),
-     "source": Value("string"),
-     "decode_params": Value("string"),
-
-     "transcript_hyp": Value("string"),
-     "corrected_text": Value("string"),
-
-     "latency_ms": Value("int32"),
-     "rtf": Value("float32"),
-
-     "score_out_of_10": Value("int32"),
-     "share_publicly": Value("bool"),
- })
-
- def _push_row_to_hf_dataset(row, audio_file_path):
-     """
-     Append a single example to the HF dataset repo (train split).
-     If the user didn't consent or there is no audio path, the 'audio' field is None.
-     """
-     if not PUSH_TO_HF:
-         return "HF push disabled (missing HF_TOKEN or repo)."
-
-     example = dict(row)
-
-     # Audio: only include if user consented and file exists
-     example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
-
-     # Normalize types
-     def _to_int(v):
-         try:
-             return int(v)
-         except Exception:
-             return None
-     def _to_float(v):
-         try:
-             return float(v)
-         except Exception:
-             return None
-
-     for k in ["latency_ms", "score_out_of_10", "sample_rate"]:
-         example[k] = _to_int(example.get(k))
-     for k in ["rtf", "audio_duration_s"]:
-         example[k] = _to_float(example.get(k))
-
-     ds = Dataset.from_list([example], features=HF_FEATURES)
-
-     # Load existing split if present, then append
-     try:
-         existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
-         merged = concatenate_datasets([existing, ds])  # Dataset objects have no .concatenate() method
-     except Exception:
-         merged = ds
-
-     merged.push_to_hub(
-         HF_DATASET_REPO,
-         split="train",
-         private=True,
-         token=HF_TOKEN,
-         commit_message="append feedback row"
-     )
-     return "Pushed to HF Dataset."

  # --- Map display names to your HF Hub model IDs ---
  language_models = {
      "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
      "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
      "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
-     "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",  # active
-     # "Luganda (FKD)": "FarmerlineML/luganda_fkd",   # commented out per request
      "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
      "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
      "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
@@ -128,61 +65,180 @@ language_models = {
      "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
      "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
      "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
-     # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
-     # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
-     # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
      "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
      "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
      "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
      "Pidgin": "FarmerlineML/pidgin_nigerian",
      "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
      "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
-     "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
  }
  # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
  TARGET_SR = 16000

- def _has_ffmpeg():
      return shutil.which("ffmpeg") is not None

- def _load_with_soundfile(path):
      data, sr = sf.read(path, always_2d=False)
      if isinstance(data, np.ndarray) and data.ndim > 1:
          data = data.mean(axis=1)
      return data.astype(np.float32), sr

- def _load_with_ffmpeg(path, target_sr=TARGET_SR):
-     # Convert to mono 16k wav in a temp file using ffmpeg
      if not _has_ffmpeg():
          raise RuntimeError("ffmpeg not available")
      tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
      tmp_wav.close()
-     cmd = [
-         "ffmpeg", "-hide_banner", "-loglevel", "error",
-         "-y", "-i", path,
-         "-ac", "1", "-ar", str(target_sr),
-         tmp_wav.name,
-     ]
-     subprocess.run(cmd, check=True)
-     data, sr = sf.read(tmp_wav.name, always_2d=False)
      try:
-         os.remove(tmp_wav.name)
-     except Exception:
-         pass
-     if isinstance(data, np.ndarray) and data.ndim > 1:
-         data = data.mean(axis=1)
-     return data.astype(np.float32), sr

- def _resample_if_needed(y, sr, target_sr=TARGET_SR):
      if sr == target_sr:
          return y.astype(np.float32), sr
      y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
      return y_rs.astype(np.float32), target_sr

- def load_audio_any(path, target_sr=TARGET_SR):
      """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
      ext = os.path.splitext(path)[1].lower()
      try:
          if ext in {".wav", ".flac", ".ogg", ".opus"}:
              y, sr = _load_with_soundfile(path)
@@ -192,10 +248,11 @@ def load_audio_any(path, target_sr=TARGET_SR):
          else:
              # Fall back to librosa for formats like mp3/m4a when ffmpeg isn't present
              y, sr = librosa.load(path, sr=None, mono=True)
          y, sr = _resample_if_needed(y, sr, target_sr)
          return y, sr
      except Exception as e:
-         logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
          y, sr = librosa.load(path, sr=target_sr, mono=True)
          return y.astype(np.float32), sr

@@ -204,20 +261,23 @@ _PIPELINE_CACHE = {}
  _PIPELINE_CACHE = {}
  _CACHE_ORDER = []     # usage order
  _CACHE_MAX_SIZE = 3   # tune for RAM

- def _touch_cache(key):
      if key in _CACHE_ORDER:
          _CACHE_ORDER.remove(key)
      _CACHE_ORDER.insert(0, key)

  def _evict_if_needed():
      while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
-         oldest = _CACHE_ORDER.pop()
-         try:
-             del _PIPELINE_CACHE[oldest]
-         except KeyError:
-             pass

  def get_asr_pipeline(language_display: str):
      if language_display not in language_models:
          raise ValueError(f"Unknown language selection: {language_display}")

@@ -226,13 +286,15 @@ def get_asr_pipeline(language_display: str):
          return _PIPELINE_CACHE[language_display]

      model_id = language_models[language_display]
-     logging.info(f"[ASR] Loading pipeline for '{language_display}' -> {model_id}")
      pipe = pipeline(
          task="automatic-speech-recognition",
          model=model_id,
-         device=-1,  # CPU on Spaces (explicit)
          chunk_length_s=30
      )
      _PIPELINE_CACHE[language_display] = pipe
      _touch_cache(language_display)
      _evict_if_needed()
@@ -240,7 +302,7 @@ def get_asr_pipeline(language_display: str):

  # -------- Helpers --------
  def _model_revision_from_pipeline(pipe) -> str:
-     # Best-effort capture of revision/hash for reproducibility
      for attr in ("hub_revision", "revision", "_commit_hash"):
          val = getattr(getattr(pipe, "model", None), attr, None)
          if val:
@@ -251,7 +313,7 @@ def _model_revision_from_pipeline(pipe) -> str:
      return "unknown"

  # -------- Inference --------
- def transcribe(audio_path: str, language: str):
      """
      Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
      then run it through the chosen ASR pipeline.
@@ -259,138 +321,269 @@ def transcribe(audio_path: str, language: str):
      """
      if not audio_path:
          return "⚠️ Please upload or record an audio clip.", None
-
-     speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
-     duration_s = float(len(speech) / float(sr))
-
-     pipe = get_asr_pipeline(language)
-     decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
-
-     t0 = time.time()
-     result = pipe({"sampling_rate": sr, "raw": speech})
-     latency_ms = int((time.time() - t0) * 1000.0)
-     hyp_text = result.get("text", "")
-
-     rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
-
-     meta = {
-         "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
-         "session_id": f"anon-{uuid.uuid4()}",
-         "language_display": language,
-         "model_id": language_models.get(language, "unknown"),
-         "model_revision": _model_revision_from_pipeline(pipe),
-         "audio_duration_s": duration_s,
-         "sample_rate": sr,
-         "source": "upload",
-         "decode_params": json.dumps(decode_params),
-         "transcript_hyp": hyp_text,
-         "latency_ms": latency_ms,
-         "rtf": rtf,
-     }
-     return hyp_text, meta
-
- # -------- Feedback submit (minimal) --------
- def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
      """
-     Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
      """
      if not meta:
-         return {"status": "No transcription metadata available. Please transcribe first."}
-
      row = dict(meta)
      row.update({
          "corrected_text": (corrected_text or "").strip(),
          "score_out_of_10": int(score) if score is not None else None,
          "share_publicly": bool(share_publicly),
      })
-
      try:
          audio_to_push = audio_file_path if store_audio else None
          hf_status = _push_row_to_hf_dataset(row, audio_to_push)
-         status = f"Feedback saved. {hf_status}"
      except Exception as e:
-         status = f"Failed to push to HF Dataset: {e}"
-
-     return {
-         "status": status,
-         "latency_ms": row["latency_ms"],
-         "rtf": row["rtf"],
-         "model_id": row["model_id"],
-         "model_revision": row["model_revision"],
-         "language": row["language_display"],
-     }
-
- # -------- UI (original preserved; additions appended) --------
- with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
-     gr.Markdown(
-         """
-         ## 🎙️ Multilingual Speech-to-Text
-         Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
-         Then choose the language/model and hit **Transcribe**.
-         """
-     )
-
-     with gr.Row():
-         lang = gr.Dropdown(
-             choices=list(language_models.keys()),
-             value=list(language_models.keys())[0],
-             label="Select Language / Model"
-         )
-
-     with gr.Row():
-         audio = gr.Audio(
-             sources=["upload", "microphone"],
-             type="filepath",
-             label="Upload or Record Audio"
          )
-
-     btn = gr.Button("Transcribe")
-     output = gr.Textbox(label="Transcription")
-
-     # Hidden state to carry metadata from transcribe -> feedback
-     meta_state = gr.State(value=None)
-
-     # Keep original behavior: output shows transcript
-     # Also capture meta into the hidden state
-     def _transcribe_and_store(audio_path, language):
-         hyp, meta = transcribe(audio_path, language)
-         # Pre-fill corrected with hypothesis for easy edits
-         return hyp, meta, hyp
-
-     # --- Minimal Evaluation (score + optional corrected text) ---
-     with gr.Accordion("Evaluation", open=False):
-         with gr.Row():
-             corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
          with gr.Row():
-             score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
-         with gr.Row():
-             store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
-             share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)
-
-         submit_btn = gr.Button("Submit")
-         results_json = gr.JSON(label="Status")
-
-     # Wire events
-     btn.click(
-         fn=_transcribe_and_store,
-         inputs=[audio, lang],
-         outputs=[output, meta_state, corrected_tb]
-     )
-
-     submit_btn.click(
-         fn=submit_feedback,
-         inputs=[
-             meta_state,
-             corrected_tb,
-             score_slider,
-             store_audio_cb,
-             share_cb,
-             audio   # raw file path from gr.Audio
-         ],
-         outputs=results_json
-     )

- # Keep Spaces stable under load
  if __name__ == "__main__":
-     demo.queue()
-     demo.launch()

+ # app.py (MP3-robust loader + Robust HF Dataset Appending)

  import os
  import json

  import numpy as np
  import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
  import librosa          # fallback / resampling
+ import pandas as pd
+ import pyarrow.parquet as pq
+ import pyarrow as pa
+ from huggingface_hub import HfApi
+ from typing import Optional, Tuple, Dict, Any

  # Optional: modest thread hints for CPU Spaces
  try:

  except Exception:
      pass

+ # Setup logging with more detail
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)

+ # -------- CONFIG: Hub dataset target --------
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
  HF_TOKEN = os.environ.get("HF_TOKEN")
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)

+ # Initialize HF API client once
+ hf_api = HfApi() if PUSH_TO_HF else None

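Note that the push path is gated entirely by these two environment variables: without an HF_TOKEN secret, PUSH_TO_HF is False, hf_api stays None, and submissions degrade to a "push disabled" message rather than an error. A minimal sketch of that gate with hypothetical values (the local names below are illustrative, not from the app):

    import os
    token = os.environ.get("HF_TOKEN")              # None when no Space secret is configured
    push_enabled = bool(token and "DarliAI/asr-feedback-logs")
    print(push_enabled)                             # False -> app still runs, uploads are skipped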
  # --- Map display names to your HF Hub model IDs ---
  language_models = {
      "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
      "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
      "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
+     "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",
      "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
      "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
      "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",

      "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
      "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
      "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
      "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
      "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
      "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
      "Pidgin": "FarmerlineML/pidgin_nigerian",
      "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
      "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
+     "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
  }

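Since the model map is a plain dict keyed by display name, wiring in another language is a one-line change; a hypothetical entry (the repo id below is a placeholder, not a real checkpoint):

    language_models["Hausa"] = "your-org/w2v-bert-2.0_hausa"  # placeholder repo id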
+ # -------- Robust Dataset Push Function --------
+ def _push_row_to_hf_dataset(row: Dict[str, Any], audio_file_path: Optional[str]) -> str:
+     """
+     Append a single example to the HF dataset repo using Parquet files.
+     Each submission creates a new Parquet file to avoid overwrites.
+     """
+     if not PUSH_TO_HF:
+         return "HF push disabled (missing HF_TOKEN or repo)."
+
+     if not hf_api:
+         return "HF API client not initialized."
+
+     # Create a copy of the row to avoid modifying the original
+     example = dict(row)
+
+     # Generate unique identifiers for this submission
+     timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
+     unique_id = str(uuid.uuid4())[:8]
+
+     # Upload the audio only if the file exists AND the user consented to public sharing
+     audio_uploaded = False
+     if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly", False):
+         try:
+             # Store reference to audio file in the dataset
+             audio_filename = f"audio_{timestamp}_{unique_id}{os.path.splitext(audio_file_path)[1]}"
+             example["audio_filename"] = audio_filename
+
+             # Upload audio file separately
+             logger.info(f"Uploading audio file: {audio_filename}")
+             hf_api.upload_file(
+                 path_or_fileobj=audio_file_path,
+                 path_in_repo=f"audio/{audio_filename}",
+                 repo_id=HF_DATASET_REPO,
+                 repo_type="dataset",
+                 token=HF_TOKEN,
+                 commit_message=f"Add audio for feedback {timestamp}"
+             )
+             audio_uploaded = True
+             logger.info("Audio file uploaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to upload audio: {e}")
+             example["audio_filename"] = None
+     else:
+         example["audio_filename"] = None
+
+     # Normalize data types for Parquet storage
+     def _safe_cast(value, cast_func, default=None):
+         """Safely cast a value to a type, returning default on failure."""
+         try:
+             return cast_func(value) if value is not None else default
+         except (ValueError, TypeError):
+             return default
+
+     # Type normalization
+     example["latency_ms"] = _safe_cast(example.get("latency_ms"), int)
+     example["score_out_of_10"] = _safe_cast(example.get("score_out_of_10"), int)
+     example["sample_rate"] = _safe_cast(example.get("sample_rate"), int)
+     example["rtf"] = _safe_cast(example.get("rtf"), float)
+     example["audio_duration_s"] = _safe_cast(example.get("audio_duration_s"), float)
+     example["share_publicly"] = bool(example.get("share_publicly", False))
+
+     # Ensure all string fields are properly handled
+     string_fields = ["timestamp", "session_id", "language_display", "model_id",
+                      "model_revision", "source", "decode_params", "transcript_hyp",
+                      "corrected_text"]
+     for field in string_fields:
+         if field in example and example[field] is not None:
+             example[field] = str(example[field])
+
+     # Create DataFrame and save as Parquet
+     df = pd.DataFrame([example])
+
+     # Generate Parquet filename
+     parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
+
+     # Create temporary Parquet file
+     temp_parquet = None
+     try:
+         with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
+             temp_parquet = tmp_file.name
+         df.to_parquet(temp_parquet, engine='pyarrow', compression='snappy')
+
+         # Upload Parquet file to dataset repo
+         logger.info(f"Uploading feedback data: {parquet_filename}")
+         hf_api.upload_file(
+             path_or_fileobj=temp_parquet,
+             path_in_repo=f"data/{parquet_filename}",
+             repo_id=HF_DATASET_REPO,
+             repo_type="dataset",
+             token=HF_TOKEN,
+             commit_message=f"Add feedback row {timestamp}"
+         )
+         logger.info("Feedback data uploaded successfully")
+
+         status_msg = f"Successfully pushed to HF Dataset as {parquet_filename}"
+         if audio_uploaded:
+             status_msg += " (with audio)"
+         return status_msg
+
+     except Exception as e:
+         logger.error(f"Failed to push to HF Dataset: {e}")
+         return f"Failed to push to HF Dataset: {str(e)}"
+     finally:
+         # Clean up temporary file
+         if temp_parquet and os.path.exists(temp_parquet):
+             try:
+                 os.remove(temp_parquet)
+             except Exception as e:
+                 logger.warning(f"Failed to remove temp file: {e}")

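Because each submission lands as an independent Parquet shard under data/, a reader can reassemble the whole log with a glob instead of a single canonical split; this append-by-new-file layout is what removes the read-modify-write race the old load/concatenate/push cycle had. A sketch assuming the datasets library (the token keyword exists on recent versions; older ones use use_auth_token, and the placeholder token below is not real):

    from datasets import load_dataset

    # Reassemble all per-submission shards into one split.
    feedback = load_dataset(
        "DarliAI/asr-feedback-logs",    # HF_DATASET_REPO above
        data_files="data/*.parquet",
        split="train",
        token="hf_xxx",                 # placeholder; required for a private repo
    )
    print(feedback.num_rows)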
  # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
  TARGET_SR = 16000

+ def _has_ffmpeg() -> bool:
+     """Check if ffmpeg is available on the system."""
      return shutil.which("ffmpeg") is not None

+ def _load_with_soundfile(path: str) -> Tuple[np.ndarray, int]:
+     """Load audio using soundfile (for wav/flac/ogg)."""
      data, sr = sf.read(path, always_2d=False)
      if isinstance(data, np.ndarray) and data.ndim > 1:
          data = data.mean(axis=1)
      return data.astype(np.float32), sr

+ def _load_with_ffmpeg(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
+     """Convert audio to mono wav using ffmpeg."""
      if not _has_ffmpeg():
          raise RuntimeError("ffmpeg not available")
+
      tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
      tmp_wav.close()
+
      try:
+         cmd = [
+             "ffmpeg", "-hide_banner", "-loglevel", "error",
+             "-y", "-i", path,
+             "-ac", "1", "-ar", str(target_sr),
+             tmp_wav.name,
+         ]
+         subprocess.run(cmd, check=True)
+         data, sr = sf.read(tmp_wav.name, always_2d=False)
+
+         if isinstance(data, np.ndarray) and data.ndim > 1:
+             data = data.mean(axis=1)
+         return data.astype(np.float32), sr
+     finally:
+         try:
+             os.remove(tmp_wav.name)
+         except Exception:
+             pass

+ def _resample_if_needed(y: np.ndarray, sr: int, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
+     """Resample audio if needed."""
      if sr == target_sr:
          return y.astype(np.float32), sr
      y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
      return y_rs.astype(np.float32), target_sr

+ def load_audio_any(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
      """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
+     if not os.path.exists(path):
+         raise FileNotFoundError(f"Audio file not found: {path}")
+
      ext = os.path.splitext(path)[1].lower()
+
      try:
          if ext in {".wav", ".flac", ".ogg", ".opus"}:
              y, sr = _load_with_soundfile(path)

          else:
              # Fall back to librosa for formats like mp3/m4a when ffmpeg isn't present
              y, sr = librosa.load(path, sr=None, mono=True)
+
          y, sr = _resample_if_needed(y, sr, target_sr)
          return y, sr
      except Exception as e:
+         logger.warning(f"Primary load failed for {path} ({e}). Falling back to librosa.")
          y, sr = librosa.load(path, sr=target_sr, mono=True)
          return y.astype(np.float32), sr
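Whatever container format hits the loader, the contract downstream code relies on is the same: mono float32 at TARGET_SR. A quick sketch with a hypothetical local file:

    # Hypothetical file; wav/flac/ogg/mp3/m4a should all land here the same way.
    y, sr = load_audio_any("example_clip.mp3")
    assert sr == TARGET_SR
    assert y.dtype == np.float32 and y.ndim == 1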
 
 
  _PIPELINE_CACHE = {}
  _CACHE_ORDER = []     # usage order
  _CACHE_MAX_SIZE = 3   # tune for RAM

+ def _touch_cache(key: str):
+     """Update cache access order."""
      if key in _CACHE_ORDER:
          _CACHE_ORDER.remove(key)
      _CACHE_ORDER.insert(0, key)

  def _evict_if_needed():
+     """Evict least recently used pipelines if cache is full."""
      while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
+         if _CACHE_ORDER:
+             oldest = _CACHE_ORDER.pop()
+             if oldest in _PIPELINE_CACHE:
+                 logger.info(f"Evicting pipeline from cache: {oldest}")
+                 del _PIPELINE_CACHE[oldest]
+         else:
+             # Guard against an inconsistent cache: without this break the loop would spin forever
+             break

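With _CACHE_MAX_SIZE = 3, requesting a fourth distinct model pushes out the least recently used one (the tail of _CACHE_ORDER). An illustrative sketch; each call downloads a real checkpoint, so this is only practical with enough RAM:

    for name in ["Akan (Asante Twi)", "Ewe", "Kiswahili", "Fante"]:
        get_asr_pipeline(name)                      # the 4th load evicts "Akan (Asante Twi)"
    assert len(_PIPELINE_CACHE) <= _CACHE_MAX_SIZE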
  def get_asr_pipeline(language_display: str):
+     """Get or create ASR pipeline for the specified language."""
      if language_display not in language_models:
          raise ValueError(f"Unknown language selection: {language_display}")

          return _PIPELINE_CACHE[language_display]

      model_id = language_models[language_display]
+     logger.info(f"Loading pipeline for '{language_display}' -> {model_id}")
+
      pipe = pipeline(
          task="automatic-speech-recognition",
          model=model_id,
+         device=-1,  # CPU on Spaces
          chunk_length_s=30
      )
+
      _PIPELINE_CACHE[language_display] = pipe
      _touch_cache(language_display)
      _evict_if_needed()

  # -------- Helpers --------
  def _model_revision_from_pipeline(pipe) -> str:
+     """Best-effort capture of revision/hash for reproducibility."""
      for attr in ("hub_revision", "revision", "_commit_hash"):
          val = getattr(getattr(pipe, "model", None), attr, None)
          if val:

      return "unknown"

  # -------- Inference --------
+ def transcribe(audio_path: str, language: str) -> Tuple[str, Optional[Dict[str, Any]]]:
      """
      Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
      then run it through the chosen ASR pipeline.

      """
      if not audio_path:
          return "⚠️ Please upload or record an audio clip.", None
+
+     try:
+         # Load and process audio
+         speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
+         duration_s = float(len(speech) / float(sr))
+
+         # Get ASR pipeline
+         pipe = get_asr_pipeline(language)
+         decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
+
+         # Run inference
+         logger.info(f"Running ASR inference for {language} on {duration_s:.2f}s audio")
+         t0 = time.time()
+         result = pipe({"sampling_rate": sr, "raw": speech})
+         latency_ms = int((time.time() - t0) * 1000.0)
+         hyp_text = result.get("text", "")
+
+         # Calculate real-time factor
+         rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
+
+         # Prepare metadata
+         meta = {
+             "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+             "session_id": f"anon-{uuid.uuid4()}",
+             "language_display": language,
+             "model_id": language_models.get(language, "unknown"),
+             "model_revision": _model_revision_from_pipeline(pipe),
+             "audio_duration_s": duration_s,
+             "sample_rate": sr,
+             "source": "upload",
+             "decode_params": json.dumps(decode_params),
+             "transcript_hyp": hyp_text,
+             "latency_ms": latency_ms,
+             "rtf": rtf,
+         }
+
+         logger.info(f"Transcription complete. RTF: {rtf:.3f}")
+         return hyp_text, meta
+
+     except Exception as e:
+         logger.error(f"Transcription failed: {e}")
+         return f"❌ Transcription failed: {str(e)}", None
+
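RTF here is wall-clock decode time over audio duration, so values below 1.0 mean faster than real time (for example, 2 s of compute on 10 s of audio gives RTF 0.2). A hypothetical direct call:

    text, meta = transcribe("example_clip.wav", "Ewe")   # hypothetical clip
    if meta:
        print(text, meta["latency_ms"], f"RTF={meta['rtf']:.3f}")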
+ # -------- Feedback submit --------
+ def submit_feedback(
+     meta: Optional[Dict[str, Any]],
+     corrected_text: str,
+     score: int,
+     store_audio: bool,
+     share_publicly: bool,
+     audio_file_path: Optional[str]
+ ) -> Dict[str, Any]:
      """
+     Submit feedback to HF Dataset with improved error handling.
      """
      if not meta:
+         return {
+             "status": "❌ No transcription metadata available. Please transcribe first.",
+             "success": False
+         }
+
+     # Prepare row data
      row = dict(meta)
      row.update({
          "corrected_text": (corrected_text or "").strip(),
          "score_out_of_10": int(score) if score is not None else None,
          "share_publicly": bool(share_publicly),
      })
+
+     # Push to HF Dataset
      try:
          audio_to_push = audio_file_path if store_audio else None
          hf_status = _push_row_to_hf_dataset(row, audio_to_push)
+
+         return {
+             "status": f"✅ {hf_status}",
+             "success": True,
+             "latency_ms": row["latency_ms"],
+             "rtf": f"{row['rtf']:.3f}",
+             "model_id": row["model_id"],
+             "model_revision": row["model_revision"],
+             "language": row["language_display"],
+         }
      except Exception as e:
+         logger.error(f"Failed to submit feedback: {e}")
+         return {
+             "status": f"❌ Failed to submit feedback: {str(e)}",
+             "success": False
+         }
+
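The handler always returns a JSON-serializable dict, so the gr.JSON output renders success and failure uniformly. A sketch of the round trip, assuming meta came from a prior transcribe() call:

    result = submit_feedback(
        meta,                     # from transcribe() above
        corrected_text="",
        score=8,
        store_audio=False,
        share_publicly=False,
        audio_file_path=None,
    )
    print(result["status"], result["success"])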
+ # -------- Gradio UI --------
+ def create_demo():
+     """Create the Gradio demo interface."""
+
+     with gr.Blocks(
+         title="🌐 Multilingual ASR Demo",
+         theme=gr.themes.Soft()
+     ) as demo:
+         gr.Markdown(
+             """
+             # 🎙️ Multilingual Speech-to-Text Demo
+
+             Upload an audio file (MP3, WAV, FLAC, M4A, OGG, etc.) or record via your microphone.
+             Then choose the language/model and hit **Transcribe**.
+
+             ---
+             """
          )
+
          with gr.Row():
+             with gr.Column(scale=1):
+                 lang = gr.Dropdown(
+                     choices=list(language_models.keys()),
+                     value=list(language_models.keys())[0],
+                     label="Select Language / Model",
+                     info="Choose the language of your audio"
+                 )
+
+                 audio = gr.Audio(
+                     sources=["upload", "microphone"],
+                     type="filepath",
+                     label="Upload or Record Audio",
+                     elem_id="audio-input"
+                 )
+
+                 btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
+
+             with gr.Column(scale=1):
+                 output = gr.Textbox(
+                     label="Transcription",
+                     placeholder="Transcription will appear here...",
+                     lines=5
+                 )
+
+                 # Status indicators
+                 with gr.Row():
+                     status_box = gr.Textbox(
+                         label="Status",
+                         interactive=False,
+                         placeholder="Ready",
+                         max_lines=1
+                     )
+
+         # Hidden state to carry metadata from transcribe -> feedback
+         meta_state = gr.State(value=None)
+
+         # Evaluation section
+         with gr.Accordion("📝 Evaluation & Feedback", open=False):
+             gr.Markdown(
+                 """
+                 Help us improve! Please provide feedback on the transcription quality.
+                 """
+             )
+
+             with gr.Row():
+                 corrected_tb = gr.Textbox(
+                     label="Corrected transcript (optional)",
+                     placeholder="If there are errors, type the correct transcription here...",
+                     lines=4,
+                     value=""
+                 )
+
+             with gr.Row():
+                 score_slider = gr.Slider(
+                     minimum=0,
+                     maximum=10,
+                     step=1,
+                     label="Quality Score (0 = terrible, 10 = perfect)",
+                     value=7,
+                     info="Rate the transcription quality"
+                 )
+
+             with gr.Row():
+                 store_audio_cb = gr.Checkbox(
+                     label="Allow storing my audio for research/evaluation",
+                     value=False,
+                     info="Audio will be stored securely and used only for improving the models"
+                 )
+                 share_cb = gr.Checkbox(
+                     label="Allow sharing this example publicly",
+                     value=False,
+                     info="Your example may be used in public datasets or demos"
+                 )
+
+             submit_btn = gr.Button("📤 Submit Feedback", variant="secondary")
+
+             results_json = gr.JSON(
+                 label="Submission Result",
+                 visible=True
+             )
+
+         # Examples section
+         with gr.Accordion("📚 Example Usage", open=False):
+             gr.Markdown(
+                 """
+                 ### Tips for best results:
+                 - Speak clearly and at a normal pace
+                 - Minimize background noise
+                 - Keep recordings under 30 seconds for optimal performance
+                 - Select the correct language before transcribing
+
+                 ### Supported formats:
+                 WAV, MP3, FLAC, M4A, OGG, OPUS, and more!
+                 """
+             )
+
+         # Wire up events
+         def _transcribe_and_update(audio_path, language):
+             """Transcribe and update UI components."""
+             if not audio_path:
+                 return "", None, "", "⚠️ Please provide audio"
+
+             hyp, meta = transcribe(audio_path, language)
+
+             if meta:
+                 status_msg = f"✅ Done! (RTF: {meta['rtf']:.3f})"
+                 # Pre-fill corrected with hypothesis for easy edits
+                 return hyp, meta, hyp, status_msg
+             else:
+                 return hyp, None, "", "❌ Transcription failed"
+
+         btn.click(
+             fn=_transcribe_and_update,
+             inputs=[audio, lang],
+             outputs=[output, meta_state, corrected_tb, status_box]
+         )
+
+         submit_btn.click(
+             fn=submit_feedback,
+             inputs=[
+                 meta_state,
+                 corrected_tb,
+                 score_slider,
+                 store_audio_cb,
+                 share_cb,
+                 audio
+             ],
+             outputs=results_json
+         )
+
+         # Initialize the status box to "Ready" when the page loads
+         demo.load(
+             fn=lambda: "Ready",
+             inputs=[],
+             outputs=[status_box]
+         )
+
+     return demo

+ # -------- Main --------
  if __name__ == "__main__":
+     # Log startup info
+     logger.info("Starting ASR Demo")
+     logger.info(f"HF Dataset Repo: {HF_DATASET_REPO}")
+     logger.info(f"Push to HF enabled: {PUSH_TO_HF}")
+     logger.info(f"Available languages: {len(language_models)}")
+
+     # Create and launch demo
+     demo = create_demo()
+     demo.queue(max_size=10)  # Limit queue size for stability
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False  # Set to True if you want a public link
+     )