FarmerlineML committed on
Commit
ea4b615
·
verified ·
1 Parent(s): b9361f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -17
app.py CHANGED
@@ -1,14 +1,18 @@
1
- # app.py (simplified + fixed language alignment; Luganda fkd commented out)
2
 
3
  import os
4
  import json
5
  import time
6
  import uuid
7
  import logging
 
 
 
8
  import gradio as gr
9
  from transformers import pipeline
10
  import numpy as np
11
- import librosa # pip install librosa
 
12
 
13
  # Optional: modest thread hints for CPU Spaces
14
  try:
@@ -101,12 +105,12 @@ def _push_row_to_hf_dataset(row, audio_file_path):
101
  return "Pushed to HF Dataset."
102
 
103
  # --- Map display names to your HF Hub model IDs ---
104
- # --- EDIT THIS: map display names to your HF Hub model IDs ---
105
  language_models = {
106
  "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
107
  "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
108
  "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
109
- # "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",
 
110
  "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
111
  "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
112
  "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
@@ -124,21 +128,77 @@ language_models = {
124
  "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
125
  "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
126
  "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
127
- # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
128
- # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
129
- # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
130
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
131
- "Luganda": "FarmerlineML/luganda_fkd",
132
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
133
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
134
  "Pidgin": "FarmerlineML/pidgin_nigerian",
135
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
136
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
137
- #"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
138
-
139
- # add more as needed
140
  }
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  # -------- Lazy-load pipeline cache (Space-safe) --------
143
  _PIPELINE_CACHE = {}
144
  _CACHE_ORDER = [] # usage order
@@ -193,15 +253,15 @@ def _model_revision_from_pipeline(pipe) -> str:
193
  # -------- Inference --------
194
  def transcribe(audio_path: str, language: str):
195
  """
196
- Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
197
- convert to mono, then run it through the chosen ASR pipeline.
198
- Returns transcript (unchanged behavior) and a meta dict for feedback.
199
  """
200
  if not audio_path:
201
  return "⚠️ Please upload or record an audio clip.", None
202
 
203
- speech, sr = librosa.load(audio_path, sr=None, mono=True)
204
- duration_s = float(librosa.get_duration(y=speech, sr=sr))
205
 
206
  pipe = get_asr_pipeline(language)
207
  decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
@@ -233,7 +293,6 @@ def transcribe(audio_path: str, language: str):
233
  def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
234
  """
235
  Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
236
- No WER/CER computations.
237
  """
238
  if not meta:
239
  return {"status": "No transcription metadata available. Please transcribe first."}
 
1
+ # app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)
2
 
3
  import os
4
  import json
5
  import time
6
  import uuid
7
  import logging
8
+ import shutil
9
+ import subprocess
10
+ import tempfile
11
  import gradio as gr
12
  from transformers import pipeline
13
  import numpy as np
14
+ import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
+ import librosa # fallback / resampling
16
 
17
  # Optional: modest thread hints for CPU Spaces
18
  try:
 
105
  return "Pushed to HF Dataset."
106
 
107
  # --- Map display names to your HF Hub model IDs ---
 
108
  language_models = {
109
  "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
110
  "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
111
  "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
112
+ "Luganda": "FarmerlineML/w2v-bert-2.0_luganda", # active
113
+ # "Luganda (FKD)": "FarmerlineML/luganda_fkd", # commented out per request
114
  "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
115
  "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
116
  "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
 
128
  "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
129
  "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
130
  "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
131
+ # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
132
+ # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
133
+ # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
134
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
 
135
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
136
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
137
  "Pidgin": "FarmerlineML/pidgin_nigerian",
138
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
139
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
140
+ "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
 
 
141
  }
142
 
143
+ # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
144
+ TARGET_SR = 16000
145
+
146
+ def _has_ffmpeg():
147
+ return shutil.which("ffmpeg") is not None
148
+
149
+ def _load_with_soundfile(path):
150
+ data, sr = sf.read(path, always_2d=False)
151
+ if isinstance(data, np.ndarray) and data.ndim > 1:
152
+ data = data.mean(axis=1)
153
+ return data.astype(np.float32), sr
154
+
155
+ def _load_with_ffmpeg(path, target_sr=TARGET_SR):
156
+ # Convert to mono 16k wav in a temp file using ffmpeg
157
+ if not _has_ffmpeg():
158
+ raise RuntimeError("ffmpeg not available")
159
+ tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
160
+ tmp_wav.close()
161
+ cmd = [
162
+ "ffmpeg", "-hide_banner", "-loglevel", "error",
163
+ "-y", "-i", path,
164
+ "-ac", "1", "-ar", str(target_sr),
165
+ tmp_wav.name,
166
+ ]
167
+ subprocess.run(cmd, check=True)
168
+ data, sr = sf.read(tmp_wav.name, always_2d=False)
169
+ try:
170
+ os.remove(tmp_wav.name)
171
+ except Exception:
172
+ pass
173
+ if isinstance(data, np.ndarray) and data.ndim > 1:
174
+ data = data.mean(axis=1)
175
+ return data.astype(np.float32), sr
176
+
177
+ def _resample_if_needed(y, sr, target_sr=TARGET_SR):
178
+ if sr == target_sr:
179
+ return y.astype(np.float32), sr
180
+ y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
181
+ return y_rs.astype(np.float32), target_sr
182
+
183
+ def load_audio_any(path, target_sr=TARGET_SR):
184
+ """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
185
+ ext = os.path.splitext(path)[1].lower()
186
+ try:
187
+ if ext in {".wav", ".flac", ".ogg", ".opus"}:
188
+ y, sr = _load_with_soundfile(path)
189
+ elif _has_ffmpeg():
190
+ y, sr = _load_with_ffmpeg(path, target_sr=target_sr)
191
+ return y, sr # already mono+16k
192
+ else:
193
+ # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
194
+ y, sr = librosa.load(path, sr=None, mono=True)
195
+ y, sr = _resample_if_needed(y, sr, target_sr)
196
+ return y, sr
197
+ except Exception as e:
198
+ logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
199
+ y, sr = librosa.load(path, sr=target_sr, mono=True)
200
+ return y.astype(np.float32), sr
201
+
202
  # -------- Lazy-load pipeline cache (Space-safe) --------
203
  _PIPELINE_CACHE = {}
204
  _CACHE_ORDER = [] # usage order
 
253
  # -------- Inference --------
254
  def transcribe(audio_path: str, language: str):
255
  """
256
+ Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
257
+ then run it through the chosen ASR pipeline.
258
+ Returns transcript and a meta dict for feedback.
259
  """
260
  if not audio_path:
261
  return "⚠️ Please upload or record an audio clip.", None
262
 
263
+ speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
264
+ duration_s = float(len(speech) / float(sr))
265
 
266
  pipe = get_asr_pipeline(language)
267
  decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
 
293
  def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
294
  """
295
  Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
 
296
  """
297
  if not meta:
298
  return {"status": "No transcription metadata available. Please transcribe first."}