Update app.py
app.py CHANGED
@@ -1,14 +1,18 @@
-# app.py (
+# app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)
 
 import os
 import json
 import time
 import uuid
 import logging
+import shutil
+import subprocess
+import tempfile
 import gradio as gr
 from transformers import pipeline
 import numpy as np
-import
+import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
+import librosa  # fallback / resampling
 
 # Optional: modest thread hints for CPU Spaces
 try:
@@ -101,12 +105,12 @@ def _push_row_to_hf_dataset(row, audio_file_path):
     return "Pushed to HF Dataset."
 
 # --- Map display names to your HF Hub model IDs ---
-# --- EDIT THIS: map display names to your HF Hub model IDs ---
 language_models = {
     "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
     "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
     "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
-
+    "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",  # active
+    # "Luganda (FKD)": "FarmerlineML/luganda_fkd",  # commented out per request
     "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
     "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
     "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
@@ -124,21 +128,77 @@ language_models = {
     "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
-    # "WOLOF":
-    # "HAITIAN CREOLE":
-    # "KABYLE":
+    # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
+    # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
+    # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
     "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
-    "Luganda": "FarmerlineML/luganda_fkd",
     "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin": "FarmerlineML/pidgin_nigerian",
     "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
-
-
-    # add more as needed
+    "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
 
+# -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
+TARGET_SR = 16000
+
+def _has_ffmpeg():
+    return shutil.which("ffmpeg") is not None
+
+def _load_with_soundfile(path):
+    data, sr = sf.read(path, always_2d=False)
+    if isinstance(data, np.ndarray) and data.ndim > 1:
+        data = data.mean(axis=1)
+    return data.astype(np.float32), sr
+
+def _load_with_ffmpeg(path, target_sr=TARGET_SR):
+    # Convert to mono 16k wav in a temp file using ffmpeg
+    if not _has_ffmpeg():
+        raise RuntimeError("ffmpeg not available")
+    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    tmp_wav.close()
+    cmd = [
+        "ffmpeg", "-hide_banner", "-loglevel", "error",
+        "-y", "-i", path,
+        "-ac", "1", "-ar", str(target_sr),
+        tmp_wav.name,
+    ]
+    subprocess.run(cmd, check=True)
+    data, sr = sf.read(tmp_wav.name, always_2d=False)
+    try:
+        os.remove(tmp_wav.name)
+    except Exception:
+        pass
+    if isinstance(data, np.ndarray) and data.ndim > 1:
+        data = data.mean(axis=1)
+    return data.astype(np.float32), sr
+
+def _resample_if_needed(y, sr, target_sr=TARGET_SR):
+    if sr == target_sr:
+        return y.astype(np.float32), sr
+    y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
+    return y_rs.astype(np.float32), target_sr
+
+def load_audio_any(path, target_sr=TARGET_SR):
+    """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
+    ext = os.path.splitext(path)[1].lower()
+    try:
+        if ext in {".wav", ".flac", ".ogg", ".opus"}:
+            y, sr = _load_with_soundfile(path)
+        elif _has_ffmpeg():
+            y, sr = _load_with_ffmpeg(path, target_sr=target_sr)
+            return y, sr  # already mono+16k
+        else:
+            # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
+            y, sr = librosa.load(path, sr=None, mono=True)
+        y, sr = _resample_if_needed(y, sr, target_sr)
+        return y, sr
+    except Exception as e:
+        logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
+        y, sr = librosa.load(path, sr=target_sr, mono=True)
+        return y.astype(np.float32), sr
+
 # -------- Lazy-load pipeline cache (Space-safe) --------
 _PIPELINE_CACHE = {}
 _CACHE_ORDER = []  # usage order
@@ -193,15 +253,15 @@ def _model_revision_from_pipeline(pipe) -> str:
 # -------- Inference --------
 def transcribe(audio_path: str, language: str):
     """
-
-
-    Returns transcript
+    Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
+    then run it through the chosen ASR pipeline.
+    Returns transcript and a meta dict for feedback.
     """
     if not audio_path:
         return "⚠️ Please upload or record an audio clip.", None
 
-    speech, sr =
-    duration_s = float(
+    speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
+    duration_s = float(len(speech) / float(sr))
 
     pipe = get_asr_pipeline(language)
     decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
@@ -233,7 +293,6 @@ def transcribe(audio_path: str, language: str):
 def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
     """
     Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
-    No WER/CER computations.
     """
    if not meta:
        return {"status": "No transcription metadata available. Please transcribe first."}