FarmerlineML committed on
Commit
edff215
·
verified ·
1 Parent(s): 0a9945e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -173
app.py CHANGED
@@ -1,16 +1,9 @@
1
  # app.py
2
 
3
- import os
4
- import time
5
- import datetime as dt
6
- import pandas as pd
7
  import gradio as gr
8
  from transformers import pipeline
9
  import numpy as np
10
  import librosa # pip install librosa
11
- from jiwer import wer # pip install jiwer
12
-
13
- LOG_PATH = "feedback_logs.csv"
14
 
15
  # --- EDIT THIS: map display names to your HF Hub model IDs ---
16
  language_models = {
@@ -35,14 +28,19 @@ language_models = {
35
  "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
36
  "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
37
  "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
 
 
 
38
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
39
- "Luganda (FKD)": "FarmerlineML/luganda_fkd",
40
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
41
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
42
  "Pidgin": "FarmerlineML/pidgin_nigerian",
43
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
44
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
45
  #"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
 
 
46
  }
47
 
48
  # Pre-load pipelines for each language on CPU (device=-1)
@@ -56,194 +54,53 @@ asr_pipelines = {
56
  for lang, model_id in language_models.items()
57
  }
58
 
59
- def transcribe(audio_path: str, language: str):
 
60
  """
61
  Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
62
  convert to mono, then run it through the chosen ASR pipeline.
63
- Returns (transcript, runtime_seconds, duration_seconds).
64
  """
65
  if not audio_path:
66
- return "⚠️ Please upload or record an audio clip.", 0.0, 0.0
67
 
68
  # librosa.load returns a 1D np.ndarray (mono) and the sample rate
69
  speech, sr = librosa.load(audio_path, sr=None, mono=True)
70
- duration_s = librosa.get_duration(y=speech, sr=sr)
71
 
72
- t0 = time.time()
73
  result = asr_pipelines[language]({
74
  "sampling_rate": sr,
75
  "raw": speech
76
  })
77
- runtime_s = time.time() - t0
78
- text = result.get("text", "")
79
- return text, round(runtime_s, 3), round(duration_s, 3)
80
-
81
- def compute_wer(pred: str, ref: str) -> float:
82
- if not ref or not pred:
83
- return None
84
- try:
85
- return float(wer(ref, pred))
86
- except Exception:
87
- return None
88
-
89
- def ensure_logfile():
90
- if not os.path.exists(LOG_PATH):
91
- pd.DataFrame(columns=[
92
- "timestamp", "language", "model_id", "audio_filename",
93
- "duration_s", "runtime_s", "transcript", "reference",
94
- "wer", "score_10", "feedback",
95
- "domain", "environment", "accent_locale"
96
- ]).to_csv(LOG_PATH, index=False)
97
-
98
- def save_feedback(language: str,
99
- transcript: str,
100
- reference: str,
101
- score_10: int,
102
- feedback: str,
103
- audio_file: str,
104
- duration_s: float,
105
- runtime_s: float,
106
- domain: str,
107
- environment: str,
108
- accent_locale: str):
109
- ensure_logfile()
110
- model_id = language_models.get(language, "")
111
- audio_filename = os.path.basename(audio_file) if audio_file else ""
112
-
113
- w = compute_wer(transcript, reference)
114
-
115
- row = {
116
- "timestamp": dt.datetime.utcnow().isoformat(),
117
- "language": language,
118
- "model_id": model_id,
119
- "audio_filename": audio_filename,
120
- "duration_s": duration_s,
121
- "runtime_s": runtime_s,
122
- "transcript": transcript,
123
- "reference": reference,
124
- "wer": w,
125
- "score_10": score_10,
126
- "feedback": feedback,
127
- "domain": domain,
128
- "environment": environment,
129
- "accent_locale": accent_locale
130
- }
131
- try:
132
- df = pd.read_csv(LOG_PATH)
133
- df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
134
- df.to_csv(LOG_PATH, index=False)
135
- msg = "✅ Feedback saved."
136
- if w is not None:
137
- msg += f" WER: {w:.3f}"
138
- return msg
139
- except Exception as e:
140
- return f"❌ Could not save feedback: {e}"
141
-
142
- def load_metrics():
143
- ensure_logfile()
144
- df = pd.read_csv(LOG_PATH)
145
- if df.empty:
146
- return "No feedback yet.", None, None, df
147
 
148
- # Aggregates
149
- # Per-language means:
150
- per_lang = df.groupby("language").agg(
151
- n=("wer", "count"),
152
- mean_WER=("wer", "mean"),
153
- mean_score=("score_10", "mean"),
154
- mean_runtime_s=("runtime_s", "mean"),
155
- mean_duration_s=("duration_s", "mean")
156
- ).reset_index().sort_values(by="mean_WER", ascending=True)
157
 
158
- # Per-domain (optional):
159
- per_domain = df.groupby("domain").agg(
160
- n=("wer", "count"),
161
- mean_WER=("wer", "mean"),
162
- mean_score=("score_10", "mean")
163
- ).reset_index().sort_values(by="mean_WER", ascending=True)
164
-
165
- return "📊 Metrics updated.", per_lang, per_domain, df
166
-
167
- with gr.Blocks(title="🌐 Multilingual ASR Demo", theme=gr.themes.Soft()) as demo:
168
  gr.Markdown(
169
  """
170
- ## 🎙️ Multilingual Speech-to-Text + Feedback & Benchmarking
171
  Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
172
- Choose the language/model and hit **Transcribe**.
173
- Optionally provide a **reference transcript** to compute WER, then leave a score & feedback.
174
  """
175
  )
176
 
177
- with gr.Tabs():
178
- with gr.Tab("ASR"):
179
- with gr.Row():
180
- lang = gr.Dropdown(
181
- choices=list(language_models.keys()),
182
- value=list(language_models.keys())[0],
183
- label="Select Language / Model"
184
- )
185
-
186
- with gr.Row():
187
- audio = gr.Audio(
188
- sources=["upload", "microphone"],
189
- type="filepath",
190
- label="Upload or Record Audio"
191
- )
192
-
193
- btn = gr.Button("Transcribe", variant="primary")
194
- output = gr.Textbox(label="Transcription", lines=6)
195
- runtime = gr.Number(label="Model runtime (s)", precision=3, interactive=False)
196
- duration = gr.Number(label="Audio duration (s)", precision=3, interactive=False)
197
-
198
- # Feedback / Benchmark block
199
- gr.Markdown("### 📝 Feedback & WER (optional)")
200
- with gr.Row():
201
- reference = gr.Textbox(label="Reference transcript (optional, for WER)", lines=4, placeholder="Paste the ground-truth text here to compute WER")
202
- with gr.Row():
203
- score = gr.Slider(0, 10, step=1, value=8, label="Overall quality score (0–10)")
204
- with gr.Row():
205
- domain = gr.Dropdown(
206
- ["General", "Conversational", "News", "Agriculture", "Healthcare", "Education", "Customer support", "Finance", "Legal", "Entertainment", "Other"],
207
- value="General",
208
- label="Domain/topic"
209
- )
210
- environment = gr.Dropdown(
211
- ["Quiet", "Office", "Outdoor", "Vehicle", "Crowd/Market", "Radio/Phone", "Other"],
212
- value="Quiet",
213
- label="Recording environment"
214
- )
215
- accent_locale = gr.Textbox(label="Accent / Locale (e.g., Accra, Nairobi, Lagos)", placeholder="Optional")
216
-
217
- feedback = gr.Textbox(label="Free-text feedback", lines=4, placeholder="What worked well? What failed? Any specific words or sounds?")
218
-
219
- save_btn = gr.Button("Save Feedback", variant="secondary")
220
- save_msg = gr.Markdown("")
221
-
222
- # Wire up
223
- btn.click(
224
- fn=transcribe,
225
- inputs=[audio, lang],
226
- outputs=[output, runtime, duration]
227
- )
228
 
229
- save_btn.click(
230
- fn=save_feedback,
231
- inputs=[lang, output, reference, score, feedback, audio, duration, runtime, domain, environment, accent_locale],
232
- outputs=save_msg
233
- )
 
234
 
235
- with gr.Tab("Metrics"):
236
- refresh = gr.Button("Refresh metrics", variant="primary")
237
- metrics_msg = gr.Markdown()
238
- per_lang_df = gr.Dataframe(interactive=False, label="Per-language summary (lower WER is better)")
239
- per_domain_df = gr.Dataframe(interactive=False, label="Per-domain summary")
240
- logs_df = gr.Dataframe(interactive=False, label="Raw feedback log")
241
 
242
- refresh.click(
243
- fn=load_metrics,
244
- inputs=[],
245
- outputs=[metrics_msg, per_lang_df, per_domain_df, logs_df]
246
- )
247
 
248
  if __name__ == "__main__":
249
- demo.launch()
 
1
  # app.py
2
 
 
 
 
 
3
  import gradio as gr
4
  from transformers import pipeline
5
  import numpy as np
6
  import librosa # pip install librosa
 
 
 
7
 
8
  # --- EDIT THIS: map display names to your HF Hub model IDs ---
9
  language_models = {
 
28
  "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
29
  "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
30
  "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
31
+ # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
32
+ # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
33
+ # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
34
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
35
+ "Luganda": "FarmerlineML/luganda_fkd",
36
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
37
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
38
  "Pidgin": "FarmerlineML/pidgin_nigerian",
39
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
40
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
41
  #"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
42
+
43
+ # add more as needed
44
  }
45
 
46
  # Pre-load pipelines for each language on CPU (device=-1)
 
54
  for lang, model_id in language_models.items()
55
  }
56
 
57
+
58
+ def transcribe(audio_path: str, language: str) -> str:
59
  """
60
  Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
61
  convert to mono, then run it through the chosen ASR pipeline.
 
62
  """
63
  if not audio_path:
64
+ return "⚠️ Please upload or record an audio clip."
65
 
66
  # librosa.load returns a 1D np.ndarray (mono) and the sample rate
67
  speech, sr = librosa.load(audio_path, sr=None, mono=True)
 
68
 
69
+ # Call the Hugging Face ASR pipeline
70
  result = asr_pipelines[language]({
71
  "sampling_rate": sr,
72
  "raw": speech
73
  })
74
+ return result.get("text", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
 
 
 
 
 
 
 
 
 
76
 
77
+ with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
 
 
 
 
 
 
 
 
 
78
  gr.Markdown(
79
  """
80
+ ## 🎙️ Multilingual Speech-to-Text
81
  Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
82
+ Then choose the language/model and hit **Transcribe**.
 
83
  """
84
  )
85
 
86
+ with gr.Row():
87
+ lang = gr.Dropdown(
88
+ choices=list(language_models.keys()),
89
+ value=list(language_models.keys())[0],
90
+ label="Select Language / Model"
91
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ with gr.Row():
94
+ audio = gr.Audio(
95
+ sources=["upload", "microphone"],
96
+ type="filepath",
97
+ label="Upload or Record Audio"
98
+ )
99
 
100
+ btn = gr.Button("Transcribe")
101
+ output = gr.Textbox(label="Transcription")
 
 
 
 
102
 
103
+ btn.click(fn=transcribe, inputs=[audio, lang], outputs=output)
 
 
 
 
104
 
105
  if __name__ == "__main__":
106
+ demo.launch()