FarmerlineML committed on
Commit
ad14e8e
·
verified ·
1 Parent(s): 69a004d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -46
app.py CHANGED
@@ -1,9 +1,16 @@
1
  # app.py
2
 
 
 
 
 
3
  import gradio as gr
4
  from transformers import pipeline
5
  import numpy as np
6
  import librosa # pip install librosa
 
 
 
7
 
8
  # --- EDIT THIS: map display names to your HF Hub model IDs ---
9
  language_models = {
@@ -12,34 +19,30 @@ language_models = {
12
  "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
13
  "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",
14
  "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
15
- "FANTE": "misterkissi/w2v2-lg-xls-r-300m-fante",
16
- "BEMBA": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
17
- "BAMBARA": "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
18
- "DAGAARE": "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
19
- "KINYARWANDA": "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
20
- "FULA": "DarliAI/kissi-wav2vec2-fula-fleurs-full",
21
- "OROMO": "DarliAI/kissi-w2v-bert-2.0-oromo",
22
- "RUNYANKORE": "misterkissi/w2v2-lg-xls-r-300m-runyankore",
23
- "GA": "misterkissi/w2v2-lg-xls-r-300m-ga",
24
- "VAI": "misterkissi/whisper-small-vai",
25
- "KASEM": "misterkissi/w2v2-lg-xls-r-300m-kasem",
26
- "LINGALA": "misterkissi/w2v2-lg-xls-r-300m-lingala",
27
- "FONGBE": "misterkissi/whisper-small-fongbe",
28
- "AMHARIC": "misterkissi/w2v2-lg-xls-r-1b-amharic",
29
- "XHOSA": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
30
- "TSONGA": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
31
- # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
32
- # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
33
- # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
34
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
35
- "Luganda": "FarmerlineML/luganda_fkd",
36
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
37
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
38
  "Pidgin": "FarmerlineML/pidgin_nigerian",
39
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
40
- "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1"
41
-
42
- # add more as needed
43
  }
44
 
45
  # Pre-load pipelines for each language on CPU (device=-1)
@@ -53,53 +56,194 @@ asr_pipelines = {
53
  for lang, model_id in language_models.items()
54
  }
55
 
56
-
57
- def transcribe(audio_path: str, language: str) -> str:
58
  """
59
  Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
60
  convert to mono, then run it through the chosen ASR pipeline.
 
61
  """
62
  if not audio_path:
63
- return "⚠️ Please upload or record an audio clip."
64
 
65
  # librosa.load returns a 1D np.ndarray (mono) and the sample rate
66
  speech, sr = librosa.load(audio_path, sr=None, mono=True)
 
67
 
68
- # Call the Hugging Face ASR pipeline
69
  result = asr_pipelines[language]({
70
  "sampling_rate": sr,
71
  "raw": speech
72
  })
73
- return result.get("text", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
 
 
 
 
 
 
 
 
 
75
 
76
- with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
 
 
 
 
 
 
 
 
 
77
  gr.Markdown(
78
  """
79
- ## πŸŽ™οΈ Multilingual Speech-to-Text
80
  Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
81
- Then choose the language/model and hit **Transcribe**.
 
82
  """
83
  )
84
 
85
- with gr.Row():
86
- lang = gr.Dropdown(
87
- choices=list(language_models.keys()),
88
- value=list(language_models.keys())[0],
89
- label="Select Language / Model"
90
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- with gr.Row():
93
- audio = gr.Audio(
94
- sources=["upload", "microphone"],
95
- type="filepath",
96
- label="Upload or Record Audio"
97
- )
98
 
99
- btn = gr.Button("Transcribe")
100
- output = gr.Textbox(label="Transcription")
 
 
 
 
101
 
102
- btn.click(fn=transcribe, inputs=[audio, lang], outputs=output)
 
 
 
 
103
 
104
  if __name__ == "__main__":
105
  demo.launch()
 
1
  # app.py
2
 
3
+ import os
4
+ import time
5
+ import datetime as dt
6
+ import pandas as pd
7
  import gradio as gr
8
  from transformers import pipeline
9
  import numpy as np
10
  import librosa # pip install librosa
11
+ from jiwer import wer # pip install jiwer
12
+
13
+ LOG_PATH = "feedback_logs.csv"
14
 
15
  # --- EDIT THIS: map display names to your HF Hub model IDs ---
16
  language_models = {
 
19
  "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
20
  "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",
21
  "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
22
+ "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
23
+ "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
24
+ "Bambara": "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
25
+ "Dagaare": "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
26
+ "Kinyarwanda": "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
27
+ "Fula": "DarliAI/kissi-wav2vec2-fula-fleurs-full",
28
+ "Oromo": "DarliAI/kissi-w2v-bert-2.0-oromo",
29
+ "Runyankore": "misterkissi/w2v2-lg-xls-r-300m-runyankore",
30
+ "Ga": "misterkissi/w2v2-lg-xls-r-300m-ga",
31
+ "Vai": "misterkissi/whisper-small-vai",
32
+ "Kasem": "misterkissi/w2v2-lg-xls-r-300m-kasem",
33
+ "Lingala": "misterkissi/w2v2-lg-xls-r-300m-lingala",
34
+ "Fongbe": "misterkissi/whisper-small-fongbe",
35
+ "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
36
+ "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
37
+ "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
 
 
 
38
  "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
39
+ "Luganda (FKD)": "FarmerlineML/luganda_fkd",
40
  "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
41
  "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
42
  "Pidgin": "FarmerlineML/pidgin_nigerian",
43
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
44
+ "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
45
+ "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
 
46
  }
47
 
48
  # Pre-load pipelines for each language on CPU (device=-1)
 
56
  for lang, model_id in language_models.items()
57
  }
58
 
59
def transcribe(audio_path: str, language: str):
    """
    Transcribe an audio clip with the ASR pipeline selected for *language*.

    Loads the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
    downmixed to mono at its native sample rate, then runs it through the
    pre-loaded Hugging Face pipeline for the chosen language.

    Returns:
        tuple: (transcript, runtime_seconds, duration_seconds).
               When no audio is supplied, a warning string and 0.0 timings.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip.", 0.0, 0.0

    # librosa.load returns a 1D np.ndarray (mono) and the sample rate
    speech, sr = librosa.load(audio_path, sr=None, mono=True)
    duration_s = librosa.get_duration(y=speech, sr=sr)

    # perf_counter is monotonic and higher-resolution than time.time(),
    # so the reported model runtime can't be skewed by clock adjustments.
    t0 = time.perf_counter()
    result = asr_pipelines[language]({
        "sampling_rate": sr,
        "raw": speech
    })
    runtime_s = time.perf_counter() - t0

    text = result.get("text", "")
    return text, round(runtime_s, 3), round(duration_s, 3)
80
+
81
def compute_wer(pred: str, ref: str) -> float:
    """Return the word error rate of *pred* against *ref*, or None.

    None is returned when either string is empty/missing, or when the
    jiwer computation fails for any reason (best-effort metric — a bad
    reference must never break the UI).
    """
    if not pred or not ref:
        return None
    try:
        score = wer(ref, pred)
    except Exception:
        # Deliberate broad catch: WER is optional, failures are ignored.
        return None
    return float(score)
88
+
89
def ensure_logfile():
    """Create the feedback CSV with its header row if it does not exist yet."""
    if os.path.exists(LOG_PATH):
        return
    columns = [
        "timestamp", "language", "model_id", "audio_filename",
        "duration_s", "runtime_s", "transcript", "reference",
        "wer", "score_10", "feedback",
        "domain", "environment", "accent_locale",
    ]
    # An empty frame writes only the header line.
    pd.DataFrame(columns=columns).to_csv(LOG_PATH, index=False)
97
+
98
def save_feedback(language: str,
                  transcript: str,
                  reference: str,
                  score_10: int,
                  feedback: str,
                  audio_file: str,
                  duration_s: float,
                  runtime_s: float,
                  domain: str,
                  environment: str,
                  accent_locale: str):
    """
    Append one feedback row (transcript, optional reference/WER, score and
    metadata) to the CSV log and return a status message for the UI.
    """
    ensure_logfile()
    model_id = language_models.get(language, "")
    audio_filename = os.path.basename(audio_file) if audio_file else ""

    # WER is only computed when a non-empty reference was provided.
    w = compute_wer(transcript, reference)

    row = {
        # Timezone-aware UTC: datetime.utcnow() is deprecated (Python 3.12+)
        # and produced naive timestamps.
        "timestamp": dt.datetime.now(dt.timezone.utc).isoformat(),
        "language": language,
        "model_id": model_id,
        "audio_filename": audio_filename,
        "duration_s": duration_s,
        "runtime_s": runtime_s,
        "transcript": transcript,
        "reference": reference,
        "wer": w,
        "score_10": score_10,
        "feedback": feedback,
        "domain": domain,
        "environment": environment,
        "accent_locale": accent_locale
    }
    try:
        # Read + concat keeps column order stable even if the schema evolves.
        df = pd.read_csv(LOG_PATH)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
        df.to_csv(LOG_PATH, index=False)
        msg = "✅ Feedback saved."
        if w is not None:
            msg += f" WER: {w:.3f}"
        return msg
    except Exception as e:
        # Surface the failure to the user rather than crashing the app.
        return f"❌ Could not save feedback: {e}"
141
+
142
def load_metrics():
    """Read the feedback log; return (status, per-language df, per-domain df, raw df)."""
    ensure_logfile()
    log = pd.read_csv(LOG_PATH)
    if log.empty:
        return "No feedback yet.", None, None, log

    # Per-language summary, best (lowest) mean WER first.
    by_language = (
        log.groupby("language")
        .agg(
            n=("wer", "count"),
            mean_WER=("wer", "mean"),
            mean_score=("score_10", "mean"),
            mean_runtime_s=("runtime_s", "mean"),
            mean_duration_s=("duration_s", "mean"),
        )
        .reset_index()
        .sort_values(by="mean_WER", ascending=True)
    )

    # Per-domain summary (optional metadata field).
    by_domain = (
        log.groupby("domain")
        .agg(
            n=("wer", "count"),
            mean_WER=("wer", "mean"),
            mean_score=("score_10", "mean"),
        )
        .reset_index()
        .sort_values(by="mean_WER", ascending=True)
    )

    return "📊 Metrics updated.", by_language, by_domain, log
166
+
167
# ---------------- Gradio UI ----------------
with gr.Blocks(title="🌐 Multilingual ASR Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text + Feedback & Benchmarking
        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
        Choose the language/model and hit **Transcribe**.
        Optionally provide a **reference transcript** to compute WER, then leave a score & feedback.
        """
    )

    with gr.Tabs():
        # --- Tab 1: transcription + feedback capture ---
        with gr.Tab("ASR"):
            with gr.Row():
                lang = gr.Dropdown(
                    choices=list(language_models.keys()),
                    value=list(language_models.keys())[0],
                    label="Select Language / Model"
                )

            with gr.Row():
                audio = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Upload or Record Audio"
                )

            btn = gr.Button("Transcribe", variant="primary")
            output = gr.Textbox(label="Transcription", lines=6)
            runtime = gr.Number(label="Model runtime (s)", precision=3, interactive=False)
            duration = gr.Number(label="Audio duration (s)", precision=3, interactive=False)

            # Optional benchmarking inputs; a reference transcript enables WER.
            gr.Markdown("### 📝 Feedback & WER (optional)")
            with gr.Row():
                reference = gr.Textbox(label="Reference transcript (optional, for WER)", lines=4, placeholder="Paste the ground-truth text here to compute WER")
            with gr.Row():
                score = gr.Slider(0, 10, step=1, value=8, label="Overall quality score (0–10)")
            with gr.Row():
                domain = gr.Dropdown(
                    ["General", "Conversational", "News", "Agriculture", "Healthcare", "Education", "Customer support", "Finance", "Legal", "Entertainment", "Other"],
                    value="General",
                    label="Domain/topic"
                )
                environment = gr.Dropdown(
                    ["Quiet", "Office", "Outdoor", "Vehicle", "Crowd/Market", "Radio/Phone", "Other"],
                    value="Quiet",
                    label="Recording environment"
                )
                accent_locale = gr.Textbox(label="Accent / Locale (e.g., Accra, Nairobi, Lagos)", placeholder="Optional")

            feedback = gr.Textbox(label="Free-text feedback", lines=4, placeholder="What worked well? What failed? Any specific words or sounds?")

            save_btn = gr.Button("Save Feedback", variant="secondary")
            save_msg = gr.Markdown("")

            # Event wiring: transcription fills the three output widgets.
            btn.click(
                fn=transcribe,
                inputs=[audio, lang],
                outputs=[output, runtime, duration]
            )

            save_btn.click(
                fn=save_feedback,
                inputs=[lang, output, reference, score, feedback, audio, duration, runtime, domain, environment, accent_locale],
                outputs=save_msg
            )

        # --- Tab 2: aggregated metrics from the feedback log ---
        with gr.Tab("Metrics"):
            refresh = gr.Button("Refresh metrics", variant="primary")
            metrics_msg = gr.Markdown()
            per_lang_df = gr.Dataframe(interactive=False, label="Per-language summary (lower WER is better)")
            per_domain_df = gr.Dataframe(interactive=False, label="Per-domain summary")
            logs_df = gr.Dataframe(interactive=False, label="Raw feedback log")

            refresh.click(
                fn=load_metrics,
                inputs=[],
                outputs=[metrics_msg, per_lang_df, per_domain_df, logs_df]
            )

if __name__ == "__main__":
    demo.launch()