FarmerlineML committed on
Commit
b6c35e7
·
verified ·
1 Parent(s): dc9e10c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -109
app.py CHANGED
@@ -1,7 +1,6 @@
1
  # app.py
2
 
3
  import os
4
- import csv
5
  import json
6
  import time
7
  import uuid
@@ -10,6 +9,9 @@ from transformers import pipeline
10
  import numpy as np
11
  import librosa # pip install librosa
12
 
 
 
 
13
  # Optional but recommended for better jiwer performance
14
  # pip install python-Levenshtein
15
  try:
@@ -18,12 +20,90 @@ try:
18
  except Exception:
19
  HAS_JIWER = False
20
 
21
- # -------- CONFIG: storage paths (Space-friendly) --------
22
- DATA_DIR = "/home/user/data"
23
- AUDIO_DIR = os.path.join(DATA_DIR, "audio")
24
- LOG_CSV = os.path.join(DATA_DIR, "logs.csv")
25
- os.makedirs(DATA_DIR, exist_ok=True)
26
- os.makedirs(AUDIO_DIR, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # --- EDIT THIS: map display names to your HF Hub model IDs ---
29
  language_models = {
@@ -58,14 +138,13 @@ language_models = {
58
  "Pidgin": "FarmerlineML/pidgin_nigerian",
59
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
60
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
61
- "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
62
  }
63
 
64
  # -------- Lazy-load pipeline cache (Space-safe) --------
65
- # Small LRU-style cache to avoid loading all models into RAM
66
  _PIPELINE_CACHE = {}
67
- _CACHE_ORDER = [] # keeps track of usage order
68
- _CACHE_MAX_SIZE = 3 # adjust if you have more RAM
69
 
70
  def _touch_cache(key):
71
  if key in _CACHE_ORDER:
@@ -74,7 +153,7 @@ def _touch_cache(key):
74
 
75
  def _evict_if_needed():
76
  while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
77
- oldest = _CACHE_ORDER.pop() # least-recently used
78
  try:
79
  del _PIPELINE_CACHE[oldest]
80
  except KeyError:
@@ -88,7 +167,7 @@ def get_asr_pipeline(language_display: str):
88
  pipe = pipeline(
89
  task="automatic-speech-recognition",
90
  model=model_id,
91
- device=-1, # force CPU usage on Spaces CPU
92
  chunk_length_s=30
93
  )
94
  _PIPELINE_CACHE[language_display] = pipe
@@ -103,43 +182,14 @@ def _model_revision_from_pipeline(pipe) -> str:
103
  val = getattr(getattr(pipe, "model", None), attr, None)
104
  if val:
105
  return str(val)
106
- # Fallback to config name_or_path or unknown
107
  try:
108
  return str(getattr(pipe.model.config, "_name_or_path", "unknown"))
109
  except Exception:
110
  return "unknown"
111
 
112
- def _append_log_row(row: dict):
113
- field_order = [
114
- "timestamp", "session_id",
115
- "language_display", "model_id", "model_revision",
116
- "audio_duration_s", "sample_rate", "source",
117
- "decode_params",
118
- "transcript_hyp",
119
- "reference_text", "corrected_text",
120
- "latency_ms", "rtf",
121
- "wer", "cer",
122
- "subs", "ins", "dels",
123
- "score_out_of_10", "feedback_text",
124
- "tags",
125
- "store_audio", "audio_path"
126
- ]
127
- file_exists = os.path.isfile(LOG_CSV)
128
- with open(LOG_CSV, "a", newline="", encoding="utf-8") as f:
129
- writer = csv.DictWriter(f, fieldnames=field_order)
130
- if not file_exists:
131
- writer.writeheader()
132
- # Ensure all fields exist
133
- for k in field_order:
134
- row.setdefault(k, "")
135
- writer.writerow(row)
136
-
137
  def _compute_metrics(hyp: str, ref_or_corrected: str):
138
  if not HAS_JIWER or not ref_or_corrected or not hyp:
139
- return {
140
- "wer": None, "cer": None,
141
- "subs": None, "ins": None, "dels": None
142
- }
143
  try:
144
  measures = compute_measures(ref_or_corrected, hyp)
145
  return {
@@ -150,24 +200,18 @@ def _compute_metrics(hyp: str, ref_or_corrected: str):
150
  "dels": measures.get("deletions"),
151
  }
152
  except Exception:
153
- # Be resilient if jiwer errors on edge cases
154
- return {
155
- "wer": None, "cer": None,
156
- "subs": None, "ins": None, "dels": None
157
- }
158
 
159
  # -------- Inference --------
160
  def transcribe(audio_path: str, language: str):
161
  """
162
  Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
163
  convert to mono, then run it through the chosen ASR pipeline.
164
- Returns only the transcript (to keep existing behavior),
165
- while metadata is stored in a hidden state for the feedback step.
166
  """
167
  if not audio_path:
168
  return "⚠️ Please upload or record an audio clip.", None
169
 
170
- # librosa.load returns a 1D np.ndarray (mono) and the sample rate
171
  speech, sr = librosa.load(audio_path, sr=None, mono=True)
172
  duration_s = float(librosa.get_duration(y=speech, sr=sr))
173
 
@@ -181,7 +225,6 @@ def transcribe(audio_path: str, language: str):
181
 
182
  rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
183
 
184
- # Prepare metadata for the feedback logger
185
  meta = {
186
  "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
187
  "session_id": f"anon-{uuid.uuid4()}",
@@ -190,90 +233,61 @@ def transcribe(audio_path: str, language: str):
190
  "model_revision": _model_revision_from_pipeline(pipe),
191
  "audio_duration_s": duration_s,
192
  "sample_rate": sr,
193
- "source": "upload", # gr.Audio combines both; we don't distinguish here
194
  "decode_params": json.dumps(decode_params),
195
  "transcript_hyp": hyp_text,
196
  "latency_ms": latency_ms,
197
  "rtf": rtf,
198
- # Placeholders to be filled on feedback submit
199
- "reference_text": "",
200
- "corrected_text": "",
201
- "wer": "",
202
- "cer": "",
203
- "subs": "",
204
- "ins": "",
205
- "dels": "",
206
- "score_out_of_10": "",
207
- "feedback_text": "",
208
- "tags": "",
209
- "store_audio": False,
210
- "audio_path": ""
211
  }
212
-
213
  return hyp_text, meta
214
 
215
  # -------- Feedback submit --------
216
  def submit_feedback(meta, reference_text, corrected_text, score, feedback_text,
217
  tags, store_audio, share_publicly, audio_file_path):
218
  """
219
- Compute metrics (if possible), optionally store audio (consented),
220
- and append a row to CSV. Returns a compact dict for display.
221
  """
222
  if not meta:
223
  return {"status": "No transcription metadata available. Please transcribe first."}
224
 
225
- # Choose text to compare against hyp: prefer explicit reference, else corrected
226
- ref_for_metrics = reference_text.strip() if reference_text else ""
227
- corrected_text = corrected_text.strip() if corrected_text else ""
228
  if not ref_for_metrics and corrected_text:
229
  ref_for_metrics = corrected_text
230
 
231
  metrics = _compute_metrics(meta.get("transcript_hyp", ""), ref_for_metrics)
232
 
233
- # Handle audio storage (optional, consented)
234
- stored_path = ""
235
- if store_audio and audio_file_path:
236
- try:
237
- # Copy the original file to AUDIO_DIR with a random name
238
- ext = os.path.splitext(audio_file_path)[1] or ".wav"
239
- stored_path = os.path.join(AUDIO_DIR, f"{uuid.uuid4()}{ext}")
240
- # Simple byte copy
241
- with open(audio_file_path, "rb") as src, open(stored_path, "wb") as dst:
242
- dst.write(src.read())
243
- except Exception:
244
- stored_path = ""
245
-
246
- # Build log row
247
- row = dict(meta) # start from recorded meta
248
  row.update({
249
  "reference_text": reference_text or "",
250
  "corrected_text": corrected_text or "",
251
- "wer": metrics["wer"] if metrics["wer"] is not None else "",
252
- "cer": metrics["cer"] if metrics["cer"] is not None else "",
253
- "subs": metrics["subs"] if metrics["subs"] is not None else "",
254
- "ins": metrics["ins"] if metrics["ins"] is not None else "",
255
- "dels": metrics["dels"] if metrics["dels"] is not None else "",
256
- "score_out_of_10": score if score is not None else "",
257
  "feedback_text": feedback_text or "",
258
- "tags": json.dumps({"labels": tags or [], "share_publicly": bool(share_publicly)}),
259
- "store_audio": bool(store_audio),
260
- "audio_path": stored_path
261
  })
262
 
263
  try:
264
- _append_log_row(row)
265
- status = "Feedback saved."
 
 
266
  except Exception as e:
267
- status = f"Failed to save feedback: {e}"
268
 
269
- # Compact result to show back to user
270
  return {
271
  "status": status,
272
- "wer": row["wer"] if row["wer"] != "" else None,
273
- "cer": row["cer"] if row["cer"] != "" else None,
274
- "subs": row["subs"] if row["subs"] != "" else None,
275
- "ins": row["ins"] if row["ins"] != "" else None,
276
- "dels": row["dels"] if row["dels"] != "" else None,
277
  "latency_ms": row["latency_ms"],
278
  "rtf": row["rtf"],
279
  "model_id": row["model_id"],
@@ -314,10 +328,10 @@ with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
314
  # Also capture meta into the hidden state
315
  def _transcribe_and_store(audio_path, language):
316
  hyp, meta = transcribe(audio_path, language)
317
- # For convenience, populate corrected_text with the hyp by default
318
  return hyp, meta, hyp
319
 
320
- # --- Evaluation & Feedback (appended UI, no style/font changes) ---
321
  with gr.Accordion("Evaluation & Feedback", open=False):
322
  with gr.Row():
323
  reference_tb = gr.Textbox(label="Reference text (optional)", lines=4, value="")
@@ -362,7 +376,7 @@ with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
362
  outputs=results_json
363
  )
364
 
365
- # Use a queue to keep Spaces stable under load
366
  if __name__ == "__main__":
367
- demo.queue() # enable_queue=True by default in recent Gradio
368
  demo.launch()
 
1
  # app.py
2
 
3
  import os
 
4
  import json
5
  import time
6
  import uuid
 
9
  import numpy as np
10
  import librosa # pip install librosa
11
 
12
+ # --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
13
+ from datasets import Dataset, Features, Value, Audio, load_dataset
14
+
15
  # Optional but recommended for better jiwer performance
16
  # pip install python-Levenshtein
17
  try:
 
20
  except Exception:
21
  HAS_JIWER = False
22
 
23
+ # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
24
+ HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
25
+ HF_TOKEN = os.environ.get("HF_TOKEN")
26
+ PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
27
+
28
+ HF_FEATURES = Features({
29
+ "timestamp": Value("string"),
30
+ "session_id": Value("string"),
31
+ "language_display": Value("string"),
32
+ "model_id": Value("string"),
33
+ "model_revision": Value("string"),
34
+
35
+ "audio": Audio(sampling_rate=None), # uploaded only if user consents
36
+ "audio_duration_s": Value("float32"),
37
+ "sample_rate": Value("int32"),
38
+ "source": Value("string"),
39
+ "decode_params": Value("string"),
40
+
41
+ "transcript_hyp": Value("string"),
42
+ "reference_text": Value("string"),
43
+ "corrected_text": Value("string"),
44
+
45
+ "latency_ms": Value("int32"),
46
+ "rtf": Value("float32"),
47
+
48
+ "wer": Value("float32"),
49
+ "cer": Value("float32"),
50
+ "subs": Value("int32"),
51
+ "ins": Value("int32"),
52
+ "dels": Value("int32"),
53
+
54
+ "score_out_of_10": Value("int32"),
55
+ "feedback_text": Value("string"),
56
+ "tags": Value("string"),
57
+ "share_publicly": Value("bool"),
58
+ })
59
+
60
def _push_row_to_hf_dataset(row, audio_file_path):
    """
    Append a single feedback example to the HF dataset repo (train split).

    Args:
        row: dict of feedback/metadata fields; keys are expected to match
            HF_FEATURES (the "audio" key is filled in here).
        audio_file_path: path to the audio clip to upload, or None. The clip
            is attached only when the path is set and the file exists;
            otherwise the 'audio' field is None.

    Returns:
        A short human-readable status string.

    Raises:
        Any exception from loading, merging, or pushing the dataset other
        than "no existing data" (FileNotFoundError) is propagated so the
        caller can report the failure instead of silently losing rows.
    """
    # Function-scope import so this block does not depend on edits to the
    # file's top-level `from datasets import ...` line.
    from datasets import concatenate_datasets

    if not PUSH_TO_HF:
        return "HF push disabled (missing HF_TOKEN or repo)."

    example = dict(row)

    # Audio: only include if a path was provided and the file exists
    has_audio = bool(audio_file_path) and os.path.isfile(audio_file_path)
    example["audio"] = audio_file_path if has_audio else None

    # Normalize types: anything that cannot be converted becomes None
    def _to_int(v):
        try:
            return int(v)
        except (TypeError, ValueError, OverflowError):
            return None

    def _to_float(v):
        try:
            return float(v)
        except (TypeError, ValueError, OverflowError):
            return None

    for k in ["subs", "ins", "dels", "latency_ms", "score_out_of_10", "sample_rate"]:
        example[k] = _to_int(example.get(k))
    for k in ["wer", "cer", "rtf", "audio_duration_s"]:
        example[k] = _to_float(example.get(k))

    ds = Dataset.from_list([example], features=HF_FEATURES)

    # Load existing split if present, then append.
    # Only a missing/empty repo falls back to a fresh dataset. Any other
    # failure (network, auth, ...) must propagate: falling back in that case
    # would push a one-row dataset over the existing log and erase it.
    try:
        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
    except FileNotFoundError:
        merged = ds
    else:
        # Dataset objects have no `.concatenate()` method; the module-level
        # helper is the supported way to append rows.
        merged = concatenate_datasets([existing, ds])

    merged.push_to_hub(
        HF_DATASET_REPO,
        split="train",
        private=True,
        token=HF_TOKEN,
        commit_message="append feedback row"
    )
    return "Pushed to HF Dataset."
107
 
108
  # --- EDIT THIS: map display names to your HF Hub model IDs ---
109
  language_models = {
 
138
  "Pidgin": "FarmerlineML/pidgin_nigerian",
139
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
140
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
141
+ #"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
142
  }
143
 
144
  # -------- Lazy-load pipeline cache (Space-safe) --------
 
145
  _PIPELINE_CACHE = {}
146
+ _CACHE_ORDER = [] # usage order
147
+ _CACHE_MAX_SIZE = 3 # tune for RAM
148
 
149
  def _touch_cache(key):
150
  if key in _CACHE_ORDER:
 
153
 
154
  def _evict_if_needed():
155
  while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
156
+ oldest = _CACHE_ORDER.pop()
157
  try:
158
  del _PIPELINE_CACHE[oldest]
159
  except KeyError:
 
167
  pipe = pipeline(
168
  task="automatic-speech-recognition",
169
  model=model_id,
170
+ device=-1, # CPU on Spaces (explicit)
171
  chunk_length_s=30
172
  )
173
  _PIPELINE_CACHE[language_display] = pipe
 
182
  val = getattr(getattr(pipe, "model", None), attr, None)
183
  if val:
184
  return str(val)
 
185
  try:
186
  return str(getattr(pipe.model.config, "_name_or_path", "unknown"))
187
  except Exception:
188
  return "unknown"
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  def _compute_metrics(hyp: str, ref_or_corrected: str):
191
  if not HAS_JIWER or not ref_or_corrected or not hyp:
192
+ return {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}
 
 
 
193
  try:
194
  measures = compute_measures(ref_or_corrected, hyp)
195
  return {
 
200
  "dels": measures.get("deletions"),
201
  }
202
  except Exception:
203
+ return {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}
 
 
 
 
204
 
205
  # -------- Inference --------
206
  def transcribe(audio_path: str, language: str):
207
  """
208
  Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
209
  convert to mono, then run it through the chosen ASR pipeline.
210
+ Returns transcript (unchanged behavior) and a meta dict for feedback.
 
211
  """
212
  if not audio_path:
213
  return "⚠️ Please upload or record an audio clip.", None
214
 
 
215
  speech, sr = librosa.load(audio_path, sr=None, mono=True)
216
  duration_s = float(librosa.get_duration(y=speech, sr=sr))
217
 
 
225
 
226
  rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
227
 
 
228
  meta = {
229
  "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
230
  "session_id": f"anon-{uuid.uuid4()}",
 
233
  "model_revision": _model_revision_from_pipeline(pipe),
234
  "audio_duration_s": duration_s,
235
  "sample_rate": sr,
236
+ "source": "upload",
237
  "decode_params": json.dumps(decode_params),
238
  "transcript_hyp": hyp_text,
239
  "latency_ms": latency_ms,
240
  "rtf": rtf,
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  }
 
242
  return hyp_text, meta
243
 
244
  # -------- Feedback submit --------
245
  def submit_feedback(meta, reference_text, corrected_text, score, feedback_text,
246
  tags, store_audio, share_publicly, audio_file_path):
247
  """
248
+ Compute metrics (if possible) and push a row to HF Dataset immediately.
249
+ No local CSV/audio writes.
250
  """
251
  if not meta:
252
  return {"status": "No transcription metadata available. Please transcribe first."}
253
 
254
+ ref_for_metrics = (reference_text or "").strip()
255
+ corrected_text = (corrected_text or "").strip()
 
256
  if not ref_for_metrics and corrected_text:
257
  ref_for_metrics = corrected_text
258
 
259
  metrics = _compute_metrics(meta.get("transcript_hyp", ""), ref_for_metrics)
260
 
261
+ row = dict(meta)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  row.update({
263
  "reference_text": reference_text or "",
264
  "corrected_text": corrected_text or "",
265
+ "wer": metrics["wer"],
266
+ "cer": metrics["cer"],
267
+ "subs": metrics["subs"],
268
+ "ins": metrics["ins"],
269
+ "dels": metrics["dels"],
270
+ "score_out_of_10": int(score) if score is not None else None,
271
  "feedback_text": feedback_text or "",
272
+ "tags": json.dumps({"labels": tags or []}),
273
+ "share_publicly": bool(share_publicly),
 
274
  })
275
 
276
  try:
277
+ # Use the temporary upload path from Gradio iff the user consented
278
+ audio_to_push = audio_file_path if store_audio else None
279
+ hf_status = _push_row_to_hf_dataset(row, audio_to_push)
280
+ status = f"Feedback saved. {hf_status}"
281
  except Exception as e:
282
+ status = f"Failed to push to HF Dataset: {e}"
283
 
 
284
  return {
285
  "status": status,
286
+ "wer": row["wer"],
287
+ "cer": row["cer"],
288
+ "subs": row["subs"],
289
+ "ins": row["ins"],
290
+ "dels": row["dels"],
291
  "latency_ms": row["latency_ms"],
292
  "rtf": row["rtf"],
293
  "model_id": row["model_id"],
 
328
  # Also capture meta into the hidden state
329
  def _transcribe_and_store(audio_path, language):
330
  hyp, meta = transcribe(audio_path, language)
331
+ # Pre-fill corrected with hypothesis for easy edits
332
  return hyp, meta, hyp
333
 
334
+ # --- Evaluation & Feedback (no style changes) ---
335
  with gr.Accordion("Evaluation & Feedback", open=False):
336
  with gr.Row():
337
  reference_tb = gr.Textbox(label="Reference text (optional)", lines=4, value="")
 
376
  outputs=results_json
377
  )
378
 
379
+ # Keep Spaces stable under load
380
  if __name__ == "__main__":
381
+ demo.queue()
382
  demo.launch()