# app.py
import os
import json
import time
import uuid

import gradio as gr
from transformers import pipeline
import numpy as np
import librosa  # pip install librosa
# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
from datasets import Dataset, Features, Value, Audio, load_dataset, concatenate_datasets

# jiwer is optional but required for WER/CER metrics; python-Levenshtein is
# recommended for better jiwer performance:
# pip install jiwer python-Levenshtein
try:
    from jiwer import compute_measures, cer as jiwer_cer
    HAS_JIWER = True
except Exception:
    HAS_JIWER = False
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
HF_TOKEN = os.environ.get("HF_TOKEN")
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)

HF_FEATURES = Features({
    "timestamp": Value("string"),
    "session_id": Value("string"),
    "language_display": Value("string"),
    "model_id": Value("string"),
    "model_revision": Value("string"),
    "audio": Audio(sampling_rate=None),  # uploaded only if the user consents
    "audio_duration_s": Value("float32"),
    "sample_rate": Value("int32"),
    "source": Value("string"),
    "decode_params": Value("string"),
    "transcript_hyp": Value("string"),
    "reference_text": Value("string"),
    "corrected_text": Value("string"),
    "latency_ms": Value("int32"),
    "rtf": Value("float32"),
    "wer": Value("float32"),
    "cer": Value("float32"),
    "subs": Value("int32"),
    "ins": Value("int32"),
    "dels": Value("int32"),
    "score_out_of_10": Value("int32"),
    "feedback_text": Value("string"),
    "tags": Value("string"),
    "share_publicly": Value("bool"),
})
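
# Illustrative sketch of one row under this schema (field values are made up,
# not taken from real logs):
#   {
#       "timestamp": "2025-01-01T00:00:00Z",
#       "session_id": "anon-<uuid4>",
#       "language_display": "Ewe",
#       "audio": "/tmp/gradio/clip.wav",  # None when the user withholds consent
#       "wer": 0.25, "cer": 0.11, "subs": 2, "ins": 0, "dels": 1,
#       ...
#   }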
def _push_row_to_hf_dataset(row, audio_file_path):
    """
    Append a single example to the HF dataset repo (train split).
    If the user didn't consent or there is no audio path, the 'audio' field is None.
    """
    if not PUSH_TO_HF:
        return "HF push disabled (missing HF_TOKEN or repo)."
    example = dict(row)
    # Audio: only include it if the user consented and the file exists
    example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None

    # Normalize types so they match HF_FEATURES
    def _to_int(v):
        try:
            return int(v)
        except Exception:
            return None

    def _to_float(v):
        try:
            return float(v)
        except Exception:
            return None

    for k in ["subs", "ins", "dels", "latency_ms", "score_out_of_10", "sample_rate"]:
        example[k] = _to_int(example.get(k))
    for k in ["wer", "cer", "rtf", "audio_duration_s"]:
        example[k] = _to_float(example.get(k))

    ds = Dataset.from_list([example], features=HF_FEATURES)
    # Load the existing split if present, then append
    try:
        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
        merged = concatenate_datasets([existing, ds])
    except Exception:
        merged = ds
    merged.push_to_hub(
        HF_DATASET_REPO,
        split="train",
        private=True,
        token=HF_TOKEN,
        commit_message="append feedback row",
    )
    return "Pushed to HF Dataset."
# --- EDIT THIS: map display names to your HF Hub model IDs ---
language_models = {
    "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
    "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
    "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
    # "Luganda": "FarmerlineML/w2v-bert-2.0_luganda",  # superseded by FarmerlineML/luganda_fkd below; a duplicate key here would be silently overwritten
    "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
    "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
    "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
    "Bambara": "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
    "Dagaare": "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
    "Kinyarwanda": "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
    "Fula": "DarliAI/kissi-wav2vec2-fula-fleurs-full",
    "Oromo": "DarliAI/kissi-w2v-bert-2.0-oromo",
    "Runyankore": "misterkissi/w2v2-lg-xls-r-300m-runyankore",
    "Ga": "misterkissi/w2v2-lg-xls-r-300m-ga",
    "Vai": "misterkissi/whisper-small-vai",
    "Kasem": "misterkissi/w2v2-lg-xls-r-300m-kasem",
    "Lingala": "misterkissi/w2v2-lg-xls-r-300m-lingala",
    "Fongbe": "misterkissi/whisper-small-fongbe",
    "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
    "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
    "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
    # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
    # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
    # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
    "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
    "Luganda": "FarmerlineML/luganda_fkd",
    "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
    "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
    "Pidgin": "FarmerlineML/pidgin_nigerian",
    "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
    "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
    # "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
}
# -------- Lazy-load pipeline cache (Space-safe) --------
_PIPELINE_CACHE = {}
_CACHE_ORDER = []  # most recently used first
_CACHE_MAX_SIZE = 3  # tune for available RAM


def _touch_cache(key):
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.insert(0, key)


def _evict_if_needed():
    while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
        oldest = _CACHE_ORDER.pop()  # least recently used
        try:
            del _PIPELINE_CACHE[oldest]
        except KeyError:
            pass
def get_asr_pipeline(language_display: str):
    if language_display in _PIPELINE_CACHE:
        _touch_cache(language_display)
        return _PIPELINE_CACHE[language_display]
    model_id = language_models[language_display]
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        device=-1,  # CPU on Spaces (explicit)
        chunk_length_s=30,
    )
    _PIPELINE_CACHE[language_display] = pipe
    _touch_cache(language_display)
    _evict_if_needed()
    return pipe
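
# Usage sketch: repeated calls for the same language hit the in-process cache, so
# model weights are downloaded and loaded once per language until evicted.
#   pipe = get_asr_pipeline("Ewe")  # first call loads from the Hub
#   pipe = get_asr_pipeline("Ewe")  # second call is served from _PIPELINE_CACHE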
# -------- Helpers --------
def _model_revision_from_pipeline(pipe) -> str:
    # Best-effort capture of the revision/commit hash for reproducibility
    for attr in ("hub_revision", "revision", "_commit_hash"):
        val = getattr(getattr(pipe, "model", None), attr, None)
        if val:
            return str(val)
    try:
        return str(getattr(pipe.model.config, "_name_or_path", "unknown"))
    except Exception:
        return "unknown"


def _compute_metrics(hyp: str, ref_or_corrected: str):
    if not HAS_JIWER or not ref_or_corrected or not hyp:
        return {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}
    try:
        measures = compute_measures(ref_or_corrected, hyp)
        return {
            "wer": measures.get("wer"),
            "cer": jiwer_cer(ref_or_corrected, hyp),
            "subs": measures.get("substitutions"),
            "ins": measures.get("insertions"),
            "dels": measures.get("deletions"),
        }
    except Exception:
        return {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}
# -------- Inference --------
def transcribe(audio_path: str, language: str):
    """
    Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
    convert it to mono, then run it through the chosen ASR pipeline.
    Returns the transcript (unchanged behavior) and a meta dict for feedback.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip.", None
    speech, sr = librosa.load(audio_path, sr=None, mono=True)
    duration_s = float(librosa.get_duration(y=speech, sr=sr))
    pipe = get_asr_pipeline(language)
    decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}

    t0 = time.time()
    result = pipe({"sampling_rate": sr, "raw": speech})
    latency_ms = int((time.time() - t0) * 1000.0)

    hyp_text = result.get("text", "")
    rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
    meta = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "session_id": f"anon-{uuid.uuid4()}",
        "language_display": language,
        "model_id": language_models.get(language, "unknown"),
        "model_revision": _model_revision_from_pipeline(pipe),
        "audio_duration_s": duration_s,
        "sample_rate": sr,
        "source": "upload",
        "decode_params": json.dumps(decode_params),
        "transcript_hyp": hyp_text,
        "latency_ms": latency_ms,
        "rtf": rtf,
    }
    return hyp_text, meta
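
# RTF (real-time factor) = processing time / audio duration, so RTF < 1 means the
# model transcribes faster than real time. Minimal local sanity check, assuming a
# hypothetical clip sample.wav sits next to app.py:
#   text, meta = transcribe("sample.wav", "Ewe")
#   print(text, meta["latency_ms"], meta["rtf"])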
# -------- Feedback submit --------
def submit_feedback(meta, reference_text, corrected_text, score, feedback_text,
                    tags, store_audio, share_publicly, audio_file_path):
    """
    Compute metrics (if possible) and push a row to the HF Dataset immediately.
    No local CSV/audio writes.
    """
    if not meta:
        return {"status": "No transcription metadata available. Please transcribe first."}
    ref_for_metrics = (reference_text or "").strip()
    corrected_text = (corrected_text or "").strip()
    # Fall back to the corrected transcript as the reference when none is given
    if not ref_for_metrics and corrected_text:
        ref_for_metrics = corrected_text
    metrics = _compute_metrics(meta.get("transcript_hyp", ""), ref_for_metrics)

    row = dict(meta)
    row.update({
        "reference_text": reference_text or "",
        "corrected_text": corrected_text or "",
        "wer": metrics["wer"],
        "cer": metrics["cer"],
        "subs": metrics["subs"],
        "ins": metrics["ins"],
        "dels": metrics["dels"],
        "score_out_of_10": int(score) if score is not None else None,
        "feedback_text": feedback_text or "",
        "tags": json.dumps({"labels": tags or []}),
        "share_publicly": bool(share_publicly),
    })
    try:
        # Use the temporary upload path from Gradio only if the user consented
        audio_to_push = audio_file_path if store_audio else None
        hf_status = _push_row_to_hf_dataset(row, audio_to_push)
        status = f"Feedback saved. {hf_status}"
    except Exception as e:
        status = f"Failed to push to HF Dataset: {e}"
    return {
        "status": status,
        "wer": row["wer"],
        "cer": row["cer"],
        "subs": row["subs"],
        "ins": row["ins"],
        "dels": row["dels"],
        "latency_ms": row["latency_ms"],
        "rtf": row["rtf"],
        "model_id": row["model_id"],
        "model_revision": row["model_revision"],
    }
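
# Note: if jiwer is not installed, or neither a reference nor a correction is
# provided, the metric fields stay None and render as null in the results JSON.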
# -------- UI (original preserved; additions appended) --------
with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text

        Upload an audio file (MP3, WAV, FLAC, M4A, OGG, …) or record via your microphone.
        Then choose the language/model and hit **Transcribe**.
        """
    )
    with gr.Row():
        lang = gr.Dropdown(
            choices=list(language_models.keys()),
            value=list(language_models.keys())[0],
            label="Select Language / Model",
        )
    with gr.Row():
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or Record Audio",
        )
    btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcription")

    # Hidden state to carry metadata from transcribe -> feedback
    meta_state = gr.State(value=None)

    # Keep original behavior: `output` shows the transcript.
    # Also capture meta into the hidden state.
    def _transcribe_and_store(audio_path, language):
        hyp, meta = transcribe(audio_path, language)
        # Pre-fill the corrected-text box with the hypothesis for easy edits
        return hyp, meta, hyp

    # --- Evaluation & Feedback (no style changes) ---
    with gr.Accordion("Evaluation & Feedback", open=False):
        with gr.Row():
            reference_tb = gr.Textbox(label="Reference text (optional)", lines=4, value="")
        with gr.Row():
            corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
        with gr.Row():
            score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
        with gr.Row():
            feedback_tb = gr.Textbox(label="Feedback (what went right/wrong?)", lines=3, value="")
        with gr.Row():
            tags_cb = gr.CheckboxGroup(
                ["noisy", "far-field", "code-switching", "numbers-heavy", "named-entities",
                 "read-speech", "spontaneous", "call-center", "voicenote"],
                label="Slice tags (select any that apply)",
            )
        with gr.Row():
            store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
            share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)
        submit_btn = gr.Button("Submit Feedback / Compute Metrics")
        results_json = gr.JSON(label="Metrics & Status")

    # Wire events
    btn.click(
        fn=_transcribe_and_store,
        inputs=[audio, lang],
        outputs=[output, meta_state, corrected_tb],
    )
    submit_btn.click(
        fn=submit_feedback,
        inputs=[
            meta_state,
            reference_tb,
            corrected_tb,
            score_slider,
            feedback_tb,
            tags_cb,
            store_audio_cb,
            share_cb,
            audio,  # raw file path from gr.Audio
        ],
        outputs=results_json,
    )
# Keep Spaces stable under load
if __name__ == "__main__":
    demo.queue()
    demo.launch()