# Evaluation-2 / app.py — Hugging Face Space (FarmerlineML)
# Source snapshot from commit b6c35e7 ("Update app.py", verified), 14.2 kB.
# (The "raw / history / blame" lines were Hub web-UI chrome, not code.)
# app.py
import os
import json
import time
import uuid
import gradio as gr
from transformers import pipeline
import numpy as np
import librosa # pip install librosa
# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
from datasets import Dataset, Features, Value, Audio, load_dataset
# Optional but recommended for better jiwer performance
# pip install python-Levenshtein
# jiwer is optional: when it is missing, HAS_JIWER stays False and metric
# computation degrades to all-None results instead of crashing the app.
try:
    from jiwer import cer as jiwer_cer
    from jiwer import compute_measures
    from jiwer import wer as jiwer_wer
except Exception:
    HAS_JIWER = False
else:
    HAS_JIWER = True
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
# Destination dataset repo for feedback rows (overridable via env var).
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
# Write token for the Hub; pushing is only enabled when both the token and a
# target repo are configured.
HF_TOKEN = os.environ.get("HF_TOKEN")
PUSH_TO_HF = bool(HF_TOKEN) and bool(HF_DATASET_REPO)
# Schema for one feedback row pushed to the Hub dataset.
_FEEDBACK_SCHEMA = {
    # Provenance
    "timestamp": Value("string"),
    "session_id": Value("string"),
    "language_display": Value("string"),
    "model_id": Value("string"),
    "model_revision": Value("string"),
    # Audio payload — uploaded only if the user consents
    "audio": Audio(sampling_rate=None),
    "audio_duration_s": Value("float32"),
    "sample_rate": Value("int32"),
    "source": Value("string"),
    "decode_params": Value("string"),
    # Texts
    "transcript_hyp": Value("string"),
    "reference_text": Value("string"),
    "corrected_text": Value("string"),
    # Metrics
    "latency_ms": Value("int32"),
    "rtf": Value("float32"),
    "wer": Value("float32"),
    "cer": Value("float32"),
    "subs": Value("int32"),
    "ins": Value("int32"),
    "dels": Value("int32"),
    # User feedback & consent flags
    "score_out_of_10": Value("int32"),
    "feedback_text": Value("string"),
    "tags": Value("string"),
    "share_publicly": Value("bool"),
}
HF_FEATURES = Features(_FEEDBACK_SCHEMA)
def _push_row_to_hf_dataset(row, audio_file_path):
    """
    Append a single feedback example to the HF dataset repo (train split).

    Parameters
    ----------
    row : dict
        One feedback record; numeric fields are coerced below to match
        HF_FEATURES.
    audio_file_path : str | None
        Path to the user's clip. Included only when provided (i.e. the user
        consented) AND the file actually exists; otherwise 'audio' is None.

    Returns
    -------
    str
        Human-readable status message surfaced in the UI.
    """
    if not PUSH_TO_HF:
        return "HF push disabled (missing HF_TOKEN or repo)."

    # Local import keeps this fix self-contained. concatenate_datasets() is
    # the supported way to append — datasets.Dataset has no .concatenate()
    # instance method.
    from datasets import concatenate_datasets

    example = dict(row)
    # Audio: only include if user consented and file exists
    example["audio"] = (
        audio_file_path
        if (audio_file_path and os.path.isfile(audio_file_path))
        else None
    )

    # Coerce numeric fields to match the declared Features; bad or missing
    # values become None instead of raising.
    def _to_int(v):
        try:
            return int(v)
        except (TypeError, ValueError):
            return None

    def _to_float(v):
        try:
            return float(v)
        except (TypeError, ValueError):
            return None

    for k in ("subs", "ins", "dels", "latency_ms", "score_out_of_10", "sample_rate"):
        example[k] = _to_int(example.get(k))
    for k in ("wer", "cer", "rtf", "audio_duration_s"):
        example[k] = _to_float(example.get(k))

    ds = Dataset.from_list([example], features=HF_FEATURES)

    # Load existing split if present, then append the new row.
    try:
        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
        # BUG FIX: the original called existing.concatenate(ds), which does
        # not exist on datasets.Dataset. The resulting AttributeError fell
        # into the except branch, so every push *replaced* the train split
        # with a single row instead of appending. concatenate_datasets()
        # appends for real; casting first guards against schema drift.
        merged = concatenate_datasets([existing.cast(HF_FEATURES), ds])
    except Exception:
        # First-ever push (no train split yet) or a transient load failure.
        merged = ds
    merged.push_to_hub(
        HF_DATASET_REPO,
        split="train",
        private=True,
        token=HF_TOKEN,
        commit_message="append feedback row"
    )
    return "Pushed to HF Dataset."
# --- EDIT THIS: map display names to your HF Hub model IDs ---
# BUG FIX: "Luganda" appeared twice in this dict; the later entry
# ("FarmerlineML/luganda_fkd") silently overwrote the earlier one
# ("FarmerlineML/w2v-bert-2.0_luganda"). The duplicate is removed and the
# effective (later) model ID kept, so runtime behavior is unchanged.
language_models = {
    "Akan (Asante Twi)": "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
    "Ewe": "FarmerlineML/w2v-bert-2.0_ewe_2",
    "Kiswahili": "FarmerlineML/w2v-bert-2.0_swahili_alpha",
    "Luganda": "FarmerlineML/luganda_fkd",
    "Brazilian Portuguese": "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
    "Fante": "misterkissi/w2v2-lg-xls-r-300m-fante",
    "Bemba": "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
    "Bambara": "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
    "Dagaare": "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
    "Kinyarwanda": "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
    "Fula": "DarliAI/kissi-wav2vec2-fula-fleurs-full",
    "Oromo": "DarliAI/kissi-w2v-bert-2.0-oromo",
    "Runynakore": "misterkissi/w2v2-lg-xls-r-300m-runyankore",
    "Ga": "misterkissi/w2v2-lg-xls-r-300m-ga",
    "Vai": "misterkissi/whisper-small-vai",
    "Kasem": "misterkissi/w2v2-lg-xls-r-300m-kasem",
    "Lingala": "misterkissi/w2v2-lg-xls-r-300m-lingala",
    "Fongbe": "misterkissi/whisper-small-fongbe",
    "Amharic": "misterkissi/w2v2-lg-xls-r-1b-amharic",
    "Xhosa": "misterkissi/w2v2-lg-xls-r-300m-xhosa",
    "Tsonga": "misterkissi/w2v2-lg-xls-r-300m-tsonga",
    # "WOLOF": "misterkissi/w2v2-lg-xls-r-1b-wolof",
    # "HAITIAN CREOLE": "misterkissi/whisper-small-haitian-creole",
    # "KABYLE": "misterkissi/w2v2-lg-xls-r-1b-kabyle",
    "Yoruba": "FarmerlineML/w2v-bert-2.0_yoruba_v1",
    "Luo": "FarmerlineML/w2v-bert-2.0_luo_v2",
    "Somali": "FarmerlineML/w2v-bert-2.0_somali_alpha",
    "Pidgin": "FarmerlineML/pidgin_nigerian",
    "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
    "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
    # "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3"
}
# -------- Lazy-load pipeline cache (Space-safe) --------
# Maps language display name -> loaded HF pipeline.
_PIPELINE_CACHE = {}
# Usage order: most-recently-used key first, least-recently-used last.
_CACHE_ORDER = []
# Max pipelines kept in RAM at once; tune for the Space's memory budget.
_CACHE_MAX_SIZE = 3


def _touch_cache(key):
    """Mark *key* as most recently used (move it to the front of the order list)."""
    try:
        _CACHE_ORDER.remove(key)
    except ValueError:
        pass
    _CACHE_ORDER.insert(0, key)


def _evict_if_needed():
    """Drop least-recently-used pipelines until the cache fits the budget."""
    while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
        stale = _CACHE_ORDER.pop()
        _PIPELINE_CACHE.pop(stale, None)
def get_asr_pipeline(language_display: str):
    """Return the ASR pipeline for *language_display*, loading it on first use.

    Pipelines are kept in an LRU cache (_PIPELINE_CACHE) so switching between
    a few languages doesn't reload models from the Hub each time.
    """
    cached = _PIPELINE_CACHE.get(language_display)
    if cached is not None:
        _touch_cache(language_display)
        return cached
    # Cache miss: download/load from the Hub. device=-1 pins inference to
    # CPU (explicit — Spaces free tier has no GPU).
    asr = pipeline(
        task="automatic-speech-recognition",
        model=language_models[language_display],
        device=-1,
        chunk_length_s=30,
    )
    _PIPELINE_CACHE[language_display] = asr
    _touch_cache(language_display)
    _evict_if_needed()
    return asr
# -------- Helpers --------
def _model_revision_from_pipeline(pipe) -> str:
# Best-effort capture of revision/hash for reproducibility
for attr in ("hub_revision", "revision", "_commit_hash"):
val = getattr(getattr(pipe, "model", None), attr, None)
if val:
return str(val)
try:
return str(getattr(pipe.model.config, "_name_or_path", "unknown"))
except Exception:
return "unknown"
def _compute_metrics(hyp: str, ref_or_corrected: str):
    """Score *hyp* against a reference using jiwer.

    Returns a dict with keys wer/cer/subs/ins/dels. Every value is None when
    jiwer is unavailable, either text is empty, or jiwer itself raises.
    NOTE(review): compute_measures was removed in jiwer 3.x — if the Space
    pins a newer jiwer the import at the top fails and metrics stay None;
    confirm the pinned version.
    """
    blank = {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}
    if not (HAS_JIWER and ref_or_corrected and hyp):
        return blank
    try:
        stats = compute_measures(ref_or_corrected, hyp)
        return {
            "wer": stats.get("wer"),
            "cer": jiwer_cer(ref_or_corrected, hyp),
            "subs": stats.get("substitutions"),
            "ins": stats.get("insertions"),
            "dels": stats.get("deletions"),
        }
    except Exception:
        return dict(blank)
# -------- Inference --------
def transcribe(audio_path: str, language: str):
    """
    Decode one audio file with the selected language's ASR model.

    Audio is loaded via librosa (mp3, wav, flac, m4a, ogg, ...) and downmixed
    to mono at its native sample rate, then fed to the cached pipeline.

    Returns
    -------
    (str, dict | None)
        The transcript, plus a metadata dict consumed by the feedback form
        (None when no audio was supplied).
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip.", None

    waveform, sample_rate = librosa.load(audio_path, sr=None, mono=True)
    duration_s = float(librosa.get_duration(y=waveform, sr=sample_rate))

    asr = get_asr_pipeline(language)
    decode_params = {"chunk_length_s": getattr(asr, "chunk_length_s", 30)}

    started = time.time()
    prediction = asr({"sampling_rate": sample_rate, "raw": waveform})
    elapsed_ms = int((time.time() - started) * 1000.0)

    transcript = prediction.get("text", "")
    # Real-time factor: wall-clock decode time over audio duration, guarded
    # against zero-length clips.
    realtime_factor = (elapsed_ms / 1000.0) / max(duration_s, 1e-9)

    meta = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "session_id": f"anon-{uuid.uuid4()}",
        "language_display": language,
        "model_id": language_models.get(language, "unknown"),
        "model_revision": _model_revision_from_pipeline(asr),
        "audio_duration_s": duration_s,
        "sample_rate": sample_rate,
        # NOTE(review): hard-coded even for microphone recordings — confirm
        # whether mic input should be tagged differently.
        "source": "upload",
        "decode_params": json.dumps(decode_params),
        "transcript_hyp": transcript,
        "latency_ms": elapsed_ms,
        "rtf": realtime_factor,
    }
    return transcript, meta
# -------- Feedback submit --------
def submit_feedback(meta, reference_text, corrected_text, score, feedback_text,
                    tags, store_audio, share_publicly, audio_file_path):
    """
    Score the hypothesis against reference/corrected text (when jiwer allows)
    and push one row to the HF dataset immediately. No local CSV/audio writes.

    Returns a dict rendered in the UI's JSON panel: status plus the metrics
    and model identity of the row that was pushed.
    """
    if not meta:
        return {"status": "No transcription metadata available. Please transcribe first."}

    corrected_text = (corrected_text or "").strip()
    # Prefer the explicit reference; fall back to the corrected transcript.
    ref_for_metrics = (reference_text or "").strip() or corrected_text
    metrics = _compute_metrics(meta.get("transcript_hyp", ""), ref_for_metrics)

    row = dict(meta)
    row["reference_text"] = reference_text or ""
    row["corrected_text"] = corrected_text or ""
    for metric_key in ("wer", "cer", "subs", "ins", "dels"):
        row[metric_key] = metrics[metric_key]
    row["score_out_of_10"] = None if score is None else int(score)
    row["feedback_text"] = feedback_text or ""
    row["tags"] = json.dumps({"labels": tags or []})
    row["share_publicly"] = bool(share_publicly)

    try:
        # Forward Gradio's temp upload path only when the user consented.
        hf_status = _push_row_to_hf_dataset(row, audio_file_path if store_audio else None)
        status = f"Feedback saved. {hf_status}"
    except Exception as e:
        status = f"Failed to push to HF Dataset: {e}"

    summary = {"status": status}
    for key in ("wer", "cer", "subs", "ins", "dels", "latency_ms", "rtf",
                "model_id", "model_revision"):
        summary[key] = row[key]
    return summary
# -------- UI (original preserved; additions appended) --------
with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
    # Header / usage instructions shown at the top of the Space.
    gr.Markdown(
        """
## 🎙️ Multilingual Speech-to-Text
Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
Then choose the language/model and hit **Transcribe**.
"""
    )
    with gr.Row():
        # Language picker; choices come straight from the language_models map.
        lang = gr.Dropdown(
            choices=list(language_models.keys()),
            value=list(language_models.keys())[0],
            label="Select Language / Model"
        )
    with gr.Row():
        # type="filepath": Gradio hands transcribe() a temp file path, which
        # is then loaded via librosa.
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or Record Audio"
        )
    btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcription")
    # Hidden state to carry metadata from transcribe -> feedback
    meta_state = gr.State(value=None)
    # Keep original behavior: output shows transcript
    # Also capture meta into the hidden state
    def _transcribe_and_store(audio_path, language):
        """Run transcribe() and fan results out to three outputs: the
        transcript textbox, the hidden meta state, and the corrected-transcript
        textbox (pre-filled with the hypothesis for easy edits)."""
        hyp, meta = transcribe(audio_path, language)
        # Pre-fill corrected with hypothesis for easy edits
        return hyp, meta, hyp
    # --- Evaluation & Feedback (no style changes) ---
    with gr.Accordion("Evaluation & Feedback", open=False):
        with gr.Row():
            reference_tb = gr.Textbox(label="Reference text (optional)", lines=4, value="")
        with gr.Row():
            corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
        with gr.Row():
            score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
        with gr.Row():
            feedback_tb = gr.Textbox(label="Feedback (what went right/wrong?)", lines=3, value="")
        with gr.Row():
            tags_cb = gr.CheckboxGroup(
                ["noisy", "far-field", "code-switching", "numbers-heavy", "named-entities", "read-speech", "spontaneous", "call-center", "voicenote"],
                label="Slice tags (select any that apply)"
            )
        with gr.Row():
            # Consent flags: audio only reaches the dataset when
            # store_audio_cb is ticked (enforced in submit_feedback).
            store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
            share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)
        submit_btn = gr.Button("Submit Feedback / Compute Metrics")
        results_json = gr.JSON(label="Metrics & Status")
    # Wire events
    btn.click(
        fn=_transcribe_and_store,
        inputs=[audio, lang],
        outputs=[output, meta_state, corrected_tb]
    )
    submit_btn.click(
        fn=submit_feedback,
        inputs=[
            meta_state,
            reference_tb,
            corrected_tb,
            score_slider,
            feedback_tb,
            tags_cb,
            store_audio_cb,
            share_cb,
            audio  # raw file path from gr.Audio
        ],
        outputs=results_json
    )
# Keep Spaces stable under load
if __name__ == "__main__":
    # queue() serializes/limits concurrent requests so simultaneous users
    # don't exhaust the Space's CPU/RAM.
    demo.queue()
    demo.launch()