# app.py

import os
import json
import time
import uuid
import gradio as gr
from transformers import pipeline
import numpy as np
import librosa  # pip install librosa

# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
from datasets import Dataset, Features, Value, Audio, load_dataset, concatenate_datasets

# jiwer is optional; metrics are simply skipped when it is unavailable.
# NOTE: compute_measures comes from older jiwer releases (newer versions expose
# process_words instead); if the import fails, metrics are disabled gracefully.
try:
    from jiwer import compute_measures, cer as jiwer_cer
    HAS_JIWER = True
except Exception:
    HAS_JIWER = False

# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
HF_TOKEN = os.environ.get("HF_TOKEN")
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)

HF_FEATURES = Features({
    "timestamp":        Value("string"),
    "session_id":       Value("string"),
    "language_display": Value("string"),
    "model_id":         Value("string"),
    "model_revision":   Value("string"),

    "audio":            Audio(sampling_rate=None),   # uploaded only if user consents
    "audio_duration_s": Value("float32"),
    "sample_rate":      Value("int32"),
    "source":           Value("string"),
    "decode_params":    Value("string"),

    "transcript_hyp":   Value("string"),
    "reference_text":   Value("string"),
    "corrected_text":   Value("string"),

    "latency_ms":       Value("int32"),
    "rtf":              Value("float32"),

    "wer":              Value("float32"),
    "cer":              Value("float32"),
    "subs":             Value("int32"),
    "ins":              Value("int32"),
    "dels":             Value("int32"),

    "score_out_of_10":  Value("int32"),
    "feedback_text":    Value("string"),
    "tags":             Value("string"),
    "share_publicly":   Value("bool"),
})
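# For reference, a single logged row looks roughly like this (illustrative
# values only; "audio" is None unless the user consented to storage):
#   {"timestamp": "2025-01-01T00:00:00Z", "session_id": "anon-<uuid>",
#    "language_display": "Ewe", "wer": 0.25, "share_publicly": False, ...}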

def _push_row_to_hf_dataset(row, audio_file_path):
    """
    Append a single example to the HF dataset repo (train split).
    If the user didn't consent, or no audio path exists, the 'audio' field is None.
    """
    if not PUSH_TO_HF:
        return "HF push disabled (missing HF_TOKEN or repo)."

    example = dict(row)

    # Audio: only include if user consented and file exists
    example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None

    # Normalize types
    def _to_int(v):
        try:
            return int(v)
        except Exception:
            return None
    def _to_float(v):
        try:
            return float(v)
        except Exception:
            return None

    for k in ["subs", "ins", "dels", "latency_ms", "score_out_of_10", "sample_rate"]:
        example[k] = _to_int(example.get(k))
    for k in ["wer", "cer", "rtf", "audio_duration_s"]:
        example[k] = _to_float(example.get(k))

    ds = Dataset.from_list([example], features=HF_FEATURES)

    # Load the existing split if present, then append the new row
    try:
        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
        merged = concatenate_datasets([existing, ds])
    except Exception:
        merged = ds

    merged.push_to_hub(
        HF_DATASET_REPO,
        split="train",
        private=True,
        token=HF_TOKEN,
        commit_message="append feedback row"
    )
    return "Pushed to HF Dataset."

# --- EDIT THIS: map display names to your HF Hub model IDs ---
language_models = {
    "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
    "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
    "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",
    "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
    "Fante":                    "misterkissi/w2v2-lg-xls-r-300m-fante", 
    "Bemba":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
    "Bambara":                  "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
    "Dagaare":                  "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
    "Kinyarwanda":              "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
    "Fula":                     "DarliAI/kissi-wav2vec2-fula-fleurs-full",
    "Oromo":                    "DarliAI/kissi-w2v-bert-2.0-oromo",
    "Runynakore":               "misterkissi/w2v2-lg-xls-r-300m-runyankore",
    "Ga":                       "misterkissi/w2v2-lg-xls-r-300m-ga",
    "Vai":                      "misterkissi/whisper-small-vai",
    "Kasem":                    "misterkissi/w2v2-lg-xls-r-300m-kasem",
    "Lingala":                  "misterkissi/w2v2-lg-xls-r-300m-lingala",
    "Fongbe":                   "misterkissi/whisper-small-fongbe",
    "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
    "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
    "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
    # "WOLOF":                  "misterkissi/w2v2-lg-xls-r-1b-wolof",
    # "HAITIAN CREOLE":         "misterkissi/whisper-small-haitian-creole",
    # "KABYLE":                 "misterkissi/w2v2-lg-xls-r-1b-kabyle",
    "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
    "Luganda":                  "FarmerlineML/luganda_fkd",
    "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
    "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
    "Pidgin":                   "FarmerlineML/pidgin_nigerian",
    "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
    "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
    #"Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3"
}

# -------- Lazy-load pipeline cache (Space-safe) --------
_PIPELINE_CACHE = {}
_CACHE_ORDER = []  # cache keys, most-recently-used first
_CACHE_MAX_SIZE = 3  # tune for RAM

def _touch_cache(key):
    if key in _CACHE_ORDER:
        _CACHE_ORDER.remove(key)
    _CACHE_ORDER.insert(0, key)

def _evict_if_needed():
    while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
        oldest = _CACHE_ORDER.pop()
        try:
            del _PIPELINE_CACHE[oldest]
        except KeyError:
            pass

def get_asr_pipeline(language_display: str):
    if language_display in _PIPELINE_CACHE:
        _touch_cache(language_display)
        return _PIPELINE_CACHE[language_display]
    model_id = language_models[language_display]
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        device=-1,          # CPU on Spaces (explicit)
        chunk_length_s=30
    )
    _PIPELINE_CACHE[language_display] = pipe
    _touch_cache(language_display)
    _evict_if_needed()
    return pipe
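# Example (hypothetical warm-up): the first request per language pays the full
# download/load cost, so eagerly caching the default model at startup can help:
#   get_asr_pipeline(list(language_models.keys())[0])
# With _CACHE_MAX_SIZE = 3, requesting a fourth distinct language evicts the
# least-recently-used pipeline from _PIPELINE_CACHE.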

# -------- Helpers --------
def _model_revision_from_pipeline(pipe) -> str:
    # Best-effort capture of the revision/commit hash for reproducibility;
    # transformers records the resolved hash on the config as _commit_hash.
    model = getattr(pipe, "model", None)
    for obj in (model, getattr(model, "config", None)):
        for attr in ("hub_revision", "revision", "_commit_hash"):
            val = getattr(obj, attr, None)
            if val:
                return str(val)
    try:
        return str(getattr(pipe.model.config, "_name_or_path", "unknown"))
    except Exception:
        return "unknown"

def _compute_metrics(hyp: str, ref_or_corrected: str):
    if not HAS_JIWER or not ref_or_corrected or not hyp:
        return {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}
    try:
        measures = compute_measures(ref_or_corrected, hyp)
        return {
            "wer": measures.get("wer"),
            "cer": jiwer_cer(ref_or_corrected, hyp),
            "subs": measures.get("substitutions"),
            "ins": measures.get("insertions"),
            "dels": measures.get("deletions"),
        }
    except Exception:
        return {"wer": None, "cer": None, "subs": None, "ins": None, "dels": None}

# -------- Inference --------
def transcribe(audio_path: str, language: str):
    """
    Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
    convert to mono, then run it through the chosen ASR pipeline.
    Returns transcript (unchanged behavior) and a meta dict for feedback.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip.", None

    speech, sr = librosa.load(audio_path, sr=None, mono=True)
    duration_s = float(librosa.get_duration(y=speech, sr=sr))

    pipe = get_asr_pipeline(language)
    # chunk_length_s lives in the pipeline's _preprocess_params, not as a direct attribute
    decode_params = {"chunk_length_s": getattr(pipe, "_preprocess_params", {}).get("chunk_length_s", 30)}

    t0 = time.time()
    result = pipe({"sampling_rate": sr, "raw": speech})
    latency_ms = int((time.time() - t0) * 1000.0)
    hyp_text = result.get("text", "")

    rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
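    # (rtf = processing time / audio duration; values below 1.0 mean the model
    # transcribes faster than real time)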

    meta = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "session_id": f"anon-{uuid.uuid4()}",
        "language_display": language,
        "model_id": language_models.get(language, "unknown"),
        "model_revision": _model_revision_from_pipeline(pipe),
        "audio_duration_s": duration_s,
        "sample_rate": sr,
        "source": "upload",
        "decode_params": json.dumps(decode_params),
        "transcript_hyp": hyp_text,
        "latency_ms": latency_ms,
        "rtf": rtf,
    }
    return hyp_text, meta

# -------- Feedback submit --------
def submit_feedback(meta, reference_text, corrected_text, score, feedback_text,
                    tags, store_audio, share_publicly, audio_file_path):
    """
    Compute metrics (if possible) and push a row to HF Dataset immediately.
    No local CSV/audio writes.
    """
    if not meta:
        return {"status": "No transcription metadata available. Please transcribe first."}

    ref_for_metrics = (reference_text or "").strip()
    corrected_text = (corrected_text or "").strip()
    if not ref_for_metrics and corrected_text:
        ref_for_metrics = corrected_text

    metrics = _compute_metrics(meta.get("transcript_hyp", ""), ref_for_metrics)

    row = dict(meta)
    row.update({
        "reference_text": reference_text or "",
        "corrected_text": corrected_text or "",
        "wer": metrics["wer"],
        "cer": metrics["cer"],
        "subs": metrics["subs"],
        "ins": metrics["ins"],
        "dels": metrics["dels"],
        "score_out_of_10": int(score) if score is not None else None,
        "feedback_text": feedback_text or "",
        "tags": json.dumps({"labels": tags or []}),
        "share_publicly": bool(share_publicly),
    })

    try:
        # Use the temporary upload path from Gradio only if the user consented
        audio_to_push = audio_file_path if store_audio else None
        hf_status = _push_row_to_hf_dataset(row, audio_to_push)
        status = f"Feedback saved. {hf_status}"
    except Exception as e:
        status = f"Failed to push to HF Dataset: {e}"

    return {
        "status": status,
        "wer": row["wer"],
        "cer": row["cer"],
        "subs": row["subs"],
        "ins": row["ins"],
        "dels": row["dels"],
        "latency_ms": row["latency_ms"],
        "rtf": row["rtf"],
        "model_id": row["model_id"],
        "model_revision": row["model_revision"]
    }

# -------- UI --------
with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text   
        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.  
        Then choose the language/model and hit **Transcribe**.
        """
    )

    with gr.Row():
        lang = gr.Dropdown(
            choices=list(language_models.keys()),
            value=list(language_models.keys())[0],
            label="Select Language / Model"
        )

    with gr.Row():
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or Record Audio"
        )

    btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcription")

    # Hidden state to carry metadata from transcribe -> feedback
    meta_state = gr.State(value=None)

    # Show the transcript, stash the metadata in the hidden state,
    # and pre-fill the corrected-transcript box for easy editing
    def _transcribe_and_store(audio_path, language):
        hyp, meta = transcribe(audio_path, language)
        # Pre-fill corrected with hypothesis for easy edits
        return hyp, meta, hyp

    # --- Evaluation & Feedback (no style changes) ---
    with gr.Accordion("Evaluation & Feedback", open=False):
        with gr.Row():
            reference_tb = gr.Textbox(label="Reference text (optional)", lines=4, value="")
        with gr.Row():
            corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
        with gr.Row():
            score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
        with gr.Row():
            feedback_tb = gr.Textbox(label="Feedback (what went right/wrong?)", lines=3, value="")
        with gr.Row():
            tags_cb = gr.CheckboxGroup(
                ["noisy", "far-field", "code-switching", "numbers-heavy", "named-entities", "read-speech", "spontaneous", "call-center", "voicenote"],
                label="Slice tags (select any that apply)"
            )
        with gr.Row():
            store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
            share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)

        submit_btn = gr.Button("Submit Feedback / Compute Metrics")
        results_json = gr.JSON(label="Metrics & Status")

    # Wire events
    btn.click(
        fn=_transcribe_and_store,
        inputs=[audio, lang],
        outputs=[output, meta_state, corrected_tb]
    )

    submit_btn.click(
        fn=submit_feedback,
        inputs=[
            meta_state,
            reference_tb,
            corrected_tb,
            score_slider,
            feedback_tb,
            tags_cb,
            store_audio_cb,
            share_cb,
            audio  # raw file path from gr.Audio
        ],
        outputs=results_json
    )

# Keep Spaces stable under load
if __name__ == "__main__":
    demo.queue()
    demo.launch()