Spaces:

DarliAI
/

DarliAI_ASR

Sleeping

App Files Files Community

FarmerlineML committed 16 days ago

Commit

edff215

verified ·

1 Parent(s): 0a9945e

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -173

app.py CHANGED Viewed

@@ -1,16 +1,9 @@
 # app.py
-import os
-import time
-import datetime as dt
-import pandas as pd
 import gradio as gr
 from transformers import pipeline
 import numpy as np
 import librosa  # pip install librosa
-from jiwer import wer  # pip install jiwer
-LOG_PATH = "feedback_logs.csv"
 # --- EDIT THIS: map display names to your HF Hub model IDs ---
 language_models = {
@@ -35,14 +28,19 @@ language_models = {
     "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
     "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
-    "Luganda (FKD)":            "FarmerlineML/luganda_fkd",
     "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
     #"Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3"
 }
 # Pre-load pipelines for each language on CPU (device=-1)
@@ -56,194 +54,53 @@ asr_pipelines = {
     for lang, model_id in language_models.items()
 }
-def transcribe(audio_path: str, language: str):
     """
     Load the audio via librosa (supports mp3, wav, flac, m4a, ogg, etc.),
     convert to mono, then run it through the chosen ASR pipeline.
-    Returns (transcript, runtime_seconds, duration_seconds).
     """
     if not audio_path:
-        return "⚠️ Please upload or record an audio clip.", 0.0, 0.0
     # librosa.load returns a 1D np.ndarray (mono) and the sample rate
     speech, sr = librosa.load(audio_path, sr=None, mono=True)
-    duration_s = librosa.get_duration(y=speech, sr=sr)
-    t0 = time.time()
     result = asr_pipelines[language]({
         "sampling_rate": sr,
         "raw": speech
     })
-    runtime_s = time.time() - t0
-    text = result.get("text", "")
-    return text, round(runtime_s, 3), round(duration_s, 3)
-def compute_wer(pred: str, ref: str) -> float:
-    if not ref or not pred:
-        return None
-    try:
-        return float(wer(ref, pred))
-    except Exception:
-        return None
-def ensure_logfile():
-    if not os.path.exists(LOG_PATH):
-        pd.DataFrame(columns=[
-            "timestamp", "language", "model_id", "audio_filename",
-            "duration_s", "runtime_s", "transcript", "reference",
-            "wer", "score_10", "feedback",
-            "domain", "environment", "accent_locale"
-        ]).to_csv(LOG_PATH, index=False)
-def save_feedback(language: str,
-                  transcript: str,
-                  reference: str,
-                  score_10: int,
-                  feedback: str,
-                  audio_file: str,
-                  duration_s: float,
-                  runtime_s: float,
-                  domain: str,
-                  environment: str,
-                  accent_locale: str):
-    ensure_logfile()
-    model_id = language_models.get(language, "")
-    audio_filename = os.path.basename(audio_file) if audio_file else ""
-    w = compute_wer(transcript, reference)
-    row = {
-        "timestamp": dt.datetime.utcnow().isoformat(),
-        "language": language,
-        "model_id": model_id,
-        "audio_filename": audio_filename,
-        "duration_s": duration_s,
-        "runtime_s": runtime_s,
-        "transcript": transcript,
-        "reference": reference,
-        "wer": w,
-        "score_10": score_10,
-        "feedback": feedback,
-        "domain": domain,
-        "environment": environment,
-        "accent_locale": accent_locale
-    }
-    try:
-        df = pd.read_csv(LOG_PATH)
-        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
-        df.to_csv(LOG_PATH, index=False)
-        msg = "✅ Feedback saved."
-        if w is not None:
-            msg += f" WER: {w:.3f}"
-        return msg
-    except Exception as e:
-        return f"❌ Could not save feedback: {e}"
-def load_metrics():
-    ensure_logfile()
-    df = pd.read_csv(LOG_PATH)
-    if df.empty:
-        return "No feedback yet.", None, None, df
-    # Aggregates
-    # Per-language means:
-    per_lang = df.groupby("language").agg(
-        n=("wer", "count"),
-        mean_WER=("wer", "mean"),
-        mean_score=("score_10", "mean"),
-        mean_runtime_s=("runtime_s", "mean"),
-        mean_duration_s=("duration_s", "mean")
-    ).reset_index().sort_values(by="mean_WER", ascending=True)
-    # Per-domain (optional):
-    per_domain = df.groupby("domain").agg(
-        n=("wer", "count"),
-        mean_WER=("wer", "mean"),
-        mean_score=("score_10", "mean")
-    ).reset_index().sort_values(by="mean_WER", ascending=True)
-    return "📊 Metrics updated.", per_lang, per_domain, df
-with gr.Blocks(title="🌐 Multilingual ASR Demo", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        ## 🎙️ Multilingual Speech-to-Text + Feedback & Benchmarking
         Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
-        Choose the language/model and hit **Transcribe**.
-        Optionally provide a **reference transcript** to compute WER, then leave a score & feedback.
         """
     )
-    with gr.Tabs():
-        with gr.Tab("ASR"):
-            with gr.Row():
-                lang = gr.Dropdown(
-                    choices=list(language_models.keys()),
-                    value=list(language_models.keys())[0],
-                    label="Select Language / Model"
-                )
-            with gr.Row():
-                audio = gr.Audio(
-                    sources=["upload", "microphone"],
-                    type="filepath",
-                    label="Upload or Record Audio"
-                )
-            btn = gr.Button("Transcribe", variant="primary")
-            output = gr.Textbox(label="Transcription", lines=6)
-            runtime = gr.Number(label="Model runtime (s)", precision=3, interactive=False)
-            duration = gr.Number(label="Audio duration (s)", precision=3, interactive=False)
-            # Feedback / Benchmark block
-            gr.Markdown("### 📝 Feedback & WER (optional)")
-            with gr.Row():
-                reference = gr.Textbox(label="Reference transcript (optional, for WER)", lines=4, placeholder="Paste the ground-truth text here to compute WER")
-            with gr.Row():
-                score = gr.Slider(0, 10, step=1, value=8, label="Overall quality score (0–10)")
-            with gr.Row():
-                domain = gr.Dropdown(
-                    ["General", "Conversational", "News", "Agriculture", "Healthcare", "Education", "Customer support", "Finance", "Legal", "Entertainment", "Other"],
-                    value="General",
-                    label="Domain/topic"
-                )
-                environment = gr.Dropdown(
-                    ["Quiet", "Office", "Outdoor", "Vehicle", "Crowd/Market", "Radio/Phone", "Other"],
-                    value="Quiet",
-                    label="Recording environment"
-                )
-                accent_locale = gr.Textbox(label="Accent / Locale (e.g., Accra, Nairobi, Lagos)", placeholder="Optional")
-            feedback = gr.Textbox(label="Free-text feedback", lines=4, placeholder="What worked well? What failed? Any specific words or sounds?")
-            save_btn = gr.Button("Save Feedback", variant="secondary")
-            save_msg = gr.Markdown("")
-            # Wire up
-            btn.click(
-                fn=transcribe,
-                inputs=[audio, lang],
-                outputs=[output, runtime, duration]
-            )
-            save_btn.click(
-                fn=save_feedback,
-                inputs=[lang, output, reference, score, feedback, audio, duration, runtime, domain, environment, accent_locale],
-                outputs=save_msg
-            )
-        with gr.Tab("Metrics"):
-            refresh = gr.Button("Refresh metrics", variant="primary")
-            metrics_msg = gr.Markdown()
-            per_lang_df = gr.Dataframe(interactive=False, label="Per-language summary (lower WER is better)")
-            per_domain_df = gr.Dataframe(interactive=False, label="Per-domain summary")
-            logs_df = gr.Dataframe(interactive=False, label="Raw feedback log")
-            refresh.click(
-                fn=load_metrics,
-                inputs=[],
-                outputs=[metrics_msg, per_lang_df, per_domain_df, logs_df]
-            )
 if __name__ == "__main__":
-    demo.launch()

 # app.py
 import gradio as gr
 from transformers import pipeline
 import numpy as np
 import librosa  # pip install librosa
 # --- EDIT THIS: map display names to your HF Hub model IDs ---
 language_models = {
     "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
+    # "WOLOF":                    "misterkissi/w2v2-lg-xls-r-1b-wolof",
+    # "HAITIAN CREOLE":           "misterkissi/whisper-small-haitian-creole",
+    # "KABYLE":                   "misterkissi/w2v2-lg-xls-r-1b-kabyle",
     "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
+    "Luganda":                  "FarmerlineML/luganda_fkd",
     "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
     #"Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3"
+    # add more as needed
 }
 # Pre-load pipelines for each language on CPU (device=-1)
     for lang, model_id in language_models.items()
 }
def transcribe(audio_path: str, language: str) -> str:
    """
    Transcribe an audio file with the ASR pipeline registered for `language`.

    The clip is decoded with librosa at its native sample rate and down-mixed
    to mono, then handed to the pipeline together with that sample rate.

    Args:
        audio_path: Path of the uploaded or recorded clip. An empty value
            (None or "") means no audio has been provided yet.
        language: Display name used as the key into `asr_pipelines`.

    Returns:
        The recognised text ("" when the pipeline result has no "text"
        entry), or a message starting with "⚠️" when the audio is missing
        or empty, or when no pipeline is registered for `language`.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip."

    # Resolve the pipeline before decoding: an unknown or cleared language
    # then yields a readable message instead of a KeyError, and no time is
    # spent decoding audio that cannot be transcribed anyway.
    asr = asr_pipelines.get(language)
    if asr is None:
        return f"⚠️ No model is available for the selected language: {language!r}."

    # librosa.load returns a 1D np.ndarray (mono) and the sample rate;
    # sr=None keeps the file's native rate.
    speech, sr = librosa.load(audio_path, sr=None, mono=True)
    if speech.size == 0:
        # A zero-length signal cannot be transcribed; report it instead of
        # letting the model fail on an empty input.
        return "⚠️ The audio clip is empty. Please upload or record again."

    # Call the Hugging Face ASR pipeline with the raw waveform and its rate
    result = asr({"sampling_rate": sr, "raw": speech})
    return result.get("text", "")
# --- Gradio UI ---
# A language/model dropdown, an audio input (file upload or microphone), and
# a button that passes both to `transcribe` and shows the returned text.
with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text
        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
        Then choose the language/model and hit **Transcribe**.
        """
    )

    with gr.Row():
        # Choices are the display names in `language_models`; the first
        # entry is pre-selected.
        lang = gr.Dropdown(
            label="Select Language / Model",
            choices=list(language_models),
            value=list(language_models)[0],
        )

    with gr.Row():
        # type="filepath" makes Gradio pass a file path string, which is
        # what `transcribe` takes as its first argument.
        audio = gr.Audio(
            label="Upload or Record Audio",
            type="filepath",
            sources=["upload", "microphone"],
        )

    btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcription")

    # Clicking the button runs transcribe(audio, lang) and writes the
    # returned string into the output textbox.
    btn.click(
        fn=transcribe,
        inputs=[audio, lang],
        outputs=output,
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()