# app.py

import os
import time
import datetime as dt
import pandas as pd
import gradio as gr
from transformers import pipeline
import numpy as np
import librosa  # pip install librosa
from jiwer import wer  # pip install jiwer

# CSV file that accumulates one row per saved feedback entry. The path is
# relative, so it resolves against the process's current working directory.
LOG_PATH = "feedback_logs.csv"

# --- EDIT THIS: map display names to your HF Hub model IDs ---
# Keys are the labels shown in the language dropdown and written to the
# "language" column of the feedback log; values are Hugging Face Hub repo IDs
# and must match the repository names exactly (including their spelling).
# NOTE: "Runyankore" was previously misspelled "Runynakore"; feedback rows
# logged before the fix carry the old spelling.
language_models = {
    "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
    "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
    "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",
    "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
    "Fante":                    "misterkissi/w2v2-lg-xls-r-300m-fante",
    "Bemba":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
    "Bambara":                  "DarliAI/kissi-w2v2-lg-xls-r-300m-bambara",
    "Dagaare":                  "DarliAI/kissi-w2v2-lg-xls-r-300m-dagaare",
    "Kinyarwanda":              "DarliAI/kissi-w2v2-lg-xls-r-300m-kinyarwanda",
    "Fula":                     "DarliAI/kissi-wav2vec2-fula-fleurs-full",
    "Oromo":                    "DarliAI/kissi-w2v-bert-2.0-oromo",
    "Runyankore":               "misterkissi/w2v2-lg-xls-r-300m-runyankore",
    "Ga":                       "misterkissi/w2v2-lg-xls-r-300m-ga",
    "Vai":                      "misterkissi/whisper-small-vai",
    "Kasem":                    "misterkissi/w2v2-lg-xls-r-300m-kasem",
    "Lingala":                  "misterkissi/w2v2-lg-xls-r-300m-lingala",
    "Fongbe":                   "misterkissi/whisper-small-fongbe",
    "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
    "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
    "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
    "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
    "Luganda (FKD)":            "FarmerlineML/luganda_fkd",
    "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
    "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
    "Pidgin":                   "FarmerlineML/pidgin_nigerian",
    "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
    "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
    "Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3",
}

# Eagerly build one speech-recognition pipeline per configured language.
# This runs at import time, so every model listed in language_models is
# instantiated before the UI is constructed.
def _make_asr_pipeline(hub_model_id):
    """Instantiate a chunked ASR pipeline for *hub_model_id* pinned to CPU."""
    return pipeline(
        task="automatic-speech-recognition",
        model=hub_model_id,
        device=-1,            # -1 selects CPU
        chunk_length_s=30
    )


asr_pipelines = {
    display_name: _make_asr_pipeline(hub_model_id)
    for display_name, hub_model_id in language_models.items()
}

def transcribe(audio_path: str, language: str):
    """
    Transcribe an audio file with the ASR pipeline registered for *language*.

    The clip is decoded by librosa at its native sample rate and down-mixed to
    mono (mp3, wav, flac, m4a, ogg, etc.), then handed to the pipeline as a
    raw waveform together with that sample rate.

    Args:
        audio_path: Path of the clip to transcribe; falsy when nothing was
            uploaded or recorded.
        language: Key into ``asr_pipelines``.

    Returns:
        ``(transcript, runtime_seconds, duration_seconds)`` with both timings
        rounded to three decimals. When no audio was supplied, or no pipeline
        is registered for *language*, the transcript slot carries a warning
        message and both timings are ``0.0``.
    """
    if not audio_path:
        return "⚠️ Please upload or record an audio clip.", 0.0, 0.0

    # Report an unknown/cleared language the same way as missing audio instead
    # of surfacing a bare KeyError to the UI.
    asr = asr_pipelines.get(language)
    if asr is None:
        return f"⚠️ No ASR model is available for language {language!r}.", 0.0, 0.0

    # librosa.load returns a 1D np.ndarray (mono) and the sample rate;
    # sr=None keeps the file's own rate rather than resampling here.
    speech, sr = librosa.load(audio_path, sr=None, mono=True)
    duration_s = librosa.get_duration(y=speech, sr=sr)

    # perf_counter is monotonic, so the measured runtime cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    result = asr({
        "sampling_rate": sr,
        "raw": speech
    })
    runtime_s = time.perf_counter() - started

    text = result.get("text", "")
    return text, round(runtime_s, 3), round(duration_s, 3)

def compute_wer(pred: str, ref: str) -> "float | None":
    """
    Return the word error rate of *pred* measured against *ref*.

    Args:
        pred: Hypothesis text (the model transcript).
        ref: Ground-truth reference text.

    Returns:
        The WER as a float, or ``None`` when either text is empty/missing or
        the score could not be computed.
    """
    if not pred or not ref:
        return None
    try:
        # jiwer's signature is wer(reference, hypothesis).
        return float(wer(ref, pred))
    except Exception:
        # WER is an optional extra on the feedback form: a scoring failure
        # must not prevent the feedback itself from being saved.
        return None

def ensure_logfile():
    """Create the feedback CSV containing only its header row if it is missing."""
    if os.path.exists(LOG_PATH):
        return

    header = [
        "timestamp", "language", "model_id", "audio_filename",
        "duration_s", "runtime_s", "transcript", "reference",
        "wer", "score_10", "feedback",
        "domain", "environment", "accent_locale",
    ]
    empty_log = pd.DataFrame(columns=header)
    empty_log.to_csv(LOG_PATH, index=False)

def save_feedback(language: str,
                  transcript: str,
                  reference: str,
                  score_10: int,
                  feedback: str,
                  audio_file: str,
                  duration_s: float,
                  runtime_s: float,
                  domain: str,
                  environment: str,
                  accent_locale: str):
    """
    Append one feedback entry to the CSV log and return a status message.

    The WER of *transcript* against *reference* is computed when possible and
    stored alongside the submitted fields, the model ID looked up from
    ``language_models``, the base name of *audio_file*, and a UTC timestamp.

    Returns:
        A success message (including the WER when one was computed), or an
        error message if the log could not be read or written.
    """
    ensure_logfile()
    model_id = language_models.get(language, "")
    audio_filename = os.path.basename(audio_file) if audio_file else ""

    w = compute_wer(transcript, reference)

    row = {
        # Naive UTC timestamp in the same text format as earlier rows;
        # datetime.utcnow() is deprecated, so derive it from an aware "now".
        "timestamp": dt.datetime.now(dt.timezone.utc).replace(tzinfo=None).isoformat(),
        "language": language,
        "model_id": model_id,
        "audio_filename": audio_filename,
        "duration_s": duration_s,
        "runtime_s": runtime_s,
        "transcript": transcript,
        "reference": reference,
        "wer": w,
        "score_10": score_10,
        "feedback": feedback,
        "domain": domain,
        "environment": environment,
        "accent_locale": accent_locale
    }
    try:
        # Read only the header line so the new row can be aligned to it.
        header = list(pd.read_csv(LOG_PATH, nrows=0).columns)
        if set(row) <= set(header):
            # Append just the new row. Re-reading and rewriting the whole log
            # on every save costs O(log size) and round-trips old rows through
            # pandas, which blanks stored text it parses as missing
            # (e.g. "NA", "null").
            # Assumes the file ends with a newline, as pandas-written CSVs do.
            pd.DataFrame([row], columns=header).to_csv(
                LOG_PATH, mode="a", header=False, index=False
            )
        else:
            # The existing log lacks some of the columns: fall back to a full
            # rewrite so the missing columns are added rather than dropped.
            df = pd.read_csv(LOG_PATH)
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            df.to_csv(LOG_PATH, index=False)
        msg = "✅ Feedback saved."
        if w is not None:
            msg += f" WER: {w:.3f}"
        return msg
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"❌ Could not save feedback: {e}"

def load_metrics():
    """
    Read the feedback log and summarise it per language and per domain.

    Returns ``(status_message, per_language_table, per_domain_table, raw_log)``.
    Both summary tables are ``None`` when the log has no rows yet.
    """
    ensure_logfile()
    log = pd.read_csv(LOG_PATH)

    if log.empty:
        return "No feedback yet.", None, None, log

    # Aggregations common to both summaries. "count" tallies non-null WER
    # values, so n is the number of rows that have a WER.
    quality_aggs = {
        "n": ("wer", "count"),
        "mean_WER": ("wer", "mean"),
        "mean_score": ("score_10", "mean"),
    }

    # Per-language summary additionally reports timing averages.
    by_language = log.groupby("language").agg(
        **quality_aggs,
        mean_runtime_s=("runtime_s", "mean"),
        mean_duration_s=("duration_s", "mean"),
    )
    by_language = by_language.reset_index().sort_values(by="mean_WER", ascending=True)

    # Per-domain summary.
    by_domain = log.groupby("domain").agg(**quality_aggs)
    by_domain = by_domain.reset_index().sort_values(by="mean_WER", ascending=True)

    return "📊 Metrics updated.", by_language, by_domain, log

# Build the Gradio UI. `demo` is the Blocks app that is launched at the bottom
# of the file. Components are created in the order they appear below.
with gr.Blocks(title="🌐 Multilingual ASR Demo", theme=gr.themes.Soft()) as demo:
    # Introductory Markdown text.
    gr.Markdown(
        """
        ## 🎙️ Multilingual Speech-to-Text + Feedback & Benchmarking
        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.  
        Choose the language/model and hit **Transcribe**.  
        Optionally provide a **reference transcript** to compute WER, then leave a score & feedback.
        """
    )

    with gr.Tabs():
        # --- Tab 1: transcription plus the optional feedback form ---
        with gr.Tab("ASR"):
            with gr.Row():
                # Choices are the display names from language_models; the
                # first entry is preselected.
                lang = gr.Dropdown(
                    choices=list(language_models.keys()),
                    value=list(language_models.keys())[0],
                    label="Select Language / Model"
                )

            with gr.Row():
                # Configured with type="filepath"; its value is passed to
                # transcribe() as audio_path and to save_feedback() as audio_file.
                audio = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Upload or Record Audio"
                )

            btn = gr.Button("Transcribe", variant="primary")
            # Result fields filled by transcribe(); the two timing fields are read-only.
            output = gr.Textbox(label="Transcription", lines=6)
            runtime = gr.Number(label="Model runtime (s)", precision=3, interactive=False)
            duration = gr.Number(label="Audio duration (s)", precision=3, interactive=False)

            # Feedback / Benchmark block
            gr.Markdown("### 📝 Feedback & WER (optional)")
            with gr.Row():
                reference = gr.Textbox(label="Reference transcript (optional, for WER)", lines=4, placeholder="Paste the ground-truth text here to compute WER")
            with gr.Row():
                score = gr.Slider(0, 10, step=1, value=8, label="Overall quality score (0–10)")
            with gr.Row():
                # Metadata stored with each feedback row; load_metrics() groups by domain.
                domain = gr.Dropdown(
                    ["General", "Conversational", "News", "Agriculture", "Healthcare", "Education", "Customer support", "Finance", "Legal", "Entertainment", "Other"],
                    value="General",
                    label="Domain/topic"
                )
                environment = gr.Dropdown(
                    ["Quiet", "Office", "Outdoor", "Vehicle", "Crowd/Market", "Radio/Phone", "Other"],
                    value="Quiet",
                    label="Recording environment"
                )
                accent_locale = gr.Textbox(label="Accent / Locale (e.g., Accra, Nairobi, Lagos)", placeholder="Optional")

            feedback = gr.Textbox(label="Free-text feedback", lines=4, placeholder="What worked well? What failed? Any specific words or sounds?")

            save_btn = gr.Button("Save Feedback", variant="secondary")
            # Shows the status message returned by save_feedback().
            save_msg = gr.Markdown("")

            # Wire up
            # transcribe() returns (text, runtime, duration), mapped onto the
            # three outputs in that order.
            btn.click(
                fn=transcribe,
                inputs=[audio, lang],
                outputs=[output, runtime, duration]
            )

            # The inputs are passed positionally, so their order must match
            # save_feedback's parameter order (note: duration precedes runtime).
            save_btn.click(
                fn=save_feedback,
                inputs=[lang, output, reference, score, feedback, audio, duration, runtime, domain, environment, accent_locale],
                outputs=save_msg
            )

        # --- Tab 2: aggregate metrics computed from the feedback log ---
        with gr.Tab("Metrics"):
            refresh = gr.Button("Refresh metrics", variant="primary")
            metrics_msg = gr.Markdown()
            per_lang_df = gr.Dataframe(interactive=False, label="Per-language summary (lower WER is better)")
            per_domain_df = gr.Dataframe(interactive=False, label="Per-domain summary")
            logs_df = gr.Dataframe(interactive=False, label="Raw feedback log")

            # load_metrics() returns (message, per-language table,
            # per-domain table, raw log), mapped onto the outputs in order.
            refresh.click(
                fn=load_metrics,
                inputs=[],
                outputs=[metrics_msg, per_lang_df, per_domain_df, logs_df]
            )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()