Spaces:

DarliAI
/

Evaluation

Running

App Files Community

FarmerlineML committed 15 days ago

Commit

81083e5

verified ·

1 Parent(s): f4f8ccf

Update app.py

Browse files

Files changed (1) hide show

app.py +425 -232

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py (MP3-robust loader + Luganda FKD commented; minimal feedback)
 import os
 import json
@@ -13,6 +13,11 @@ from transformers import pipeline
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
 # Optional: modest thread hints for CPU Spaces
 try:
@@ -22,95 +27,27 @@ try:
 except Exception:
     pass
-# Basic logging so we can verify which model is loaded per inference
-logging.basicConfig(level=logging.INFO)
-# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
-from datasets import Dataset, Features, Value, Audio, load_dataset
-# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
-HF_FEATURES = Features({
-    "timestamp":        Value("string"),
-    "session_id":       Value("string"),
-    "language_display": Value("string"),
-    "model_id":         Value("string"),
-    "model_revision":   Value("string"),
-    "audio":            Audio(sampling_rate=None),   # uploaded only if user consents
-    "audio_duration_s": Value("float32"),
-    "sample_rate":      Value("int32"),
-    "source":           Value("string"),
-    "decode_params":    Value("string"),
-    "transcript_hyp":   Value("string"),
-    "corrected_text":   Value("string"),
-    "latency_ms":       Value("int32"),
-    "rtf":              Value("float32"),
-    "score_out_of_10":  Value("int32"),
-    "share_publicly":   Value("bool"),
-})
-def _push_row_to_hf_dataset(row, audio_file_path):
-    """
-    Append a single example to the HF dataset repo (train split).
-    If user didn't consent or no audio path, 'audio' field is None.
-    """
-    if not PUSH_TO_HF:
-        return "HF push disabled (missing HF_TOKEN or repo)."
-    example = dict(row)
-    # Audio: only include if user consented and file exists
-    example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
-    # Normalize types
-    def _to_int(v):
-        try:
-            return int(v)
-        except Exception:
-            return None
-    def _to_float(v):
-        try:
-            return float(v)
-        except Exception:
-            return None
-    for k in ["latency_ms", "score_out_of_10", "sample_rate"]:
-        example[k] = _to_int(example.get(k))
-    for k in ["rtf", "audio_duration_s"]:
-        example[k] = _to_float(example.get(k))
-    ds = Dataset.from_list([example], features=HF_FEATURES)
-    # Load existing split if present, then append
-    try:
-        existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
-        merged = existing.concatenate(ds)
-    except Exception:
-        merged = ds
-    merged.push_to_hub(
-        HF_DATASET_REPO,
-        split="train",
-        private=True,
-        token=HF_TOKEN,
-        commit_message="append feedback row"
-    )
-    return "Pushed to HF Dataset."
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
     "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
     "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
     "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
-    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",   # active
-    # "Luganda (FKD)":          "FarmerlineML/luganda_fkd",            # commented out per request
     "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
     "Fante":                    "misterkissi/w2v2-lg-xls-r-300m-fante",
     "Bemba":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
@@ -128,61 +65,180 @@ language_models = {
     "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
-    # "WOLOF":                  "misterkissi/w2v2-lg-xls-r-1b-wolof",
-    # "HAITIAN CREOLE":         "misterkissi/whisper-small-haitian-creole",
-    # "KABYLE":                 "misterkissi/w2v2-lg-xls-r-1b-kabyle",
     "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
     "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
-    "Krio":                   "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
 # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
 TARGET_SR = 16000
-def _has_ffmpeg():
     return shutil.which("ffmpeg") is not None
-def _load_with_soundfile(path):
     data, sr = sf.read(path, always_2d=False)
     if isinstance(data, np.ndarray) and data.ndim > 1:
         data = data.mean(axis=1)
     return data.astype(np.float32), sr
-def _load_with_ffmpeg(path, target_sr=TARGET_SR):
-    # Convert to mono 16k wav in a temp file using ffmpeg
     if not _has_ffmpeg():
         raise RuntimeError("ffmpeg not available")
     tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     tmp_wav.close()
-    cmd = [
-        "ffmpeg", "-hide_banner", "-loglevel", "error",
-        "-y", "-i", path,
-        "-ac", "1", "-ar", str(target_sr),
-        tmp_wav.name,
-    ]
-    subprocess.run(cmd, check=True)
-    data, sr = sf.read(tmp_wav.name, always_2d=False)
     try:
-        os.remove(tmp_wav.name)
-    except Exception:
-        pass
-    if isinstance(data, np.ndarray) and data.ndim > 1:
-        data = data.mean(axis=1)
-    return data.astype(np.float32), sr
-def _resample_if_needed(y, sr, target_sr=TARGET_SR):
     if sr == target_sr:
         return y.astype(np.float32), sr
     y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
     return y_rs.astype(np.float32), target_sr
-def load_audio_any(path, target_sr=TARGET_SR):
     """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
     ext = os.path.splitext(path)[1].lower()
     try:
         if ext in {".wav", ".flac", ".ogg", ".opus"}:
             y, sr = _load_with_soundfile(path)
@@ -192,10 +248,11 @@ def load_audio_any(path, target_sr=TARGET_SR):
         else:
             # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
             y, sr = librosa.load(path, sr=None, mono=True)
         y, sr = _resample_if_needed(y, sr, target_sr)
         return y, sr
     except Exception as e:
-        logging.warning(f"[AUDIO] Primary load failed for {path} ({e}). Falling back to librosa.")
         y, sr = librosa.load(path, sr=target_sr, mono=True)
         return y.astype(np.float32), sr
@@ -204,20 +261,23 @@ _PIPELINE_CACHE = {}
 _CACHE_ORDER = []  # usage order
 _CACHE_MAX_SIZE = 3  # tune for RAM
-def _touch_cache(key):
     if key in _CACHE_ORDER:
         _CACHE_ORDER.remove(key)
     _CACHE_ORDER.insert(0, key)
 def _evict_if_needed():
     while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
-        oldest = _CACHE_ORDER.pop()
-        try:
-            del _PIPELINE_CACHE[oldest]
-        except KeyError:
-            pass
 def get_asr_pipeline(language_display: str):
     if language_display not in language_models:
         raise ValueError(f"Unknown language selection: {language_display}")
@@ -226,13 +286,15 @@ def get_asr_pipeline(language_display: str):
         return _PIPELINE_CACHE[language_display]
     model_id = language_models[language_display]
-    logging.info(f"[ASR] Loading pipeline for '{language_display}' -> {model_id}")
     pipe = pipeline(
         task="automatic-speech-recognition",
         model=model_id,
-        device=-1,          # CPU on Spaces (explicit)
         chunk_length_s=30
     )
     _PIPELINE_CACHE[language_display] = pipe
     _touch_cache(language_display)
     _evict_if_needed()
@@ -240,7 +302,7 @@ def get_asr_pipeline(language_display: str):
 # -------- Helpers --------
 def _model_revision_from_pipeline(pipe) -> str:
-    # Best-effort capture of revision/hash for reproducibility
     for attr in ("hub_revision", "revision", "_commit_hash"):
         val = getattr(getattr(pipe, "model", None), attr, None)
         if val:
@@ -251,7 +313,7 @@ def _model_revision_from_pipeline(pipe) -> str:
         return "unknown"
 # -------- Inference --------
-def transcribe(audio_path: str, language: str):
     """
     Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
     then run it through the chosen ASR pipeline.
@@ -259,138 +321,269 @@ def transcribe(audio_path: str, language: str):
     """
     if not audio_path:
         return "⚠️ Please upload or record an audio clip.", None
-    speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
-    duration_s = float(len(speech) / float(sr))
-    pipe = get_asr_pipeline(language)
-    decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
-    t0 = time.time()
-    result = pipe({"sampling_rate": sr, "raw": speech})
-    latency_ms = int((time.time() - t0) * 1000.0)
-    hyp_text = result.get("text", "")
-    rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
-    meta = {
-        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
-        "session_id": f"anon-{uuid.uuid4()}",
-        "language_display": language,
-        "model_id": language_models.get(language, "unknown"),
-        "model_revision": _model_revision_from_pipeline(pipe),
-        "audio_duration_s": duration_s,
-        "sample_rate": sr,
-        "source": "upload",
-        "decode_params": json.dumps(decode_params),
-        "transcript_hyp": hyp_text,
-        "latency_ms": latency_ms,
-        "rtf": rtf,
-    }
-    return hyp_text, meta
-# -------- Feedback submit (minimal) --------
-def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, audio_file_path):
     """
-    Push a minimal row to HF Dataset: model info, language, transcript, optional corrected text, score.
     """
     if not meta:
-        return {"status": "No transcription metadata available. Please transcribe first."}
     row = dict(meta)
     row.update({
         "corrected_text": (corrected_text or "").strip(),
         "score_out_of_10": int(score) if score is not None else None,
         "share_publicly": bool(share_publicly),
     })
     try:
         audio_to_push = audio_file_path if store_audio else None
         hf_status = _push_row_to_hf_dataset(row, audio_to_push)
-        status = f"Feedback saved. {hf_status}"
     except Exception as e:
-        status = f"Failed to push to HF Dataset: {e}"
-    return {
-        "status": status,
-        "latency_ms": row["latency_ms"],
-        "rtf": row["rtf"],
-        "model_id": row["model_id"],
-        "model_revision": row["model_revision"],
-        "language": row["language_display"],
-    }
-# -------- UI (original preserved; additions appended) --------
-with gr.Blocks(title="🌐 Multilingual ASR Demo") as demo:
-    gr.Markdown(
-        """
-        ## 🎙️ Multilingual Speech-to-Text
-        Upload an audio file (MP3, WAV, FLAC, M4A, OGG,…) or record via your microphone.
-        Then choose the language/model and hit **Transcribe**.
-        """
-    )
-    with gr.Row():
-        lang = gr.Dropdown(
-            choices=list(language_models.keys()),
-            value=list(language_models.keys())[0],
-            label="Select Language / Model"
-        )
-    with gr.Row():
-        audio = gr.Audio(
-            sources=["upload", "microphone"],
-            type="filepath",
-            label="Upload or Record Audio"
         )
-    btn = gr.Button("Transcribe")
-    output = gr.Textbox(label="Transcription")
-    # Hidden state to carry metadata from transcribe -> feedback
-    meta_state = gr.State(value=None)
-    # Keep original behavior: output shows transcript
-    # Also capture meta into the hidden state
-    def _transcribe_and_store(audio_path, language):
-        hyp, meta = transcribe(audio_path, language)
-        # Pre-fill corrected with hypothesis for easy edits
-        return hyp, meta, hyp
-    # --- Minimal Evaluation (score + optional corrected text) ---
-    with gr.Accordion("Evaluation", open=False):
-        with gr.Row():
-            corrected_tb = gr.Textbox(label="Corrected transcript (optional)", lines=4, value="")
         with gr.Row():
-            score_slider = gr.Slider(minimum=0, maximum=10, step=1, label="Score out of 10", value=7)
-        with gr.Row():
-            store_audio_cb = gr.Checkbox(label="Allow storing my audio for research/eval", value=False)
-            share_cb = gr.Checkbox(label="Allow sharing this example publicly", value=False)
-        submit_btn = gr.Button("Submit")
-        results_json = gr.JSON(label="Status")
-    # Wire events
-    btn.click(
-        fn=_transcribe_and_store,
-        inputs=[audio, lang],
-        outputs=[output, meta_state, corrected_tb]
-    )
-    submit_btn.click(
-        fn=submit_feedback,
-        inputs=[
-            meta_state,
-            corrected_tb,
-            score_slider,
-            store_audio_cb,
-            share_cb,
-            audio  # raw file path from gr.Audio
-        ],
-        outputs=results_json
-    )
-# Keep Spaces stable under load
 if __name__ == "__main__":
-    demo.queue()
-    demo.launch()

+# app.py (MP3-robust loader + Robust HF Dataset Appending)
 import os
 import json
 import numpy as np
 import soundfile as sf  # librosa depends on this; good for wav/flac/ogg
 import librosa  # fallback / resampling
+import pandas as pd
+import pyarrow.parquet as pq
+import pyarrow as pa
+from huggingface_hub import HfApi
+from typing import Optional, Tuple, Dict, Any
 # Optional: modest thread hints for CPU Spaces
 try:
 except Exception:
     pass
+# Setup logging with more detail
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# -------- CONFIG: Hub dataset target --------
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
+# Initialize HF API client once
+hf_api = HfApi() if PUSH_TO_HF else None
 # --- Map display names to your HF Hub model IDs ---
 language_models = {
     "Akan (Asante Twi)":        "FarmerlineML/w2v-bert-2.0_twi_alpha_v1",
     "Ewe":                      "FarmerlineML/w2v-bert-2.0_ewe_2",
     "Kiswahili":                "FarmerlineML/w2v-bert-2.0_swahili_alpha",
+    "Luganda":                  "FarmerlineML/w2v-bert-2.0_luganda",
     "Brazilian Portuguese":     "FarmerlineML/w2v-bert-2.0_brazilian_portugese_alpha",
     "Fante":                    "misterkissi/w2v2-lg-xls-r-300m-fante",
     "Bemba":                    "DarliAI/kissi-w2v2-lg-xls-r-300m-bemba",
     "Amharic":                  "misterkissi/w2v2-lg-xls-r-1b-amharic",
     "Xhosa":                    "misterkissi/w2v2-lg-xls-r-300m-xhosa",
     "Tsonga":                   "misterkissi/w2v2-lg-xls-r-300m-tsonga",
     "Yoruba":                   "FarmerlineML/w2v-bert-2.0_yoruba_v1",
     "Luo":                      "FarmerlineML/w2v-bert-2.0_luo_v2",
     "Somali":                   "FarmerlineML/w2v-bert-2.0_somali_alpha",
     "Pidgin":                   "FarmerlineML/pidgin_nigerian",
     "Kikuyu":                   "FarmerlineML/w2v-bert-2.0_kikuyu",
     "Igbo":                     "FarmerlineML/w2v-bert-2.0_igbo_v1",
+    "Krio":                     "FarmerlineML/w2v-bert-2.0_krio_v3",
 }
+# -------- Robust Dataset Push Function --------
+def _push_row_to_hf_dataset(row: Dict[str, Any], audio_file_path: Optional[str]) -> str:
+    """
+    Append a single example to the HF dataset repo using Parquet files.
+    Each submission creates a new Parquet file to avoid overwrites.
+    """
+    if not PUSH_TO_HF:
+        return "HF push disabled (missing HF_TOKEN or repo)."
+    if not hf_api:
+        return "HF API client not initialized."
+    # Create a copy of the row to avoid modifying the original
+    example = dict(row)
+    # Generate unique identifiers for this submission
+    timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
+    unique_id = str(uuid.uuid4())[:8]
+    # Handle audio file if provided and user consented
+    audio_uploaded = False
+    if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly", False):
+        try:
+            # Store reference to audio file in the dataset
+            audio_filename = f"audio_{timestamp}_{unique_id}{os.path.splitext(audio_file_path)[1]}"
+            example["audio_filename"] = audio_filename
+            # Upload audio file separately
+            logger.info(f"Uploading audio file: {audio_filename}")
+            hf_api.upload_file(
+                path_or_fileobj=audio_file_path,
+                path_in_repo=f"audio/{audio_filename}",
+                repo_id=HF_DATASET_REPO,
+                repo_type="dataset",
+                token=HF_TOKEN,
+                commit_message=f"Add audio for feedback {timestamp}"
+            )
+            audio_uploaded = True
+            logger.info("Audio file uploaded successfully")
+        except Exception as e:
+            logger.error(f"Failed to upload audio: {e}")
+            example["audio_filename"] = None
+    else:
+        example["audio_filename"] = None
+    # Normalize data types for Parquet storage
+    def _safe_cast(value, cast_func, default=None):
+        """Safely cast a value to a type, returning default on failure."""
+        try:
+            return cast_func(value) if value is not None else default
+        except (ValueError, TypeError):
+            return default
+    # Type normalization
+    example["latency_ms"] = _safe_cast(example.get("latency_ms"), int)
+    example["score_out_of_10"] = _safe_cast(example.get("score_out_of_10"), int)
+    example["sample_rate"] = _safe_cast(example.get("sample_rate"), int)
+    example["rtf"] = _safe_cast(example.get("rtf"), float)
+    example["audio_duration_s"] = _safe_cast(example.get("audio_duration_s"), float)
+    example["share_publicly"] = bool(example.get("share_publicly", False))
+    # Ensure all string fields are properly handled
+    string_fields = ["timestamp", "session_id", "language_display", "model_id",
+                    "model_revision", "source", "decode_params", "transcript_hyp",
+                    "corrected_text"]
+    for field in string_fields:
+        if field in example and example[field] is not None:
+            example[field] = str(example[field])
+    # Create DataFrame and save as Parquet
+    df = pd.DataFrame([example])
+    # Generate Parquet filename
+    parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
+    # Create temporary Parquet file
+    temp_parquet = None
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
+            temp_parquet = tmp_file.name
+            df.to_parquet(temp_parquet, engine='pyarrow', compression='snappy')
+        # Upload Parquet file to dataset repo
+        logger.info(f"Uploading feedback data: {parquet_filename}")
+        hf_api.upload_file(
+            path_or_fileobj=temp_parquet,
+            path_in_repo=f"data/{parquet_filename}",
+            repo_id=HF_DATASET_REPO,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message=f"Add feedback row {timestamp}"
+        )
+        logger.info("Feedback data uploaded successfully")
+        status_msg = f"Successfully pushed to HF Dataset as {parquet_filename}"
+        if audio_uploaded:
+            status_msg += " (with audio)"
+        return status_msg
+    except Exception as e:
+        logger.error(f"Failed to push to HF Dataset: {e}")
+        return f"Failed to push to HF Dataset: {str(e)}"
+    finally:
+        # Clean up temporary file
+        if temp_parquet and os.path.exists(temp_parquet):
+            try:
+                os.remove(temp_parquet)
+            except Exception as e:
+                logger.warning(f"Failed to remove temp file: {e}")
 # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
 TARGET_SR = 16000
+def _has_ffmpeg() -> bool:
+    """Check if ffmpeg is available in the system."""
     return shutil.which("ffmpeg") is not None
+def _load_with_soundfile(path: str) -> Tuple[np.ndarray, int]:
+    """Load audio using soundfile (for wav/flac/ogg)."""
     data, sr = sf.read(path, always_2d=False)
     if isinstance(data, np.ndarray) and data.ndim > 1:
         data = data.mean(axis=1)
     return data.astype(np.float32), sr
+def _load_with_ffmpeg(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
+    """Convert audio to mono wav using ffmpeg."""
     if not _has_ffmpeg():
         raise RuntimeError("ffmpeg not available")
     tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     tmp_wav.close()
     try:
+        cmd = [
+            "ffmpeg", "-hide_banner", "-loglevel", "error",
+            "-y", "-i", path,
+            "-ac", "1", "-ar", str(target_sr),
+            tmp_wav.name,
+        ]
+        subprocess.run(cmd, check=True)
+        data, sr = sf.read(tmp_wav.name, always_2d=False)
+        if isinstance(data, np.ndarray) and data.ndim > 1:
+            data = data.mean(axis=1)
+        return data.astype(np.float32), sr
+    finally:
+        try:
+            os.remove(tmp_wav.name)
+        except Exception:
+            pass
+def _resample_if_needed(y: np.ndarray, sr: int, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
+    """Resample audio if needed."""
     if sr == target_sr:
         return y.astype(np.float32), sr
     y_rs = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=target_sr)
     return y_rs.astype(np.float32), target_sr
+def load_audio_any(path: str, target_sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
     """Robust loader: wav/flac/ogg via soundfile; mp3/m4a via ffmpeg; fallback to librosa."""
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"Audio file not found: {path}")
     ext = os.path.splitext(path)[1].lower()
     try:
         if ext in {".wav", ".flac", ".ogg", ".opus"}:
             y, sr = _load_with_soundfile(path)
         else:
             # Fallback to librosa for formats like mp3/m4a when ffmpeg isn't present
             y, sr = librosa.load(path, sr=None, mono=True)
         y, sr = _resample_if_needed(y, sr, target_sr)
         return y, sr
     except Exception as e:
+        logger.warning(f"Primary load failed for {path} ({e}). Falling back to librosa.")
         y, sr = librosa.load(path, sr=target_sr, mono=True)
         return y.astype(np.float32), sr
 _CACHE_ORDER = []  # usage order
 _CACHE_MAX_SIZE = 3  # tune for RAM
+def _touch_cache(key: str):
+    """Update cache access order."""
     if key in _CACHE_ORDER:
         _CACHE_ORDER.remove(key)
     _CACHE_ORDER.insert(0, key)
 def _evict_if_needed():
+    """Evict least recently used pipelines if cache is full."""
     while len(_PIPELINE_CACHE) > _CACHE_MAX_SIZE:
+        if _CACHE_ORDER:
+            oldest = _CACHE_ORDER.pop()
+            if oldest in _PIPELINE_CACHE:
+                logger.info(f"Evicting pipeline from cache: {oldest}")
+                del _PIPELINE_CACHE[oldest]
 def get_asr_pipeline(language_display: str):
+    """Get or create ASR pipeline for the specified language."""
     if language_display not in language_models:
         raise ValueError(f"Unknown language selection: {language_display}")
         return _PIPELINE_CACHE[language_display]
     model_id = language_models[language_display]
+    logger.info(f"Loading pipeline for '{language_display}' -> {model_id}")
     pipe = pipeline(
         task="automatic-speech-recognition",
         model=model_id,
+        device=-1,  # CPU on Spaces
         chunk_length_s=30
     )
     _PIPELINE_CACHE[language_display] = pipe
     _touch_cache(language_display)
     _evict_if_needed()
 # -------- Helpers --------
 def _model_revision_from_pipeline(pipe) -> str:
+    """Best-effort capture of revision/hash for reproducibility."""
     for attr in ("hub_revision", "revision", "_commit_hash"):
         val = getattr(getattr(pipe, "model", None), attr, None)
         if val:
         return "unknown"
 # -------- Inference --------
+def transcribe(audio_path: str, language: str) -> Tuple[str, Optional[Dict[str, Any]]]:
     """
     Robust audio load (mp3/m4a friendly), resample to 16 kHz mono,
     then run it through the chosen ASR pipeline.
     """
     if not audio_path:
         return "⚠️ Please upload or record an audio clip.", None
+    try:
+        # Load and process audio
+        speech, sr = load_audio_any(audio_path, target_sr=TARGET_SR)
+        duration_s = float(len(speech) / float(sr))
+        # Get ASR pipeline
+        pipe = get_asr_pipeline(language)
+        decode_params = {"chunk_length_s": getattr(pipe, "chunk_length_s", 30)}
+        # Run inference
+        logger.info(f"Running ASR inference for {language} on {duration_s:.2f}s audio")
+        t0 = time.time()
+        result = pipe({"sampling_rate": sr, "raw": speech})
+        latency_ms = int((time.time() - t0) * 1000.0)
+        hyp_text = result.get("text", "")
+        # Calculate real-time factor
+        rtf = (latency_ms / 1000.0) / max(duration_s, 1e-9)
+        # Prepare metadata
+        meta = {
+            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+            "session_id": f"anon-{uuid.uuid4()}",
+            "language_display": language,
+            "model_id": language_models.get(language, "unknown"),
+            "model_revision": _model_revision_from_pipeline(pipe),
+            "audio_duration_s": duration_s,
+            "sample_rate": sr,
+            "source": "upload",
+            "decode_params": json.dumps(decode_params),
+            "transcript_hyp": hyp_text,
+            "latency_ms": latency_ms,
+            "rtf": rtf,
+        }
+        logger.info(f"Transcription complete. RTF: {rtf:.3f}")
+        return hyp_text, meta
+    except Exception as e:
+        logger.error(f"Transcription failed: {e}")
+        return f"❌ Transcription failed: {str(e)}", None
+# -------- Feedback submit --------
+def submit_feedback(
+    meta: Optional[Dict[str, Any]],
+    corrected_text: str,
+    score: int,
+    store_audio: bool,
+    share_publicly: bool,
+    audio_file_path: Optional[str]
+) -> Dict[str, Any]:
     """
+    Submit feedback to HF Dataset with improved error handling.
     """
     if not meta:
+        return {
+            "status": "❌ No transcription metadata available. Please transcribe first.",
+            "success": False
+        }
+    # Prepare row data
     row = dict(meta)
     row.update({
         "corrected_text": (corrected_text or "").strip(),
         "score_out_of_10": int(score) if score is not None else None,
         "share_publicly": bool(share_publicly),
     })
+    # Push to HF Dataset
     try:
         audio_to_push = audio_file_path if store_audio else None
         hf_status = _push_row_to_hf_dataset(row, audio_to_push)
+        return {
+            "status": f"✅ {hf_status}",
+            "success": True,
+            "latency_ms": row["latency_ms"],
+            "rtf": f"{row['rtf']:.3f}",
+            "model_id": row["model_id"],
+            "model_revision": row["model_revision"],
+            "language": row["language_display"],
+        }
     except Exception as e:
+        logger.error(f"Failed to submit feedback: {e}")
+        return {
+            "status": f"❌ Failed to submit feedback: {str(e)}",
+            "success": False
+        }
+# -------- Gradio UI --------
+def create_demo():
+    """Create the Gradio demo interface."""
+    with gr.Blocks(
+        title="🌐 Multilingual ASR Demo",
+        theme=gr.themes.Soft()
+    ) as demo:
+        gr.Markdown(
+            """
+            # 🎙️ Multilingual Speech-to-Text Demo
+            Upload an audio file (MP3, WAV, FLAC, M4A, OGG, etc.) or record via your microphone.
+            Then choose the language/model and hit **Transcribe**.
+            ---
+            """
         )
         with gr.Row():
+            with gr.Column(scale=1):
+                lang = gr.Dropdown(
+                    choices=list(language_models.keys()),
+                    value=list(language_models.keys())[0],
+                    label="Select Language / Model",
+                    info="Choose the language of your audio"
+                )
+                audio = gr.Audio(
+                    sources=["upload", "microphone"],
+                    type="filepath",
+                    label="Upload or Record Audio",
+                    elem_id="audio-input"
+                )
+                btn = gr.Button("🎯 Transcribe", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                output = gr.Textbox(
+                    label="Transcription",
+                    placeholder="Transcription will appear here...",
+                    lines=5
+                )
+                # Status indicators
+                with gr.Row():
+                    status_box = gr.Textbox(
+                        label="Status",
+                        interactive=False,
+                        placeholder="Ready",
+                        max_lines=1
+                    )
+        # Hidden state to carry metadata from transcribe -> feedback
+        meta_state = gr.State(value=None)
+        # Evaluation section
+        with gr.Accordion("📝 Evaluation & Feedback", open=False):
+            gr.Markdown(
+                """
+                Help us improve! Please provide feedback on the transcription quality.
+                """
+            )
+            with gr.Row():
+                corrected_tb = gr.Textbox(
+                    label="Corrected transcript (optional)",
+                    placeholder="If there are errors, type the correct transcription here...",
+                    lines=4,
+                    value=""
+                )
+            with gr.Row():
+                score_slider = gr.Slider(
+                    minimum=0,
+                    maximum=10,
+                    step=1,
+                    label="Quality Score (0 = terrible, 10 = perfect)",
+                    value=7,
+                    info="Rate the transcription quality"
+                )
+            with gr.Row():
+                store_audio_cb = gr.Checkbox(
+                    label="Allow storing my audio for research/evaluation",
+                    value=False,
+                    info="Audio will be stored securely and used only for improving the models"
+                )
+                share_cb = gr.Checkbox(
+                    label="Allow sharing this example publicly",
+                    value=False,
+                    info="Your example may be used in public datasets or demos"
+                )
+            submit_btn = gr.Button("📤 Submit Feedback", variant="secondary")
+            results_json = gr.JSON(
+                label="Submission Result",
+                visible=True
+            )
+        # Examples section
+        with gr.Accordion("📚 Example Usage", open=False):
+            gr.Markdown(
+                """
+                ### Tips for best results:
+                - Speak clearly and at a normal pace
+                - Minimize background noise
+                - Keep recordings under 30 seconds for optimal performance
+                - Select the correct language before transcribing
+                ### Supported formats:
+                WAV, MP3, FLAC, M4A, OGG, OPUS, and more!
+                """
+            )
+        # Wire up events
+        def _transcribe_and_update(audio_path, language):
+            """Transcribe and update UI components."""
+            if not audio_path:
+                return "", None, "", "⚠️ Please provide audio"
+            status_box_val = f"🔄 Processing {language}..."
+            hyp, meta = transcribe(audio_path, language)
+            if meta:
+                status_msg = f"✅ Done! (RTF: {meta['rtf']:.3f})"
+                # Pre-fill corrected with hypothesis for easy edits
+                return hyp, meta, hyp, status_msg
+            else:
+                return hyp, None, "", "❌ Transcription failed"
+        btn.click(
+            fn=_transcribe_and_update,
+            inputs=[audio, lang],
+            outputs=[output, meta_state, corrected_tb, status_box]
+        )
+        submit_btn.click(
+            fn=submit_feedback,
+            inputs=[
+                meta_state,
+                corrected_tb,
+                score_slider,
+                store_audio_cb,
+                share_cb,
+                audio
+            ],
+            outputs=results_json
+        )
+        # Set the status box to "Ready" when the page loads
+        demo.load(
+            fn=lambda: "Ready",
+            inputs=[],
+            outputs=[status_box]
+        )
+    return demo
+# -------- Main --------
 if __name__ == "__main__":
+    # Log startup info
+    logger.info(f"Starting ASR Demo")
+    logger.info(f"HF Dataset Repo: {HF_DATASET_REPO}")
+    logger.info(f"Push to HF enabled: {PUSH_TO_HF}")
+    logger.info(f"Available languages: {len(language_models)}")
+    # Create and launch demo
+    demo = create_demo()
+    demo.queue(max_size=10)  # Limit queue size for stability
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False  # Set to True if you want a public link
+    )