Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Sleeping

App Files Community

sudhanm committed 5 days ago

Commit

fa0e345

verified ·

1 Parent(s): 189dfd8

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -53

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ try:
 except:
     INDIC_OK = False
-# Optional HF Spaces decorator
 try:
     import spaces
     GPU_DECORATOR = spaces.GPU
@@ -43,16 +43,24 @@ SPECIALIZED_MODELS = {
     "Malayalam": "thennal/whisper-medium-ml",
 }
-# Scripts and banking
 SCRIPT_PATTERNS = {
     "Tamil": re.compile(r"[஀-௿]"),
     "Malayalam": re.compile(r"[ഀ-ൿ]"),
     "English": re.compile(r"[A-Za-z]"),
 }
 SENTENCE_BANK = {
-    "English": ["The sun sets over the beautiful horizon.", "Hard work always pays off in the end."],
-    "Tamil": ["இன்று நல்ல வானிலை உள்ளது.", "உழைப்பு எப்போதும் வெற்றியைத் தரும்."],
-    "Malayalam": ["എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.", "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."]
 }
 # Model cache
@@ -90,9 +98,11 @@ def preprocess_audio(audio_path, target_sr=16000):
         return audio, target_sr
     except: return None, None
-JIWER_TRANSFORM = jiwer.Compose([jiwer.ToLowerCase(), jiwer.RemovePunctuation(),
-                                 jiwer.RemoveMultipleSpaces(), jiwer.Strip(),
-                                 jiwer.ReduceToListOfListOfWords()])
 def compute_wer(ref,hyp):
     try: return jiwer.wer(ref, hyp, truth_transform=JIWER_TRANSFORM, hypothesis_transform=JIWER_TRANSFORM)
     except: return 1.0
@@ -103,24 +113,45 @@ def compute_cer(ref,hyp):
 # ---------------- MODEL LOADERS ---------------- #
 @GPU_DECORATOR
 def load_indicwhisper():
     global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE
-    if indicwhisper_pipeline: return indicwhisper_pipeline
     try:
-        from whisper_jax import FlaxWhisperPipeline; import jax.numpy as jnp
-        indicwhisper_pipeline = FlaxWhisperPipeline(INDICWHISPER_MODEL, dtype=jnp.bfloat16, batch_size=1)
         WHISPER_JAX_AVAILABLE = True
         print("✅ JAX IndicWhisper loaded!")
         return indicwhisper_pipeline
     except Exception as e:
-        print(f"⚠️ JAX unavailable: {e}"); WHISPER_JAX_AVAILABLE = False
     from transformers import pipeline
-    indicwhisper_pipeline = pipeline("automatic-speech-recognition", model=INDICWHISPER_MODEL, device=DEVICE_INDEX)
     print("✅ Transformers IndicWhisper loaded!")
     return indicwhisper_pipeline
 @GPU_DECORATOR
 def load_specialized_model(language):
-    if language in fallback_models: return fallback_models[language]
     from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
     name = SPECIALIZED_MODELS[language]
     proc = AutoProcessor.from_pretrained(name)
@@ -132,18 +163,27 @@ def load_specialized_model(language):
 @GPU_DECORATOR
 def transcribe_with_primary_model(audio_path, language):
     try:
-        pl = load_indicwhisper(); lang_code = LANG_CODES.get(language, "en")
         if WHISPER_JAX_AVAILABLE:
-            res = pl(audio_path, task="transcribe", language=lang_code)
-            if isinstance(res, dict): return res.get("text","").strip()
-            return str(res).strip()
-        if hasattr(pl, "model") and hasattr(pl, "tokenizer"):
-            try:
-                forced_ids = pl.tokenizer.get_decoder_prompt_ids(language=lang_code, task="transcribe")
-                pl.model.config.forced_decoder_ids = forced_ids
-            except: pass
         with amp_ctx():
-            out = pl(audio_path)
         if isinstance(out, dict): return (out.get("text") or "").strip()
         return str(out).strip()
     except Exception as e:
@@ -160,7 +200,9 @@ def transcribe_with_specialized_model(audio_path, language):
         gen_kwargs = {"inputs": feats, "max_length": 200, "num_beams": 3}
         if language != "English":
             try:
-                forced_ids = comp["processor"].tokenizer.get_decoder_prompt_ids(LANG_CODES[language], task="transcribe")
                 gen_kwargs["forced_decoder_ids"] = forced_ids
             except: pass
         with torch.no_grad(), amp_ctx():
@@ -178,24 +220,6 @@ def transcribe_audio(audio_path, language, use_specialized=False):
         return transcribe_with_primary_model(audio_path, language)
 # ---------------- MAIN ---------------- #
-@GPU_DECORATOR
-def compare_pronunciation(audio, lang_choice, intended):
-    if audio is None: return ("❌ Please record audio first.","","","","","","","")
-    if not intended.strip(): return ("❌ Please generate a sentence first.","","","","","","","")
-    ptext = transcribe_audio(audio, lang_choice, False)
-    stext = transcribe_audio(audio, lang_choice, True)
-    actual = ptext if not ptext.startswith("Error:") else stext
-    if actual.startswith("Error:"): return (f"❌ {actual}","","","","","","","")
-    wer_val, cer_val = compute_wer(intended, actual), compute_cer(intended, actual)
-    score, feedback = get_score(wer_val, cer_val)
-    return (f"✅ Done - {score}\n💬 {feedback}",
-            ptext, stext,
-            f"{wer_val:.3f} ({(1-wer_val)*100:.1f}%)",
-            f"{cer_val:.3f} ({(1-cer_val)*100:.1f}%)",
-            diff_html(intended, actual),
-            char_html(intended, actual),
-            f"🎯 Target: {intended}")
 def get_score(wer, cer):
     c = (wer*0.7)+(cer*0.3)
     if c <= 0.1: return "🏆 Excellent!","Outstanding!"
@@ -204,11 +228,7 @@ def get_score(wer, cer):
     elif c <= 0.6: return "📚 Needs Practice","Focus on clearer pronunciation."
     else: return "💪 Keep Trying!","Don't give up!"
-def diff_html(ref,hyp): return highlight_differences(ref,hyp)
-def char_html(ref,hyp): return char_level_highlight(ref,hyp)
-# Diff functions
-def highlight_differences(ref,hyp):
     ref_w, hyp_w = ref.split(), hyp.split()
     sm = difflib.SequenceMatcher(None, ref_w, hyp_w)
     out=[]
@@ -217,13 +237,11 @@ def highlight_differences(ref,hyp):
         elif tag=='replace':
             out += [f"<span style='color:red'>{w}</span>" for w in ref_w[i1:i2]]
             out += [f"<span style='color:orange'>→{w}</span>" for w in hyp_w[j1:j2]]
-        elif tag=='delete':
-            out += [f"<span style='color:red'>{w}</span>" for w in ref_w[i1:i2]]
-        elif tag=='insert':
-            out += [f"<span style='color:orange'>+{w}</span>" for w in hyp_w[j1:j2]]
     return " ".join(out)
-def char_level_highlight(ref,hyp):
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out=[]
     for tag,i1,i2,j1,j2 in sm.get_opcodes():
@@ -232,6 +250,29 @@ def char_level_highlight(ref,hyp):
         elif tag=='insert': out += [f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]]
     return "".join(out)
 # ---------------- UI ---------------- #
 def create_interface():
     with gr.Blocks() as demo:
@@ -256,6 +297,7 @@ def create_interface():
         lang.change(get_random_sentence, [lang], [intended])
     return demo
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

 except:
     INDIC_OK = False
+# Optional HF Spaces GPU decorator
 try:
     import spaces
     GPU_DECORATOR = spaces.GPU
     "Malayalam": "thennal/whisper-medium-ml",
 }
 SCRIPT_PATTERNS = {
     "Tamil": re.compile(r"[஀-௿]"),
     "Malayalam": re.compile(r"[ഀ-ൿ]"),
     "English": re.compile(r"[A-Za-z]"),
 }
 SENTENCE_BANK = {
+    "English": [
+        "The sun sets over the beautiful horizon.",
+        "Hard work always pays off in the end."
+    ],
+    "Tamil": [
+        "இன்று நல்ல வானிலை உள்ளது.",
+        "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
+    ],
+    "Malayalam": [
+        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
+        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
+    ]
 }
 # Model cache
         return audio, target_sr
     except: return None, None
+JIWER_TRANSFORM = jiwer.Compose([
+    jiwer.ToLowerCase(), jiwer.RemovePunctuation(),
+    jiwer.RemoveMultipleSpaces(), jiwer.Strip(),
+    jiwer.ReduceToListOfListOfWords()
+])
 def compute_wer(ref,hyp):
     try: return jiwer.wer(ref, hyp, truth_transform=JIWER_TRANSFORM, hypothesis_transform=JIWER_TRANSFORM)
     except: return 1.0
 # ---------------- MODEL LOADERS ---------------- #
 @GPU_DECORATOR
 def load_indicwhisper():
+    """
+    Load IndicWhisper (parthiv11/indic_whisper_nodcil) with matching config/weights.
+    Prefer whisper-jax if available, else Transformers pipeline.
+    """
     global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE
+    if indicwhisper_pipeline is not None:
+        return indicwhisper_pipeline
+    # Try JAX first
     try:
+        from whisper_jax import FlaxWhisperPipeline
+        import jax.numpy as jnp
+        print(f"🔄 Loading JAX IndicWhisper: {INDICWHISPER_MODEL}")
+        indicwhisper_pipeline = FlaxWhisperPipeline(
+            INDICWHISPER_MODEL, dtype=jnp.bfloat16, batch_size=1
+        )
         WHISPER_JAX_AVAILABLE = True
         print("✅ JAX IndicWhisper loaded!")
         return indicwhisper_pipeline
     except Exception as e:
+        print(f"⚠️ JAX unavailable: {e}")
+        WHISPER_JAX_AVAILABLE = False
+    # Transformers fallback — use model+tokenizer+feature_extractor from same repo
     from transformers import pipeline
+    indicwhisper_pipeline = pipeline(
+        "automatic-speech-recognition",
+        model=INDICWHISPER_MODEL,
+        tokenizer=INDICWHISPER_MODEL,
+        feature_extractor=INDICWHISPER_MODEL,
+        device=DEVICE_INDEX
+    )
     print("✅ Transformers IndicWhisper loaded!")
     return indicwhisper_pipeline
 @GPU_DECORATOR
 def load_specialized_model(language):
+    if language in fallback_models:
+        return fallback_models[language]
     from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
     name = SPECIALIZED_MODELS[language]
     proc = AutoProcessor.from_pretrained(name)
 @GPU_DECORATOR
 def transcribe_with_primary_model(audio_path, language):
     try:
+        pipe = load_indicwhisper()
+        lang_code = LANG_CODES.get(language, "en")
+        # JAX path
         if WHISPER_JAX_AVAILABLE:
+            result = pipe(audio_path, task="transcribe", language=lang_code)
+            if isinstance(result, dict) and "text" in result:
+                return result["text"].strip()
+            return str(result).strip()
+        # Transformers path
+        try:
+            if hasattr(pipe, "model") and hasattr(pipe, "tokenizer"):
+                forced_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang_code, task="transcribe")
+                if forced_ids is not None:
+                    pipe.model.config.forced_decoder_ids = forced_ids
+        except Exception as e:
+            print(f"⚠️ Lang forcing failed: {e}")
         with amp_ctx():
+            out = pipe(audio_path)
         if isinstance(out, dict): return (out.get("text") or "").strip()
         return str(out).strip()
     except Exception as e:
         gen_kwargs = {"inputs": feats, "max_length": 200, "num_beams": 3}
         if language != "English":
             try:
+                forced_ids = comp["processor"].tokenizer.get_decoder_prompt_ids(
+                    LANG_CODES[language], task="transcribe"
+                )
                 gen_kwargs["forced_decoder_ids"] = forced_ids
             except: pass
         with torch.no_grad(), amp_ctx():
         return transcribe_with_primary_model(audio_path, language)
 # ---------------- MAIN ---------------- #
 def get_score(wer, cer):
     c = (wer*0.7)+(cer*0.3)
     if c <= 0.1: return "🏆 Excellent!","Outstanding!"
     elif c <= 0.6: return "📚 Needs Practice","Focus on clearer pronunciation."
     else: return "💪 Keep Trying!","Don't give up!"
+def diff_html(ref,hyp):
     ref_w, hyp_w = ref.split(), hyp.split()
     sm = difflib.SequenceMatcher(None, ref_w, hyp_w)
     out=[]
         elif tag=='replace':
             out += [f"<span style='color:red'>{w}</span>" for w in ref_w[i1:i2]]
             out += [f"<span style='color:orange'>→{w}</span>" for w in hyp_w[j1:j2]]
+        elif tag=='delete': out += [f"<span style='color:red'>{w}</span>" for w in ref_w[i1:i2]]
+        elif tag=='insert': out += [f"<span style='color:orange'>+{w}</span>" for w in hyp_w[j1:j2]]
     return " ".join(out)
+def char_html(ref,hyp):
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out=[]
     for tag,i1,i2,j1,j2 in sm.get_opcodes():
         elif tag=='insert': out += [f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]]
     return "".join(out)
+@GPU_DECORATOR
+def compare_pronunciation(audio, lang_choice, intended):
+    if audio is None:
+        return ("❌ Please record first","","","","","","","")
+    if not intended.strip():
+        return ("❌ Please generate a sentence first","","","","","","","")
+    ptext = transcribe_audio(audio, lang_choice, False)
+    stext = transcribe_audio(audio, lang_choice, True)
+    actual = ptext if not ptext.startswith("Error:") else stext
+    if actual.startswith("Error:"):
+        return (f"❌ {actual}","","","","","","","")
+    wer_val, cer_val = compute_wer(intended, actual), compute_cer(intended, actual)
+    score, feedback = get_score(wer_val, cer_val)
+    return (f"✅ Done - {score}\n💬 {feedback}",
+            ptext, stext,
+            f"{wer_val:.3f} ({(1-wer_val)*100:.1f}%)",
+            f"{cer_val:.3f} ({(1-cer_val)*100:.1f}%)",
+            diff_html(intended, actual),
+            char_html(intended, actual),
+            f"🎯 Target: {intended}")
 # ---------------- UI ---------------- #
 def create_interface():
     with gr.Blocks() as demo:
         lang.change(get_random_sentence, [lang], [intended])
     return demo
+# ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)