Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Sleeping

App Files Files Community

sudhanm commited on 16 days ago

Commit

b7a8eef

verified ·

1 Parent(s): a950033

Update app.py

Browse files

Files changed (1) hide show

app.py +210 -196

app.py CHANGED Viewed

@@ -5,21 +5,27 @@ import re
 import jiwer
 import torch
 import numpy as np
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
-import unicodedata
 import warnings
-import spaces
 warnings.filterwarnings("ignore")
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🔧 Using device: {DEVICE}")
-DEVICE_INDEX = 0 if DEVICE == "cuda" else -1
 LANG_CODES = {
     "English": "en",
@@ -27,8 +33,10 @@ LANG_CODES = {
     "Malayalam": "ml"
 }
 INDICWHISPER_MODEL = "openai/whisper-large-v2"
 SPECIALIZED_MODELS = {
     "English": "openai/whisper-base.en",
     "Tamil": "vasista22/whisper-tamil-large-v2",
@@ -83,23 +91,83 @@ SENTENCE_BANK = {
     ]
 }
-# Controls for stricter script checking and normalization
-STRICT_SCRIPT_CHECK = False  # set True for strict script-only validation
-NORMALIZE_TEXT_FOR_METRICS = True
 # ---------------- MODEL CACHE ---------------- #
 indicwhisper_pipeline = None
 fallback_models = {}
-WHISPER_JAX_AVAILABLE = False
-def normalize_text(s: str) -> str:
-    if not NORMALIZE_TEXT_FOR_METRICS:
-        return s
-    # Normalize unicode and collapse whitespace; do not remove language-specific punctuation
-    s = unicodedata.normalize("NFC", s)
-    s = re.sub(r"\s+", " ", s).strip()
-    return s
 def get_random_sentence(language_choice):
     return random.choice(SENTENCE_BANK[language_choice])
@@ -107,16 +175,7 @@ def is_script(text, lang_name):
     pattern = SCRIPT_PATTERNS.get(lang_name)
     if not pattern:
         return True
-    if not STRICT_SCRIPT_CHECK:
-        # any occurrence of script chars counts as match
-        return bool(pattern.search(text))
-    # strict: allow only spaces and target script chars
-    for ch in text:
-        if ch.isspace():
-            continue
-        if not pattern.match(ch):
-            return False
-    return True
 def transliterate_to_hk(text, lang_choice):
     mapping = {
@@ -125,6 +184,8 @@ def transliterate_to_hk(text, lang_choice):
         "English": None
     }
     script = mapping.get(lang_choice)
     if script and is_script(text, lang_choice):
         try:
             return transliterate(text, script, sanscript.HK)
@@ -134,111 +195,75 @@ def transliterate_to_hk(text, lang_choice):
     return text
 def preprocess_audio(audio_path, target_sr=16000):
     try:
-        audio, sr = librosa.load(audio_path, sr=target_sr)
-        if np.max(np.abs(audio)) > 0:
-            audio = audio / np.max(np.abs(audio))
         audio, _ = librosa.effects.trim(audio, top_db=20)
-        if len(audio) < target_sr * 0.1:
             return None, None
         return audio, target_sr
     except Exception as e:
         print(f"Audio preprocessing error: {e}")
         return None, None
-@spaces.GPU
-def load_indicwhisper():
-    global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE
-    if indicwhisper_pipeline is None:
-        try:
-            # Try JAX pipeline
-            try:
-                from whisper_jax import FlaxWhisperPipeline
-                import jax.numpy as jnp
-                print(f"🔄 Loading JAX-optimized model: {INDICWHISPER_MODEL}")
-                indicwhisper_pipeline = FlaxWhisperPipeline(
-                    INDICWHISPER_MODEL,
-                    dtype=jnp.bfloat16,
-                    batch_size=1
-                )
-                WHISPER_JAX_AVAILABLE = True
-                print("✅ JAX-optimized model loaded successfully!")
-                return indicwhisper_pipeline
-            except Exception as e:
-                print(f"⚠️ JAX loading failed: {e}")
-                WHISPER_JAX_AVAILABLE = False
-            # Fallback to transformers pipeline
-            print(f"🔄 Loading transformers pipeline: {INDICWHISPER_MODEL}")
-            from transformers import pipeline
-            indicwhisper_pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=INDICWHISPER_MODEL,
-                device=DEVICE_INDEX,
-                torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
-            )
-            print("✅ High-performance model loaded with transformers!")
-        except Exception as e:
-            print(f"❌ Failed to load primary model: {e}")
-            indicwhisper_pipeline = None
-            raise Exception(f"Could not load high-performance model: {str(e)}")
-    return indicwhisper_pipeline
-@spaces.GPU
-def load_specialized_model(language):
-    if language not in fallback_models:
-        model_name = SPECIALIZED_MODELS[language]
-        print(f"🔄 Loading specialized model for {language}: {model_name}")
-        try:
-            processor = AutoProcessor.from_pretrained(model_name)
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-                low_cpu_mem_usage=True,
-                use_safetensors=True
-            ).to(DEVICE)
-            model.eval()
-            fallback_models[language] = {"processor": processor, "model": model, "model_name": model_name}
-            print(f"✅ Specialized model loaded for {language}")
-        except Exception as e:
-            print(f"❌ Failed to load specialized {model_name}: {e}")
-            raise Exception(f"Could not load specialized {language} model")
-    return fallback_models[language]
-@spaces.GPU
 def transcribe_with_primary_model(audio_path, language):
     try:
         pipe = load_indicwhisper()
-        if callable(pipe):
-            # Try to set forced decoder ids when available
-            if language != "English":
-                lang_code = LANG_CODES.get(language, "en")
                 try:
-                    if hasattr(pipe, "model") and hasattr(pipe, "tokenizer"):
-                        if hasattr(pipe.model, "config"):
-                            forced_ids = pipe.tokenizer.get_decoder_prompt_ids(
-                                language=lang_code, task="transcribe"
-                            )
-                            pipe.model.config.forced_decoder_ids = forced_ids
                 except Exception as e:
-                    print(f"⚠️ Language forcing failed: {e}")
-            result = pipe(audio_path)
-            if isinstance(result, dict) and "text" in result:
-                return result["text"].strip()
-            elif isinstance(result, str):
-                return result.strip()
-            else:
-                return str(result).strip()
         else:
-            return "Error: Pipeline not properly initialized"
     except Exception as e:
         print(f"Primary model transcription error: {e}")
-        raise e
-@spaces.GPU
 def transcribe_with_specialized_model(audio_path, language):
     try:
         components = load_specialized_model(language)
         processor = components["processor"]
@@ -248,15 +273,23 @@ def transcribe_with_specialized_model(audio_path, language):
         if audio is None:
             return "Error: Audio too short or could not be processed"
-        inputs = processor(
-            audio,
-            sampling_rate=sr,
-            return_tensors="pt",
-            padding=True
-        )
-        input_features = inputs.input_features.to(DEVICE)
-        forced_decoder_ids = None
         if language != "English":
             lang_code = LANG_CODES.get(language, "en")
             try:
@@ -265,60 +298,53 @@ def transcribe_with_specialized_model(audio_path, language):
                         language=lang_code,
                         task="transcribe"
                     )
             except Exception as e:
                 print(f"⚠️ Language forcing failed: {e}")
         with torch.no_grad():
-            gen_kwargs = {
-                "max_length": 200,
-                "num_beams": 3,
-                "do_sample": False
-            }
-            if forced_decoder_ids:
-                gen_kwargs["forced_decoder_ids"] = forced_decoder_ids
-            predicted_ids = model.generate(
-                input_features,
-                **gen_kwargs
-            )
         transcription = processor.batch_decode(
             predicted_ids,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True
         )[0]
-        return transcription.strip() or "(No transcription generated)"
     except Exception as e:
         print(f"Specialized model transcription error: {e}")
-        return f"Error: {str(e)[:150]}..."
-@spaces.GPU
 def transcribe_audio(audio_path, language, initial_prompt="", use_specialized=False):
     try:
         if use_specialized:
             print(f"🔄 Using specialized model for {language}")
             return transcribe_with_specialized_model(audio_path, language)
         else:
-            print(f"🔄 Using high-performance primary model for {language}")
             return transcribe_with_primary_model(audio_path, language)
     except Exception as e:
         print(f"Transcription failed, trying specialized model: {e}")
         if not use_specialized:
             return transcribe_audio(audio_path, language, initial_prompt, use_specialized=True)
         else:
-            return f"Error: All transcription methods failed - {str(e)[:100]}"
 def highlight_differences(ref, hyp):
-    if not ref.strip() or not hyp.strip():
         return "No text to compare"
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
     sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
     out_html = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
             out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
@@ -329,13 +355,11 @@ def highlight_differences(ref, hyp):
             out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
             out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
-    if not ref.strip() or not hyp.strip():
         return "No text to compare"
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
@@ -360,63 +384,50 @@ def get_pronunciation_score(wer_val, cer_val):
     else:
         return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
-@spaces.GPU
 def compare_pronunciation(audio, language_choice, intended_sentence):
-    print(f"🔍 Starting advanced analysis with language: {language_choice}")
     print(f"📝 Audio file: {audio}")
     print(f"🎯 Intended sentence: {intended_sentence}")
     if audio is None:
-        print("❌ No audio provided")
         return ("❌ Please record audio first.", "", "", "", "", "", "", "")
-    if not intended_sentence.strip():
-        print("❌ No intended sentence")
         return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "")
     try:
-        print(f"🔄 Starting Pass 1: High-performance model transcription...")
         primary_text = transcribe_audio(audio, language_choice, use_specialized=False)
-        print(f"✅ Primary model result: {primary_text}")
-        print("🔄 Starting Pass 2: Specialized model transcription...")
         specialized_text = transcribe_audio(audio, language_choice, use_specialized=True)
-        print(f"✅ Specialized model result: {specialized_text}")
         actual_text = primary_text if not str(primary_text).startswith("Error:") else specialized_text
         if str(actual_text).startswith("Error:"):
-            print(f"❌ Transcription error: {actual_text}")
             return (f"❌ {actual_text}", "", "", "", "", "", "", "")
-        # Normalize for metrics if enabled
-        ref_for_metrics = normalize_text(intended_sentence)
-        hyp_for_metrics = normalize_text(actual_text)
         try:
-            print("🔄 Calculating error metrics...")
-            wer_val = jiwer.wer(ref_for_metrics, hyp_for_metrics)
-            cer_val = jiwer.cer(ref_for_metrics, hyp_for_metrics)
-            print(f"✅ WER: {wer_val:.3f}, CER: {cer_val:.3f}")
         except Exception as e:
-            print(f"❌ Error calculating metrics: {e}")
             wer_val, cer_val = 1.0, 1.0
         score_text, feedback = get_pronunciation_score(wer_val, cer_val)
-        print("🔄 Generating transliterations...")
         actual_hk = transliterate_to_hk(actual_text, language_choice)
         target_hk = transliterate_to_hk(intended_sentence, language_choice)
-        if not is_script(actual_text, language_choice) and language_choice != "English":
             actual_hk = f"⚠️ Expected {language_choice} script, got mixed/other script"
-        print("🔄 Generating visual feedback...")
         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)
-        status = f"✅ Advanced Analysis Complete - {score_text}\n💬 {feedback}\n🚀 Powered by High-Performance ASR Models"
-        print(f"✅ Advanced analysis completed successfully")
         return (
             status,
@@ -431,14 +442,11 @@ def compare_pronunciation(audio, language_choice, intended_sentence):
     except Exception as e:
         error_msg = f"❌ Analysis Error: {str(e)[:200]}"
-        print(f"❌ FATAL ERROR: {e}")
-        import traceback
-        traceback.print_exc()
         return (error_msg, str(e), "", "", "", "", "", "")
 def create_interface():
     with gr.Blocks(title="🎙️ SOTA Multilingual Pronunciation Trainer") as demo:
         gr.Markdown("""
         # 🎙️ Advanced Multilingual Pronunciation Trainer
@@ -446,12 +454,12 @@ def create_interface():
         ### 🏆 Powered by Advanced Models:
         - Dual-Model Analysis: Primary + specialized model comparison
-        - High Accuracy: Language-specific fine-tuned models
         - Robust Performance: Automatic fallback for reliability
         ### 📋 How to Use:
         1. Select your target language 🌍
-        2. Generate a practice sentence 🎲
         3. Record yourself reading it aloud 🎤
         4. Get detailed feedback with advanced accuracy 📊
@@ -520,7 +528,7 @@ def create_interface():
             gr.Markdown("""
             ### 🎨 Color Guide:
             - 🟢 Green: Correctly pronounced words/characters
-            - 🔴 Red: Missing or mispronounced (strikethrough)
             - 🟠 Orange: Extra words or substitutions
             """)
             diff_html_box = gr.HTML(label="🔍 Word-Level Analysis", show_label=True)
@@ -542,14 +550,14 @@ def create_interface():
             fn=compare_pronunciation,
             inputs=[audio_input, lang_choice, intended_display],
             outputs=[
-                status_output,
-                pass1_out,
-                pass2_out,
-                wer_out,
-                cer_out,
-                diff_html_box,
-                char_html_box,
-                target_display
             ]
         )
@@ -563,27 +571,33 @@ def create_interface():
         ---
         ### 🏆 Advanced Technology Stack:
         - Primary ASR: OpenAI Whisper Large v2 (High-performance multilingual model)
-        - Specialized Models:
-          - Tamil: vasista22/whisper-tamil-large-v2
-          - Malayalam: thennal/whisper-medium-ml
-          - English: OpenAI Whisper Base EN
-        - Dual Analysis and Automatic Fallback
         ### 🔧 Technical Details:
-        - Metrics: WER and CER
-        - Transliteration: Harvard-Kyoto for Indic scripts
-        - Languages: English, Tamil, Malayalam
         """)
     return demo
 if __name__ == "__main__":
     print("🚀 Starting Advanced Multilingual Pronunciation Trainer...")
-    print(f"🔧 Device: {DEVICE} (index={DEVICE_INDEX})")
-    print(f"🔧 PyTorch version: {torch.__version__}")
     print("🏆 Using High-Performance Dual-Model Approach")
     print("⚡ Automatic model selection with specialized fallbacks")
     print("📊 Advanced analysis with robust error handling")
-    print("🎮 GPU functions decorated with @spaces.GPU for HuggingFace Spaces")
     demo = create_interface()
     demo.launch(

 import jiwer
 import torch
 import numpy as np
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
 import warnings
+# Optional: only available on HF Spaces runtime
+try:
+    import spaces
+    GPU_DECORATOR = spaces.GPU
+except Exception:
+    def GPU_DECORATOR(fn):
+        return fn
 warnings.filterwarnings("ignore")
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+CUDA_DEVICE_INDEX = 0 if torch.cuda.is_available() else -1  # for transformers pipeline device
 print(f"🔧 Using device: {DEVICE}")
 LANG_CODES = {
     "English": "en",
     "Malayalam": "ml"
 }
+# Primary model
 INDICWHISPER_MODEL = "openai/whisper-large-v2"
+# Specialized models
 SPECIALIZED_MODELS = {
     "English": "openai/whisper-base.en",
     "Tamil": "vasista22/whisper-tamil-large-v2",
     ]
 }
 # ---------------- MODEL CACHE ---------------- #
 indicwhisper_pipeline = None
 fallback_models = {}
+WHISPER_JAX_AVAILABLE = False  # default false; will set true if we load it
+@GPU_DECORATOR
+def load_indicwhisper():
+    """Load primary high-performance model (prefer transformers pipeline, optionally JAX if available)."""
+    global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE
+    if indicwhisper_pipeline is not None:
+        return indicwhisper_pipeline
+    # Try JAX first (optional)
+    try:
+        from whisper_jax import FlaxWhisperPipeline
+        import jax.numpy as jnp
+        print(f"🔄 Loading JAX-optimized model: {INDICWHISPER_MODEL}")
+        indicwhisper_pipeline = FlaxWhisperPipeline(
+            INDICWHISPER_MODEL,
+            dtype=jnp.bfloat16,
+            batch_size=1
+        )
+        WHISPER_JAX_AVAILABLE = True
+        print("✅ JAX-optimized model loaded successfully!")
+        return indicwhisper_pipeline
+    except Exception as e:
+        print(f"⚠️ JAX loading failed: {e}")
+        WHISPER_JAX_AVAILABLE = False
+    # Fallback to transformers pipeline
+    try:
+        from transformers import pipeline
+        print(f"🔄 Loading transformers ASR pipeline: {INDICWHISPER_MODEL}")
+        indicwhisper_pipeline = pipeline(
+            task="automatic-speech-recognition",
+            model=INDICWHISPER_MODEL,
+            device=CUDA_DEVICE_INDEX  # 0 for CUDA, -1 for CPU
+        )
+        print("✅ Transformers ASR pipeline loaded!")
+        return indicwhisper_pipeline
+    except Exception as e:
+        print(f"❌ Failed to load primary model: {e}")
+        indicwhisper_pipeline = None
+        raise Exception(f"Could not load primary model: {str(e)}")
+@GPU_DECORATOR
+def load_specialized_model(language: str):
+    """Load language-specific specialized model with processor."""
+    if language in fallback_models:
+        return fallback_models[language]
+    model_name = SPECIALIZED_MODELS[language]
+    print(f"🔄 Loading specialized model for {language}: {model_name}")
+    try:
+        # WhisperProcessor ensures get_decoder_prompt_ids is available
+        try:
+            processor = WhisperProcessor.from_pretrained(model_name)
+        except Exception:
+            processor = AutoProcessor.from_pretrained(model_name)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+            low_cpu_mem_usage=True
+        )
+        model.to(DEVICE)
+        fallback_models[language] = {"processor": processor, "model": model, "model_name": model_name}
+        print(f"✅ Specialized model loaded for {language}")
+        return fallback_models[language]
+    except Exception as e:
+        print(f"❌ Failed to load specialized {model_name}: {e}")
+        raise Exception(f"Could not load specialized {language} model: {str(e)}")
+# ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return random.choice(SENTENCE_BANK[language_choice])
     pattern = SCRIPT_PATTERNS.get(lang_name)
     if not pattern:
         return True
+    return bool(pattern.search(text or ""))
 def transliterate_to_hk(text, lang_choice):
     mapping = {
         "English": None
     }
     script = mapping.get(lang_choice)
+    if not text:
+        return ""
     if script and is_script(text, lang_choice):
         try:
             return transliterate(text, script, sanscript.HK)
     return text
 def preprocess_audio(audio_path, target_sr=16000):
+    """Load, normalize, trim, return float32 audio."""
     try:
+        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
+        if audio is None or len(audio) == 0:
+            return None, None
+        # Normalize
+        m = np.max(np.abs(audio))
+        if m > 0:
+            audio = audio / m
+        # Trim silence
         audio, _ = librosa.effects.trim(audio, top_db=20)
+        # Ensure min length
+        if len(audio) < int(target_sr * 0.1):
             return None, None
+        # Ensure float32
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
         return audio, target_sr
     except Exception as e:
         print(f"Audio preprocessing error: {e}")
         return None, None
+@GPU_DECORATOR
 def transcribe_with_primary_model(audio_path, language):
+    """Transcribe using primary model (JAX if available else transformers pipeline)."""
     try:
         pipe = load_indicwhisper()
+        lang_code = LANG_CODES.get(language, "en")
+        if WHISPER_JAX_AVAILABLE:
+            # whisper-jax expects array or path; pass path is okay
+            result = pipe(audio_path, task="transcribe", language=lang_code)
+            # whisper-jax returns dict with 'text'
+            if isinstance(result, dict) and "text" in result:
+                return (result["text"] or "").strip()
+            return str(result).strip()
+        # transformers pipeline
+        # Some transformers versions accept language/task via generate_kwargs
+        generate_kwargs = {}
+        try:
+            # If underlying model is Whisper, we can set forced decoder ids
+            model = pipe.model if hasattr(pipe, "model") else None
+            tokenizer = getattr(pipe, "tokenizer", None)
+            processor = getattr(pipe, "feature_extractor", None)
+            if hasattr(pipe, "tokenizer") and hasattr(model, "config"):
                 try:
+                    forced_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang_code, task="transcribe")
+                    model.config.forced_decoder_ids = forced_ids
                 except Exception as e:
+                    print(f"⚠️ Primary model language forcing failed: {e}")
+        except Exception as e:
+            print(f"⚠️ Primary model prompt config error: {e}")
+        out = pipe(audio_path, generate_kwargs=generate_kwargs)
+        if isinstance(out, dict) and "text" in out:
+            return (out["text"] or "").strip()
+        elif isinstance(out, str):
+            return out.strip()
         else:
+            return str(out).strip()
     except Exception as e:
         print(f"Primary model transcription error: {e}")
+        return f"Error: {str(e)[:200]}"
+@GPU_DECORATOR
 def transcribe_with_specialized_model(audio_path, language):
+    """Transcribe using language-specific models."""
     try:
         components = load_specialized_model(language)
         processor = components["processor"]
         if audio is None:
             return "Error: Audio too short or could not be processed"
+        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
+        # WhisperProcessor returns input_features
+        input_features = inputs.get("input_features", None)
+        if input_features is None:
+            # Fallback: some processors use feature_extractor path
+            input_features = inputs.get("input_values", None)
+        if input_features is None:
+            return "Error: Could not prepare input features"
+        input_features = input_features.to(DEVICE)
+        generate_kwargs = {
+            "max_length": 200,
+            "num_beams": 3,
+            "do_sample": False
+        }
         if language != "English":
             lang_code = LANG_CODES.get(language, "en")
             try:
                         language=lang_code,
                         task="transcribe"
                     )
+                    generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
+                elif hasattr(model, "config") and hasattr(processor, "tokenizer"):
+                    forced_decoder_ids = processor.tokenizer.get_decoder_prompt_ids(
+                        language=lang_code,
+                        task="transcribe"
+                    )
+                    model.config.forced_decoder_ids = forced_decoder_ids
             except Exception as e:
                 print(f"⚠️ Language forcing failed: {e}")
         with torch.no_grad():
+            predicted_ids = model.generate(input_features=input_features, **generate_kwargs)
         transcription = processor.batch_decode(
             predicted_ids,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True
         )[0]
+        return (transcription or "").strip() or "(No transcription generated)"
     except Exception as e:
         print(f"Specialized model transcription error: {e}")
+        return f"Error: {str(e)[:200]}"
+@GPU_DECORATOR
 def transcribe_audio(audio_path, language, initial_prompt="", use_specialized=False):
+    """Dispatch to primary or specialized path with fallback."""
     try:
         if use_specialized:
             print(f"🔄 Using specialized model for {language}")
             return transcribe_with_specialized_model(audio_path, language)
         else:
+            print(f"🔄 Using primary model for {language}")
             return transcribe_with_primary_model(audio_path, language)
     except Exception as e:
         print(f"Transcription failed, trying specialized model: {e}")
         if not use_specialized:
             return transcribe_audio(audio_path, language, initial_prompt, use_specialized=True)
         else:
+            return f"Error: All transcription methods failed - {str(e)[:200]}"
 def highlight_differences(ref, hyp):
+    if not (ref or "").strip() or not (hyp or "").strip():
         return "No text to compare"
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
     sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
     out_html = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
             out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
             out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
             out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
+    if not (ref or "").strip() or not (hyp or "").strip():
         return "No text to compare"
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
     else:
         return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
+# ---------------- MAIN FUNCTION ---------------- #
+@GPU_DECORATOR
 def compare_pronunciation(audio, language_choice, intended_sentence):
+    print(f"🔍 Starting analysis with language: {language_choice}")
     print(f"📝 Audio file: {audio}")
     print(f"🎯 Intended sentence: {intended_sentence}")
     if audio is None:
         return ("❌ Please record audio first.", "", "", "", "", "", "", "")
+    if not (intended_sentence or "").strip():
         return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "")
     try:
+        print("🔄 Pass 1: Primary model transcription...")
         primary_text = transcribe_audio(audio, language_choice, use_specialized=False)
+        print(f"✅ Primary: {primary_text}")
+        print("🔄 Pass 2: Specialized model transcription...")
         specialized_text = transcribe_audio(audio, language_choice, use_specialized=True)
+        print(f"✅ Specialized: {specialized_text}")
         actual_text = primary_text if not str(primary_text).startswith("Error:") else specialized_text
         if str(actual_text).startswith("Error:"):
             return (f"❌ {actual_text}", "", "", "", "", "", "", "")
         try:
+            wer_val = jiwer.wer(intended_sentence, actual_text)
+            cer_val = jiwer.cer(intended_sentence, actual_text)
         except Exception as e:
+            print(f"❌ Metrics error: {e}")
             wer_val, cer_val = 1.0, 1.0
         score_text, feedback = get_pronunciation_score(wer_val, cer_val)
         actual_hk = transliterate_to_hk(actual_text, language_choice)
         target_hk = transliterate_to_hk(intended_sentence, language_choice)
+        if language_choice != "English" and not is_script(actual_text, language_choice):
             actual_hk = f"⚠️ Expected {language_choice} script, got mixed/other script"
         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)
+        status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}\n🚀 Powered by High-Performance ASR Models"
         return (
             status,
     except Exception as e:
         error_msg = f"❌ Analysis Error: {str(e)[:200]}"
         return (error_msg, str(e), "", "", "", "", "", "")
+# ---------------- UI ---------------- #
 def create_interface():
     with gr.Blocks(title="🎙️ SOTA Multilingual Pronunciation Trainer") as demo:
         gr.Markdown("""
         # 🎙️ Advanced Multilingual Pronunciation Trainer
         ### 🏆 Powered by Advanced Models:
         - Dual-Model Analysis: Primary + specialized model comparison
+        - High Accuracy: Language-specific fine-tuned models
         - Robust Performance: Automatic fallback for reliability
         ### 📋 How to Use:
         1. Select your target language 🌍
+        2. Generate a practice sentence 🎲
         3. Record yourself reading it aloud 🎤
         4. Get detailed feedback with advanced accuracy 📊
             gr.Markdown("""
             ### 🎨 Color Guide:
             - 🟢 Green: Correctly pronounced words/characters
+            - 🔴 Red: Missing or mispronounced (strikethrough)
             - 🟠 Orange: Extra words or substitutions
             """)
             diff_html_box = gr.HTML(label="🔍 Word-Level Analysis", show_label=True)
             fn=compare_pronunciation,
             inputs=[audio_input, lang_choice, intended_display],
             outputs=[
+                status_output,      # status
+                pass1_out,          # primary transcription
+                pass2_out,          # specialized transcription
+                wer_out,            # wer formatted
+                cer_out,            # cer formatted
+                diff_html_box,      # diff_html
+                char_html_box,      # char_html
+                target_display      # target_display
             ]
         )
         ---
         ### 🏆 Advanced Technology Stack:
         - Primary ASR: OpenAI Whisper Large v2 (High-performance multilingual model)
+        - Specialized Models: Fine-tuned language-specific models
+          - Tamil: vasista22/whisper-tamil-large-v2 (IIT Madras Speech Lab)
+          - Malayalam: thennal/whisper-medium-ml (Common Voice trained)
+          - English: openai/whisper-base.en (English-optimized)
+        - Dual Analysis: Primary + specialized model comparison
+        - Automatic Fallback: Ensures reliable results always
         ### 🔧 Technical Details:
+        - Metrics: WER (Word Error Rate) and CER (Character Error Rate)
+        - Transliteration: Harvard-Kyoto system for Indic scripts
+        - Analysis: Dual-model comparison for comprehensive feedback
+        - Languages: English, Tamil, and Malayalam
         """)
     return demo
+# ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
     print("🚀 Starting Advanced Multilingual Pronunciation Trainer...")
+    print(f"🔧 Device: {DEVICE}")
+    try:
+        torch_ver = getattr(torch, '__version__', 'unknown')
+    except Exception:
+        torch_ver = 'unknown'
+    print(f"🔧 PyTorch version: {torch_ver}")
     print("🏆 Using High-Performance Dual-Model Approach")
     print("⚡ Automatic model selection with specialized fallbacks")
     print("📊 Advanced analysis with robust error handling")
     demo = create_interface()
     demo.launch(