Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Sleeping

App Files Files Community

sudhanm committed 17 days ago

Commit

60fa434

verified ·

1 Parent(s): b7a8eef

Update app.py

Browse files

Files changed (1) hide show

app.py +221 -457

app.py CHANGED Viewed

@@ -2,60 +2,59 @@ import gradio as gr
 import random
 import difflib
 import re
-import jiwer
 import torch
 import numpy as np
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor
 import librosa
 import soundfile as sf
-from indic_transliteration import sanscript
-from indic_transliteration.sanscript import transliterate
-import warnings
-# Optional: only available on HF Spaces runtime
 try:
     import spaces
     GPU_DECORATOR = spaces.GPU
-except Exception:
-    def GPU_DECORATOR(fn):
-        return fn
 warnings.filterwarnings("ignore")
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-CUDA_DEVICE_INDEX = 0 if torch.cuda.is_available() else -1  # for transformers pipeline device
 print(f"🔧 Using device: {DEVICE}")
 LANG_CODES = {
     "English": "en",
     "Tamil": "ta",
-    "Malayalam": "ml"
 }
-# Primary model
-INDICWHISPER_MODEL = "openai/whisper-large-v2"
-# Specialized models
 SPECIALIZED_MODELS = {
     "English": "openai/whisper-base.en",
     "Tamil": "vasista22/whisper-tamil-large-v2",
-    "Malayalam": "thennal/whisper-medium-ml"
-}
-LANG_PRIMERS = {
-    "English": ("Transcribe in English.",
-                "Write only in English. Example: This is an English sentence."),
-    "Tamil": ("தமிழில் எழுதுக.",
-              "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும். உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
-    "Malayalam": ("മലയാളത്തിൽ എഴുതുക.",
-                  "മലയാള ലിപിയിൽ മാത്രം എഴുതുക. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്.")
 }
 SCRIPT_PATTERNS = {
     "Tamil": re.compile(r"[஀-௿]"),
     "Malayalam": re.compile(r"[ഀ-ൿ]"),
-    "English": re.compile(r"[A-Za-z]")
 }
 SENTENCE_BANK = {
@@ -67,7 +66,7 @@ SENTENCE_BANK = {
         "Music brings people together across cultures.",
         "Education is the key to a bright future.",
         "The flowers bloom beautifully in spring.",
-        "Hard work always pays off in the end."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
@@ -77,7 +76,7 @@ SENTENCE_BANK = {
         "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்.",
         "கல்வி நமது எதிர்காலத்தின் திறவுகோல்.",
         "பறவைகள் காலையில் இனிமையாக பாடுகின்றன.",
-        "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
@@ -87,85 +86,14 @@ SENTENCE_BANK = {
         "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്.",
         "സംഗീതം മനസ്സിന് സന്തോഷം നൽകുന്നു.",
         "കുടുംബസമയം വളരെ വിലപ്പെട്ടതാണ്.",
-        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
-    ]
 }
-# ---------------- MODEL CACHE ---------------- #
 indicwhisper_pipeline = None
 fallback_models = {}
-WHISPER_JAX_AVAILABLE = False  # default false; will set true if we load it
-@GPU_DECORATOR
-def load_indicwhisper():
-    """Load primary high-performance model (prefer transformers pipeline, optionally JAX if available)."""
-    global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE
-    if indicwhisper_pipeline is not None:
-        return indicwhisper_pipeline
-    # Try JAX first (optional)
-    try:
-        from whisper_jax import FlaxWhisperPipeline
-        import jax.numpy as jnp
-        print(f"🔄 Loading JAX-optimized model: {INDICWHISPER_MODEL}")
-        indicwhisper_pipeline = FlaxWhisperPipeline(
-            INDICWHISPER_MODEL,
-            dtype=jnp.bfloat16,
-            batch_size=1
-        )
-        WHISPER_JAX_AVAILABLE = True
-        print("✅ JAX-optimized model loaded successfully!")
-        return indicwhisper_pipeline
-    except Exception as e:
-        print(f"⚠️ JAX loading failed: {e}")
-        WHISPER_JAX_AVAILABLE = False
-    # Fallback to transformers pipeline
-    try:
-        from transformers import pipeline
-        print(f"🔄 Loading transformers ASR pipeline: {INDICWHISPER_MODEL}")
-        indicwhisper_pipeline = pipeline(
-            task="automatic-speech-recognition",
-            model=INDICWHISPER_MODEL,
-            device=CUDA_DEVICE_INDEX  # 0 for CUDA, -1 for CPU
-        )
-        print("✅ Transformers ASR pipeline loaded!")
-        return indicwhisper_pipeline
-    except Exception as e:
-        print(f"❌ Failed to load primary model: {e}")
-        indicwhisper_pipeline = None
-        raise Exception(f"Could not load primary model: {str(e)}")
-@GPU_DECORATOR
-def load_specialized_model(language: str):
-    """Load language-specific specialized model with processor."""
-    if language in fallback_models:
-        return fallback_models[language]
-    model_name = SPECIALIZED_MODELS[language]
-    print(f"🔄 Loading specialized model for {language}: {model_name}")
-    try:
-        # WhisperProcessor ensures get_decoder_prompt_ids is available
-        try:
-            processor = WhisperProcessor.from_pretrained(model_name)
-        except Exception:
-            processor = AutoProcessor.from_pretrained(model_name)
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-            low_cpu_mem_usage=True
-        )
-        model.to(DEVICE)
-        fallback_models[language] = {"processor": processor, "model": model, "model_name": model_name}
-        print(f"✅ Specialized model loaded for {language}")
-        return fallback_models[language]
-    except Exception as e:
-        print(f"❌ Failed to load specialized {model_name}: {e}")
-        raise Exception(f"Could not load specialized {language} model: {str(e)}")
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
@@ -178,168 +106,61 @@ def is_script(text, lang_name):
     return bool(pattern.search(text or ""))
 def transliterate_to_hk(text, lang_choice):
     mapping = {
         "Tamil": sanscript.TAMIL,
         "Malayalam": sanscript.MALAYALAM,
         "English": None
     }
     script = mapping.get(lang_choice)
-    if not text:
-        return ""
     if script and is_script(text, lang_choice):
         try:
             return transliterate(text, script, sanscript.HK)
-        except Exception as e:
-            print(f"Transliteration error: {e}")
             return text
     return text
 def preprocess_audio(audio_path, target_sr=16000):
-    """Load, normalize, trim, return float32 audio."""
     try:
         audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
         if audio is None or len(audio) == 0:
             return None, None
-        # Normalize
-        m = np.max(np.abs(audio))
-        if m > 0:
-            audio = audio / m
-        # Trim silence
         audio, _ = librosa.effects.trim(audio, top_db=20)
-        # Ensure min length
-        if len(audio) < int(target_sr * 0.1):
             return None, None
-        # Ensure float32
-        if audio.dtype != np.float32:
-            audio = audio.astype(np.float32)
         return audio, target_sr
     except Exception as e:
         print(f"Audio preprocessing error: {e}")
         return None, None
-@GPU_DECORATOR
-def transcribe_with_primary_model(audio_path, language):
-    """Transcribe using primary model (JAX if available else transformers pipeline)."""
-    try:
-        pipe = load_indicwhisper()
-        lang_code = LANG_CODES.get(language, "en")
-        if WHISPER_JAX_AVAILABLE:
-            # whisper-jax expects array or path; pass path is okay
-            result = pipe(audio_path, task="transcribe", language=lang_code)
-            # whisper-jax returns dict with 'text'
-            if isinstance(result, dict) and "text" in result:
-                return (result["text"] or "").strip()
-            return str(result).strip()
-        # transformers pipeline
-        # Some transformers versions accept language/task via generate_kwargs
-        generate_kwargs = {}
-        try:
-            # If underlying model is Whisper, we can set forced decoder ids
-            model = pipe.model if hasattr(pipe, "model") else None
-            tokenizer = getattr(pipe, "tokenizer", None)
-            processor = getattr(pipe, "feature_extractor", None)
-            if hasattr(pipe, "tokenizer") and hasattr(model, "config"):
-                try:
-                    forced_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang_code, task="transcribe")
-                    model.config.forced_decoder_ids = forced_ids
-                except Exception as e:
-                    print(f"⚠️ Primary model language forcing failed: {e}")
-        except Exception as e:
-            print(f"⚠️ Primary model prompt config error: {e}")
-        out = pipe(audio_path, generate_kwargs=generate_kwargs)
-        if isinstance(out, dict) and "text" in out:
-            return (out["text"] or "").strip()
-        elif isinstance(out, str):
-            return out.strip()
-        else:
-            return str(out).strip()
-    except Exception as e:
-        print(f"Primary model transcription error: {e}")
-        return f"Error: {str(e)[:200]}"
-@GPU_DECORATOR
-def transcribe_with_specialized_model(audio_path, language):
-    """Transcribe using language-specific models."""
     try:
-        components = load_specialized_model(language)
-        processor = components["processor"]
-        model = components["model"]
-        audio, sr = preprocess_audio(audio_path)
-        if audio is None:
-            return "Error: Audio too short or could not be processed"
-        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
-        # WhisperProcessor returns input_features
-        input_features = inputs.get("input_features", None)
-        if input_features is None:
-            # Fallback: some processors use feature_extractor path
-            input_features = inputs.get("input_values", None)
-        if input_features is None:
-            return "Error: Could not prepare input features"
-        input_features = input_features.to(DEVICE)
-        generate_kwargs = {
-            "max_length": 200,
-            "num_beams": 3,
-            "do_sample": False
-        }
-        if language != "English":
-            lang_code = LANG_CODES.get(language, "en")
-            try:
-                if hasattr(processor, "get_decoder_prompt_ids"):
-                    forced_decoder_ids = processor.get_decoder_prompt_ids(
-                        language=lang_code,
-                        task="transcribe"
-                    )
-                    generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
-                elif hasattr(model, "config") and hasattr(processor, "tokenizer"):
-                    forced_decoder_ids = processor.tokenizer.get_decoder_prompt_ids(
-                        language=lang_code,
-                        task="transcribe"
-                    )
-                    model.config.forced_decoder_ids = forced_decoder_ids
-            except Exception as e:
-                print(f"⚠️ Language forcing failed: {e}")
-        with torch.no_grad():
-            predicted_ids = model.generate(input_features=input_features, **generate_kwargs)
-        transcription = processor.batch_decode(
-            predicted_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
-        )[0]
-        return (transcription or "").strip() or "(No transcription generated)"
-    except Exception as e:
-        print(f"Specialized model transcription error: {e}")
-        return f"Error: {str(e)[:200]}"
-@GPU_DECORATOR
-def transcribe_audio(audio_path, language, initial_prompt="", use_specialized=False):
-    """Dispatch to primary or specialized path with fallback."""
     try:
-        if use_specialized:
-            print(f"🔄 Using specialized model for {language}")
-            return transcribe_with_specialized_model(audio_path, language)
-        else:
-            print(f"🔄 Using primary model for {language}")
-            return transcribe_with_primary_model(audio_path, language)
-    except Exception as e:
-        print(f"Transcription failed, trying specialized model: {e}")
-        if not use_specialized:
-            return transcribe_audio(audio_path, language, initial_prompt, use_specialized=True)
-        else:
-            return f"Error: All transcription methods failed - {str(e)[:200]}"
 def highlight_differences(ref, hyp):
-    if not (ref or "").strip() or not (hyp or "").strip():
         return "No text to compare"
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
@@ -347,262 +168,205 @@ def highlight_differences(ref, hyp):
     out_html = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'replace':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
-            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>→{w}</span>" for w in hyp_words[j1:j2]])
         elif tag == 'delete':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
-            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
-    if not (ref or "").strip() or not (hyp or "").strip():
         return "No text to compare"
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out.extend([f"<span style='color:green; background-color:#e8f5e8;'>{c}</span>" for c in ref[i1:i2]])
         elif tag in ('replace', 'delete'):
-            out.extend([f"<span style='color:red; text-decoration:underline; background-color:#ffe8e8; font-weight:bold;'>{c}</span>" for c in ref[i1:i2]])
         elif tag == 'insert':
-            out.extend([f"<span style='color:orange; background-color:#fff3cd; font-weight:bold;'>{c}</span>" for c in hyp[j1:j2]])
     return "".join(out)
 def get_pronunciation_score(wer_val, cer_val):
-    combined_score = (wer_val * 0.7) + (cer_val * 0.3)
-    if combined_score <= 0.1:
         return "🏆 Excellent! (90%+)", "Your pronunciation is outstanding!"
-    elif combined_score <= 0.2:
         return "🎉 Very Good! (80-90%)", "Great pronunciation with minor areas for improvement."
-    elif combined_score <= 0.4:
-        return "👍 Good! (60-80%)", "Good effort! Keep practicing for better accuracy."
-    elif combined_score <= 0.6:
-        return "📚 Needs Practice (40-60%)", "Focus on clearer pronunciation of highlighted words."
     else:
-        return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
-# ---------------- MAIN FUNCTION ---------------- #
 @GPU_DECORATOR
-def compare_pronunciation(audio, language_choice, intended_sentence):
-    print(f"🔍 Starting analysis with language: {language_choice}")
-    print(f"📝 Audio file: {audio}")
-    print(f"🎯 Intended sentence: {intended_sentence}")
-    if audio is None:
-        return ("❌ Please record audio first.", "", "", "", "", "", "", "")
-    if not (intended_sentence or "").strip():
-        return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "")
     try:
-        print("🔄 Pass 1: Primary model transcription...")
-        primary_text = transcribe_audio(audio, language_choice, use_specialized=False)
-        print(f"✅ Primary: {primary_text}")
-        print("🔄 Pass 2: Specialized model transcription...")
-        specialized_text = transcribe_audio(audio, language_choice, use_specialized=True)
-        print(f"✅ Specialized: {specialized_text}")
-        actual_text = primary_text if not str(primary_text).startswith("Error:") else specialized_text
-        if str(actual_text).startswith("Error:"):
-            return (f"❌ {actual_text}", "", "", "", "", "", "", "")
-        try:
-            wer_val = jiwer.wer(intended_sentence, actual_text)
-            cer_val = jiwer.cer(intended_sentence, actual_text)
-        except Exception as e:
-            print(f"❌ Metrics error: {e}")
-            wer_val, cer_val = 1.0, 1.0
-        score_text, feedback = get_pronunciation_score(wer_val, cer_val)
-        actual_hk = transliterate_to_hk(actual_text, language_choice)
-        target_hk = transliterate_to_hk(intended_sentence, language_choice)
-        if language_choice != "English" and not is_script(actual_text, language_choice):
-            actual_hk = f"⚠️ Expected {language_choice} script, got mixed/other script"
-        diff_html = highlight_differences(intended_sentence, actual_text)
-        char_html = char_level_highlight(intended_sentence, actual_text)
-        status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}\n🚀 Powered by High-Performance ASR Models"
-        return (
-            status,
-            primary_text or "(No primary transcription)",
-            specialized_text or "(No specialized transcription)",
-            f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% word accuracy)",
-            f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% character accuracy)",
-            diff_html,
-            char_html,
-            f"🎯 Target: {intended_sentence}"
         )
     except Exception as e:
-        error_msg = f"❌ Analysis Error: {str(e)[:200]}"
-        return (error_msg, str(e), "", "", "", "", "", "")
-# ---------------- UI ---------------- #
-def create_interface():
-    with gr.Blocks(title="🎙️ SOTA Multilingual Pronunciation Trainer") as demo:
-        gr.Markdown("""
-        # 🎙️ Advanced Multilingual Pronunciation Trainer
-        Practice pronunciation in Tamil, Malayalam & English using high-performance ASR models!
-        ### 🏆 Powered by Advanced Models:
-        - Dual-Model Analysis: Primary + specialized model comparison
-        - High Accuracy: Language-specific fine-tuned models
-        - Robust Performance: Automatic fallback for reliability
-        ### 📋 How to Use:
-        1. Select your target language 🌍
-        2. Generate a practice sentence 🎲
-        3. Record yourself reading it aloud 🎤
-        4. Get detailed feedback with advanced accuracy 📊
-        ### 🎯 Features:
-        - Dual-pass analysis for comprehensive assessment
-        - Visual highlighting of pronunciation errors
-        - Romanization for Indic scripts
-        - Advanced metrics (Word & Character accuracy)
-        """)
-        with gr.Row():
-            with gr.Column(scale=3):
-                lang_choice = gr.Dropdown(
-                    choices=list(LANG_CODES.keys()),
-                    value="Tamil",
-                    label="🌍 Select Language"
-                )
-            with gr.Column(scale=1):
-                gen_btn = gr.Button("🎲 Generate Sentence", variant="primary")
-        intended_display = gr.Textbox(
-            label="📝 Practice Sentence (Read this aloud)",
-            placeholder="Click 'Generate Sentence' to get started...",
-            interactive=False,
-            lines=3
-        )
-        audio_input = gr.Audio(
-            sources=["microphone", "upload"],
-            type="filepath",
-            label="🎤 Record Your Pronunciation"
         )
-        analyze_btn = gr.Button("🔍 Analyze with Advanced Models", variant="primary")
-        status_output = gr.Textbox(
-            label="📊 Advanced Analysis Results",
-            interactive=False,
-            lines=4
-        )
-        with gr.Row():
-            with gr.Column():
-                pass1_out = gr.Textbox(
-                    label="🏆 Primary Model Output",
-                    interactive=False,
-                    lines=2
-                )
-                wer_out = gr.Textbox(
-                    label="📈 Word Accuracy",
-                    interactive=False
-                )
-            with gr.Column():
-                pass2_out = gr.Textbox(
-                    label="🔧 Specialized Model Comparison",
-                    interactive=False,
-                    lines=2
                 )
-        cer_out = gr.Textbox(
-            label="📊 Character Accuracy",
-            interactive=False
-        )
-        with gr.Accordion("📝 Detailed Visual Feedback", open=True):
-            gr.Markdown("""
-            ### 🎨 Color Guide:
-            - 🟢 Green: Correctly pronounced words/characters
-            - 🔴 Red: Missing or mispronounced (strikethrough)
-            - 🟠 Orange: Extra words or substitutions
-            """)
-            diff_html_box = gr.HTML(label="🔍 Word-Level Analysis", show_label=True)
-            char_html_box = gr.HTML(label="🔤 Character-Level Analysis", show_label=True)
-        target_display = gr.Textbox(
-            label="🎯 Reference Text",
-            interactive=False,
-            visible=False
-        )
-        gen_btn.click(
-            fn=get_random_sentence,
-            inputs=[lang_choice],
-            outputs=[intended_display]
-        )
-        analyze_btn.click(
-            fn=compare_pronunciation,
-            inputs=[audio_input, lang_choice, intended_display],
-            outputs=[
-                status_output,      # status
-                pass1_out,          # primary transcription
-                pass2_out,          # specialized transcription
-                wer_out,            # wer formatted
-                cer_out,            # cer formatted
-                diff_html_box,      # diff_html
-                char_html_box,      # char_html
-                target_display      # target_display
-            ]
-        )
-        lang_choice.change(
-            fn=get_random_sentence,
-            inputs=[lang_choice],
-            outputs=[intended_display]
-        )
-        gr.Markdown("""
-        ---
-        ### 🏆 Advanced Technology Stack:
-        - Primary ASR: OpenAI Whisper Large v2 (High-performance multilingual model)
-        - Specialized Models: Fine-tuned language-specific models
-          - Tamil: vasista22/whisper-tamil-large-v2 (IIT Madras Speech Lab)
-          - Malayalam: thennal/whisper-medium-ml (Common Voice trained)
-          - English: openai/whisper-base.en (English-optimized)
-        - Dual Analysis: Primary + specialized model comparison
-        - Automatic Fallback: Ensures reliable results always
-        ### 🔧 Technical Details:
-        - Metrics: WER (Word Error Rate) and CER (Character Error Rate)
-        - Transliteration: Harvard-Kyoto system for Indic scripts
-        - Analysis: Dual-model comparison for comprehensive feedback
-        - Languages: English, Tamil, and Malayalam
-        """)
     return demo
 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
-    print("🚀 Starting Advanced Multilingual Pronunciation Trainer...")
-    print(f"🔧 Device: {DEVICE}")
-    try:
-        torch_ver = getattr(torch, '__version__', 'unknown')
-    except Exception:
-        torch_ver = 'unknown'
-    print(f"🔧 PyTorch version: {torch_ver}")
-    print("🏆 Using High-Performance Dual-Model Approach")
-    print("⚡ Automatic model selection with specialized fallbacks")
-    print("📊 Advanced analysis with robust error handling")
     demo = create_interface()
-    demo.launch(
-        share=True,
-        show_error=True,
-        server_name="0.0.0.0",
-        server_port=7860
-    )

 import random
 import difflib
 import re
+import warnings
 import torch
 import numpy as np
 import librosa
 import soundfile as sf
+import jiwer
# Optional dependency: Indic transliteration. When the import fails the names
# are still bound (to None) so later references do not raise NameError, and
# INDIC_OK tells callers to skip romanization.
try:
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate
    INDIC_OK = True
except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    INDIC_OK = False
    sanscript = None
    transliterate = None

# Optional dependency: the `spaces` package, imported only for its GPU decorator.
try:
    import spaces
    GPU_DECORATOR = spaces.GPU
except Exception:  # same reasoning as above
    def GPU_DECORATOR(fn=None, **_options):
        """No-op stand-in used when `spaces` cannot be imported.

        Works both when applied bare (``@GPU_DECORATOR``) and when called with
        keyword options first (``@GPU_DECORATOR(name=value)``); in either case
        the decorated function is returned unchanged.
        """
        if fn is None:
            return lambda wrapped: wrapped
        return fn
 warnings.filterwarnings("ignore")
 # ---------------- CONFIG ---------------- #
 # Use the GPU when torch reports CUDA as available, otherwise the CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Integer device index passed to the transformers pipeline: 0 on CUDA, -1 on CPU.
+DEVICE_INDEX = 0 if DEVICE == "cuda" else -1
 # Weights dtype used when loading the specialized models: fp16 on CUDA, fp32 on CPU.
+DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 print(f"🔧 Using device: {DEVICE}")
 # Language display name -> language code handed to the ASR models.
 LANG_CODES = {
     "English": "en",
     "Tamil": "ta",
+    "Malayalam": "ml",
 }
+# AI4Bharat IndicWhisper community port
 # Checkpoint id loaded by load_indicwhisper() as the primary model.
+INDICWHISPER_MODEL = "parthiv11/indic_whisper_nodcil"
 # Language display name -> checkpoint id loaded by load_specialized_model().
 SPECIALIZED_MODELS = {
     "English": "openai/whisper-base.en",
     "Tamil": "vasista22/whisper-tamil-large-v2",
+    "Malayalam": "thennal/whisper-medium-ml",
 }
 # Language display name -> compiled character-class pattern; a search hit means
 # the text contains at least one character from that class.
 SCRIPT_PATTERNS = {
     "Tamil": re.compile(r"[஀-௿]"),
     "Malayalam": re.compile(r"[ഀ-ൿ]"),
+    "English": re.compile(r"[A-Za-z]"),
 }
 SENTENCE_BANK = {
         "Music brings people together across cultures.",
         "Education is the key to a bright future.",
         "The flowers bloom beautifully in spring.",
+        "Hard work always pays off in the end.",
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்.",
         "கல்வி நமது எதிர்காலத்தின் திறவுகோல்.",
         "பறவைகள் காலையில் இனிமையாக பாடுகின்றன.",
+        "உழைப்பு எப்போதும் வெற்றியைத் தரும்.",
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്.",
         "സംഗീതം മനസ്സിന് സന്തോഷം നൽകുന്നു.",
         "കുടുംബസമയം വളരെ വിലപ്പെട്ടതാണ്.",
+        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും.",
+    ],
 }
+# Model cache
 # Primary ASR pipeline; stays None until load_indicwhisper() has loaded one.
 indicwhisper_pipeline = None
 # Language display name -> {"processor", "model"}, filled by load_specialized_model().
 fallback_models = {}
 # Set by load_indicwhisper(): True only when the whisper_jax pipeline was loaded.
+WHISPER_JAX_AVAILABLE = False
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return bool(pattern.search(text or ""))
 def transliterate_to_hk(text, lang_choice):
     """Romanize *text* to the Harvard-Kyoto scheme when possible.

     Args:
         text: Text to romanize; may be empty or None.
         lang_choice: Language display name ("Tamil", "Malayalam", "English").

     Returns:
         "" for empty/None input; the transliterated string when the optional
         transliteration package is available, *lang_choice* has a script
         mapping and *text* contains that script; otherwise *text* unchanged.
     """
     # Guard first so empty/None input never reaches the optional dependency.
     if not text:
         return ""
     if not INDIC_OK:
         return text
     scheme_by_language = {
         "Tamil": sanscript.TAMIL,
         "Malayalam": sanscript.MALAYALAM,
         "English": None,
     }
     script = scheme_by_language.get(lang_choice)
     if not script or not is_script(text, lang_choice):
         return text
     try:
         return transliterate(text, script, sanscript.HK)
     except Exception as e:
         # Best effort: report the failure and fall back to the untouched text.
         print(f"Transliteration error: {e}")
         return text
 def preprocess_audio(audio_path, target_sr=16000):
     """Load an audio file as mono float32, peak-normalize it and trim silence.

     Args:
         audio_path: Path of the audio file to load.
         target_sr: Sample rate requested from the loader.

     Returns:
         ``(samples, target_sr)`` on success, or ``(None, None)`` when the file
         is empty, shorter than 0.1 s after trimming, or any step raises.
     """
     rejected = (None, None)
     try:
         samples, _loaded_sr = librosa.load(audio_path, sr=target_sr, mono=True)
         if samples is None or len(samples) == 0:
             return rejected

         samples = samples.astype(np.float32)
         peak = np.max(np.abs(samples))
         if peak > 0:
             # Scale so the loudest sample has magnitude 1.
             samples /= peak

         trimmed, _interval = librosa.effects.trim(samples, top_db=20)
         minimum_samples = target_sr * 0.1
         if len(trimmed) < minimum_samples:
             return rejected
         return trimmed, target_sr
     except Exception as err:
         print(f"Audio preprocessing error: {err}")
         return rejected
# Normalization applied to both reference and hypothesis before computing WER:
# lower-case, drop punctuation, collapse whitespace, strip, then split into words.
JIWER_TRANSFORM = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.ReduceToListOfListOfWords(),
])


def compute_wer(ref, hyp):
    """Return the word error rate of *hyp* against *ref* after normalization.

    Args:
        ref: Reference (intended) sentence.
        hyp: Hypothesis (transcribed) sentence.

    Returns:
        The WER as computed by jiwer, or 1.0 when the computation fails.
    """
    try:
        try:
            # Newer jiwer releases name the reference-side keyword
            # `reference_transform`.
            return jiwer.wer(
                ref,
                hyp,
                reference_transform=JIWER_TRANSFORM,
                hypothesis_transform=JIWER_TRANSFORM,
            )
        except TypeError:
            # Older releases only accept the previous keyword name. Without
            # this fallback an unknown keyword would be swallowed below and
            # every score would silently become 1.0.
            return jiwer.wer(
                ref,
                hyp,
                truth_transform=JIWER_TRANSFORM,
                hypothesis_transform=JIWER_TRANSFORM,
            )
    except Exception as e:
        # Best effort: report the failure and score it as fully wrong.
        print(f"WER computation error: {e}")
        return 1.0
def compute_cer(ref, hyp):
    """Return the character error rate of *hyp* against *ref*.

    Args:
        ref: Reference (intended) sentence.
        hyp: Hypothesis (transcribed) sentence.

    Returns:
        The CER as computed by jiwer, or 1.0 when the computation fails.
    """
    try:
        return jiwer.cer(ref, hyp)
    except Exception as e:
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate;
        # the failure is reported instead of being hidden behind the 1.0.
        print(f"CER computation error: {e}")
        return 1.0
 def highlight_differences(ref, hyp):
+    if not ref.strip() or not hyp.strip():
         return "No text to compare"
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
     out_html = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
+            out_html.extend([f"<span style='color:green; background-color:#e8f5e8;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'replace':
+            out_html.extend([f"<span style='color:red; text-decoration:line-through;'>{w}</span>" for w in ref_words[i1:i2]])
+            out_html.extend([f"<span style='color:orange;'>→{w}</span>" for w in hyp_words[j1:j2]])
         elif tag == 'delete':
+            out_html.extend([f"<span style='color:red; text-decoration:line-through;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
+            out_html.extend([f"<span style='color:orange;'>+{w}</span>" for w in hyp_words[j1:j2]])
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
     """Build a character-level HTML diff of *hyp* against *ref*.

     Characters common to both are green; reference characters that were
     replaced or deleted are red; characters only present in *hyp* are orange.
     For a replacement only the reference characters are emitted.

     Args:
         ref: Reference text; None is treated as empty.
         hyp: Hypothesis text; None is treated as empty.

     Returns:
         Concatenated ``<span>`` elements, or "No text to compare" when either
         side is empty or whitespace-only.
     """
     import html  # function-scope import: html is not among the file-level imports

     ref = ref or ""
     hyp = hyp or ""
     if not ref.strip() or not hyp.strip():
         return "No text to compare"

     colour_by_tag = {
         "equal": "green",
         "replace": "red",
         "delete": "red",
         "insert": "orange",
     }
     matcher = difflib.SequenceMatcher(None, list(ref), list(hyp))
     spans = []
     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
         chars = hyp[j1:j2] if tag == "insert" else ref[i1:i2]
         colour = colour_by_tag[tag]
         # Escape each character so text such as "<" or "&" cannot break or
         # inject markup into the returned HTML.
         spans.extend(
             f"<span style='color:{colour};'>{html.escape(c)}</span>" for c in chars
         )
     return "".join(spans)
 def get_pronunciation_score(wer_val, cer_val):
     """Map word and character error rates to a (label, feedback) pair.

     The two rates are blended 70 % WER / 30 % CER and the blend is compared
     against ascending ceilings; the first ceiling it does not exceed wins.

     Args:
         wer_val: Word error rate.
         cer_val: Character error rate.

     Returns:
         Tuple of (score label, feedback message).
     """
     combined = (wer_val * 0.7) + (cer_val * 0.3)
     bands = (
         (0.1, "🏆 Excellent! (90%+)", "Your pronunciation is outstanding!"),
         (0.2, "🎉 Very Good! (80-90%)", "Great pronunciation with minor areas for improvement."),
         (0.4, "👍 Good! (60-80%)", "Good effort! Keep practicing."),
         (0.6, "📚 Needs Practice (40-60%)", "Focus on clearer pronunciation."),
     )
     for ceiling, label, advice in bands:
         if combined <= ceiling:
             return label, advice
     return "💪 Keep Trying! (<40%)", "Don't give up!"
+# ---------------- LOADERS ---------------- #
 @GPU_DECORATOR
def load_indicwhisper():
    """Return the primary ASR pipeline, loading and caching it on first use.

    The whisper_jax pipeline is tried first; if anything in that attempt
    raises, a transformers ASR pipeline is loaded instead. The result is kept
    in the module-level ``indicwhisper_pipeline`` and ``WHISPER_JAX_AVAILABLE``
    records which backend was loaded.

    Raises:
        Exception: Whatever the transformers fallback raised, re-raised as is.
    """
    global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE

    cached = indicwhisper_pipeline
    if cached is not None:
        return cached

    # First choice: the JAX backend.
    jax_ready = False
    try:
        from whisper_jax import FlaxWhisperPipeline
        import jax.numpy as jnp

        print(f"🔄 Loading JAX IndicWhisper: {INDICWHISPER_MODEL}")
        indicwhisper_pipeline = FlaxWhisperPipeline(
            INDICWHISPER_MODEL,
            dtype=jnp.bfloat16,
            batch_size=1,
        )
        WHISPER_JAX_AVAILABLE = True
        print("✅ JAX Loaded!")
        jax_ready = True
    except Exception as jax_error:
        print(f"⚠️ JAX unavailable: {jax_error}")
        WHISPER_JAX_AVAILABLE = False

    if jax_ready:
        return indicwhisper_pipeline

    # Second choice: the transformers pipeline.
    try:
        from transformers import pipeline

        indicwhisper_pipeline = pipeline(
            "automatic-speech-recognition",
            model=INDICWHISPER_MODEL,
            device=DEVICE_INDEX,
        )
        print("✅ Transformers IndicWhisper loaded!")
        return indicwhisper_pipeline
    except Exception as hf_error:
        print(f"❌ Failed to load IndicWhisper: {hf_error}")
        raise
@GPU_DECORATOR
def load_specialized_model(language):
    """Return the cached processor/model pair for *language*, loading it once.

    Args:
        language: Key into ``SPECIALIZED_MODELS`` (a language display name).

    Returns:
        Dict with keys "processor" and "model"; the model has been moved to
        ``DEVICE`` and was loaded with ``DTYPE`` weights.

    Raises:
        KeyError: If *language* has no entry in ``SPECIALIZED_MODELS``.
    """
    if language not in fallback_models:
        from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

        checkpoint = SPECIALIZED_MODELS[language]
        processor = AutoProcessor.from_pretrained(checkpoint)
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            checkpoint,
            torch_dtype=DTYPE,
            low_cpu_mem_usage=True,
        )
        model = model.to(DEVICE)
        fallback_models[language] = {"processor": processor, "model": model}
    return fallback_models[language]
+# ---------------- TRANSCRIBE ---------------- #
@GPU_DECORATOR
def transcribe_with_primary_model(audio_path, language):
    """Transcribe an audio file with the primary pipeline.

    Args:
        audio_path: Passed unchanged to the pipeline returned by
            ``load_indicwhisper``.
        language: Display name looked up in ``LANG_CODES``; names that are
            not present fall back to ``"en"``.

    Returns:
        The stripped transcript, or a string of the form
        ``"Error: <message>"`` if loading or transcription raised. Callers
        detect failure by that prefix, so no exception escapes.
    """
    try:
        asr = load_indicwhisper()
        lang_code = LANG_CODES.get(language, "en")

        if WHISPER_JAX_AVAILABLE:
            # The JAX pipeline takes task/language per call.
            result = asr(audio_path, task="transcribe", language=lang_code)
        else:
            # Transformers pipeline: pin the language through the decoder
            # prompt. This stays best-effort, but the failure is reported
            # instead of being silently discarded, and only ordinary
            # exceptions are caught (not KeyboardInterrupt/SystemExit).
            if hasattr(asr, "model") and hasattr(asr, "tokenizer"):
                try:
                    forced_ids = asr.tokenizer.get_decoder_prompt_ids(
                        language=lang_code, task="transcribe"
                    )
                    asr.model.config.forced_decoder_ids = forced_ids
                except Exception as prompt_error:
                    print(
                        f"⚠️ Could not force decoder language "
                        f"'{lang_code}': {prompt_error}"
                    )
            result = asr(audio_path)

        # Both backends are unwrapped the same way.
        if isinstance(result, dict) and "text" in result:
            return result["text"].strip()
        return str(result).strip()
    except Exception as e:
        return f"Error: {e}"
@GPU_DECORATOR
def transcribe_with_specialized_model(audio_path, language):
    """Transcribe an audio file with the per-language specialized model.

    Args:
        audio_path: Passed to ``preprocess_audio``.
        language: Language name used to pick the model via
            ``load_specialized_model`` and, for non-English, the decoder
            language via ``LANG_CODES``.

    Returns:
        The stripped transcript; ``"Error: Audio too short"`` when
        ``preprocess_audio`` yields no audio; or ``"Error: <message>"`` if
        anything raised. Callers detect failure by the ``"Error:"`` prefix.
    """
    try:
        components = load_specialized_model(language)
        processor = components["processor"]
        model = components["model"]

        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Audio too short"

        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
        input_features = inputs.input_features.to(DEVICE)
        generate_kwargs = {
            "inputs": input_features,
            "max_length": 200,
            "num_beams": 3,
        }

        if language != "English":
            # Best-effort language forcing. The failure is reported rather
            # than silently discarded, and only ordinary exceptions are
            # caught (not KeyboardInterrupt/SystemExit).
            try:
                forced_ids = processor.tokenizer.get_decoder_prompt_ids(
                    language=LANG_CODES[language], task="transcribe"
                )
                generate_kwargs["forced_decoder_ids"] = forced_ids
            except Exception as prompt_error:
                print(
                    f"⚠️ Could not force decoder language for "
                    f"'{language}': {prompt_error}"
                )

        with torch.no_grad():
            ids = model.generate(**generate_kwargs)
        text = processor.batch_decode(ids, skip_special_tokens=True)[0]
        return text.strip()
    except Exception as e:
        return f"Error: {e}"
@GPU_DECORATOR
def transcribe_audio(audio_path, language, use_specialized=False):
    """Transcribe with the primary or the specialized model.

    Args:
        audio_path: Audio file path forwarded to the chosen transcriber.
        language: Language name forwarded to the chosen transcriber.
        use_specialized: When true, use the specialized model directly.
            When false, use the primary model and fall back to the
            specialized one only if the primary call raises.

    Returns:
        Whatever the transcriber returned (a transcript or an
        ``"Error: ..."`` string), or ``"Error: <message>"`` if the
        specialized call itself raised. The failure value always carries
        the ``"Error:"`` prefix, because that prefix is how callers tell a
        failure from a transcript.
    """
    if not use_specialized:
        try:
            return transcribe_with_primary_model(audio_path, language)
        except Exception as primary_error:
            # Fall through to the specialized model below.
            print(f"⚠️ Primary transcription failed: {primary_error}")

    try:
        return transcribe_with_specialized_model(audio_path, language)
    except Exception as specialized_error:
        return f"Error: {specialized_error}"
+# ---------------- MAIN FUNCTION ---------------- #
@GPU_DECORATOR
def compare_pronunciation(audio, language_choice, intended_sentence):
    """Transcribe a recording and score it against the target sentence.

    Args:
        audio: Audio file path, or ``None`` when nothing was recorded.
        language_choice: Language name forwarded to ``transcribe_audio``.
        intended_sentence: The sentence the speaker was asked to read.
            ``None`` is treated like an empty string.

    Returns:
        An 8-tuple of strings: status message, primary transcript,
        specialized transcript, WER summary, CER summary, word-level diff
        HTML, character-level diff HTML, and the target line. On any
        failure the first element holds the message and the other seven
        are empty strings.
    """
    blanks = ("",) * 7

    def _is_error(text):
        # Accept the bare "Error" sentinel as well as the "Error:" prefix,
        # so a failure is never scored as if it were a transcript.
        return text == "Error" or text.startswith("Error:")

    if audio is None:
        return ("❌ Please record audio first.",) + blanks
    if not (intended_sentence or "").strip():
        return ("❌ Please generate a practice sentence first.",) + blanks

    primary_text = transcribe_audio(audio, language_choice, use_specialized=False)
    specialized_text = transcribe_audio(audio, language_choice, use_specialized=True)

    # Score the primary transcript unless it failed.
    actual_text = specialized_text if _is_error(primary_text) else primary_text
    if _is_error(actual_text):
        return (f"❌ {actual_text}",) + blanks

    wer_val = compute_wer(intended_sentence, actual_text)
    cer_val = compute_cer(intended_sentence, actual_text)
    score_text, feedback = get_pronunciation_score(wer_val, cer_val)
    diff_html = highlight_differences(intended_sentence, actual_text)
    char_html = char_level_highlight(intended_sentence, actual_text)

    # Floor the displayed accuracy at 0%: if an error rate ever exceeds 1.0
    # the raw formula would print a negative percentage.
    word_accuracy = max(0.0, 1 - wer_val) * 100
    char_accuracy = max(0.0, 1 - cer_val) * 100

    return (
        f"✅ Analysis Complete - {score_text}\n💬 {feedback}",
        primary_text,
        specialized_text,
        f"{wer_val:.3f} ({word_accuracy:.1f}% word accuracy)",
        f"{cer_val:.3f} ({char_accuracy:.1f}% char accuracy)",
        diff_html,
        char_html,
        f"🎯 Target: {intended_sentence}",
    )
+# ---------------- UI ---------------- #
def create_interface():
    """Assemble the Gradio Blocks app for the pronunciation trainer.

    The layout, top to bottom: a language dropdown with a sentence
    generator button, the practice sentence, the audio input, the analyze
    button, and the result widgets. The generator button and a change of
    language both call ``get_random_sentence``; the analyze button calls
    ``compare_pronunciation``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(title="🎙️ IndicWhisper Pronunciation Trainer") as demo:
        gr.Markdown("# 🎙️ IndicWhisper-based Pronunciation Trainer")

        # --- inputs ---
        with gr.Row():
            language_dd = gr.Dropdown(
                choices=list(LANG_CODES),
                value="Tamil",
                label="🌍 Language",
            )
            generate_btn = gr.Button("🎲 Generate Sentence")
        sentence_box = gr.Textbox(
            label="📝 Practice Sentence", interactive=False, lines=3
        )
        recorder = gr.Audio(
            sources=["microphone", "upload"], type="filepath", label="🎤 Record"
        )
        run_btn = gr.Button("🔍 Analyze")

        # --- outputs ---
        status_box = gr.Textbox(label="📊 Results", interactive=False, lines=4)
        with gr.Row():
            primary_box = gr.Textbox(
                label="🏆 Primary (IndicWhisper)", interactive=False
            )
            specialized_box = gr.Textbox(label="🔧 Specialized", interactive=False)
        wer_box = gr.Textbox(label="📈 Word Accuracy", interactive=False)
        cer_box = gr.Textbox(label="📊 Char Accuracy", interactive=False)
        word_diff_view = gr.HTML(label="Word-Level Analysis")
        char_diff_view = gr.HTML(label="Character-Level Analysis")
        # Hidden field: receives the last element of the analysis result.
        reference_box = gr.Textbox(
            label="🎯 Reference", interactive=False, visible=False
        )

        # --- event wiring (registration order kept as in the layout above) ---
        generate_btn.click(
            fn=get_random_sentence,
            inputs=[language_dd],
            outputs=[sentence_box],
        )
        run_btn.click(
            fn=compare_pronunciation,
            inputs=[recorder, language_dd, sentence_box],
            outputs=[
                status_box,
                primary_box,
                specialized_box,
                wer_box,
                cer_box,
                word_diff_view,
                char_diff_view,
                reference_box,
            ],
        )
        language_dd.change(
            fn=get_random_sentence,
            inputs=[language_dd],
            outputs=[sentence_box],
        )

    return demo
 # ---------------- LAUNCH ---------------- #
 # Build and serve the UI only when this file is run as a script, not when
 # it is imported.
 if __name__ == "__main__":
     demo = create_interface()
     # Listen on all network interfaces, port 7860.
     # NOTE(review): share=True also requests a public Gradio share link;
     # confirm that is wanted wherever this is deployed.
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)