Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Running on Zero

App Files Files Community

sudhanm committed 13 days ago

Commit

2175de9

verified ·

1 Parent(s): 494a527

Update app.py

Browse files

Files changed (1) hide show

app.py +310 -181

app.py CHANGED Viewed

@@ -9,10 +9,9 @@ import numpy as np
 from transformers import (
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
-    AutoTokenizer,
-    AutoModel
 )
-from TTS.api import TTS
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
@@ -22,6 +21,7 @@ warnings.filterwarnings("ignore")
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 LANG_CODES = {
     "English": "en",
@@ -31,21 +31,22 @@ LANG_CODES = {
     "Sanskrit": "sa"
 }
-# AI4Bharat model configurations
 ASR_MODELS = {
     "English": "openai/whisper-base.en",
-    "Tamil": "ai4bharat/whisper-medium-ta",
-    "Malayalam": "ai4bharat/whisper-medium-ml",
-    "Hindi": "ai4bharat/whisper-medium-hi",
-    "Sanskrit": "ai4bharat/whisper-medium-hi"  # Fallback to Hindi for Sanskrit
 }
-TTS_MODELS = {
-    "English": "tts_models/en/ljspeech/tacotron2-DDC",
-    "Tamil": "tts_models/ta/mai/tacotron2-DDC",
-    "Malayalam": "tts_models/ml/mai/tacotron2-DDC",
-    "Hindi": "tts_models/hi/mai/tacotron2-DDC",
-    "Sanskrit": "tts_models/hi/mai/tacotron2-DDC"  # Fallback to Hindi
 }
 LANG_PRIMERS = {
@@ -75,84 +76,103 @@ SENTENCE_BANK = {
         "Learning new languages opens many doors.",
         "I enjoy reading books in the evening.",
         "Technology has changed our daily lives.",
-        "Music brings people together across cultures."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
-        "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളത്തിന്റെ പ്രകൃതി സുന്ദരമാണ്.",
-        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്."
     ],
     "Hindi": [
         "आज मौसम बहुत अच्छा है।",
         "मुझे हिंदी बोलना पसंद है।",
         "मैं रोज किताब पढ़ता हूँ।",
         "भारत की संस्कृति विविधतापूर्ण है।",
-        "शिक्षा हमारे भविष्य की कुंजी है।"
     ],
     "Sanskrit": [
         "अहं ग्रन्थं पठामि।",
         "अद्य सूर्यः तेजस्वी अस्ति।",
         "मम नाम रामः।",
         "विद्या सर्वत्र पूज्यते।",
-        "सत्यमेव जयते।"
     ]
 }
 # ---------------- MODEL CACHE ---------------- #
 asr_models = {}
-tts_models = {}
 def load_asr_model(language):
-    """Load ASR model for specific language"""
     if language not in asr_models:
         try:
             model_name = ASR_MODELS[language]
-            print(f"Loading ASR model for {language}: {model_name}")
-            processor = AutoProcessor.from_pretrained(model_name)
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(DEVICE)
-            asr_models[language] = {"processor": processor, "model": model}
-            print(f"✅ ASR model loaded for {language}")
         except Exception as e:
-            print(f"❌ Failed to load ASR for {language}: {e}")
-            # Fallback to English model
             if language != "English":
-                print(f"🔄 Falling back to English ASR for {language}")
                 load_asr_model("English")
                 asr_models[language] = asr_models["English"]
     return asr_models[language]
-def load_tts_model(language):
-    """Load TTS model for specific language"""
-    if language not in tts_models:
-        try:
-            model_name = TTS_MODELS[language]
-            print(f"Loading TTS model for {language}: {model_name}")
-            tts = TTS(model_name=model_name).to(DEVICE)
-            tts_models[language] = tts
-            print(f"✅ TTS model loaded for {language}")
-        except Exception as e:
-            print(f"❌ Failed to load TTS for {language}: {e}")
-            # Fallback to English
-            if language != "English":
-                print(f"🔄 Falling back to English TTS for {language}")
-                load_tts_model("English")
-                tts_models[language] = tts_models["English"]
-    return tts_models[language]
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     """Get random sentence for practice"""
@@ -161,7 +181,9 @@ def get_random_sentence(language_choice):
 def is_script(text, lang_name):
     """Check if text is in expected script"""
     pattern = SCRIPT_PATTERNS.get(lang_name)
-    return bool(pattern.search(text)) if pattern else True
 def transliterate_to_hk(text, lang_choice):
     """Transliterate Indic text to Harvard-Kyoto"""
@@ -177,7 +199,8 @@ def transliterate_to_hk(text, lang_choice):
     if script and is_script(text, lang_choice):
         try:
             return transliterate(text, script, sanscript.HK)
-        except:
             return text
     return text
@@ -188,70 +211,89 @@ def preprocess_audio(audio_path, target_sr=16000):
         audio, sr = librosa.load(audio_path, sr=target_sr)
         # Normalize audio
-        audio = audio / np.max(np.abs(audio))
-        # Remove silence
         audio, _ = librosa.effects.trim(audio, top_db=20)
         return audio, target_sr
     except Exception as e:
         print(f"Audio preprocessing error: {e}")
         return None, None
-def transcribe_with_ai4bharat(audio_path, language, initial_prompt=""):
-    """Transcribe audio using AI4Bharat models"""
     try:
-        # Load model
         asr_components = load_asr_model(language)
         processor = asr_components["processor"]
         model = asr_components["model"]
         # Preprocess audio
         audio, sr = preprocess_audio(audio_path)
         if audio is None:
-            return "Error: Could not process audio"
         # Prepare inputs
-        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
-        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
         # Generate transcription
         with torch.no_grad():
-            predicted_ids = model.generate(**inputs, max_length=200)
         # Decode
-        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription.strip()
     except Exception as e:
         print(f"Transcription error for {language}: {e}")
-        return f"Error: Transcription failed - {str(e)}"
-def synthesize_with_ai4bharat(text, language):
-    """Synthesize speech using AI4Bharat TTS"""
-    if not text.strip():
-        return None
-    try:
-        # Load TTS model
-        tts = load_tts_model(language)
-        # Generate audio
-        audio_path = f"/tmp/tts_output_{hash(text)}.wav"
-        tts.tts_to_file(text=text, file_path=audio_path)
-        # Load generated audio
-        audio, sr = librosa.load(audio_path, sr=22050)
-        return sr, audio
-    except Exception as e:
-        print(f"TTS error for {language}: {e}")
-        return None
 def highlight_differences(ref, hyp):
-    """Highlight word-level differences"""
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
@@ -260,176 +302,249 @@ def highlight_differences(ref, hyp):
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out_html.extend([f"<span style='color:green; font-weight:bold'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'replace':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
-            out_html.extend([f"<span style='color:orange; font-weight:bold'> → {w}</span>" for w in hyp_words[j1:j2]])
         elif tag == 'delete':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
-            out_html.extend([f"<span style='color:orange; font-weight:bold'>+{w}</span>" for w in hyp_words[j1:j2]])
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
     """Highlight character-level differences"""
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
         elif tag in ('replace', 'delete'):
-            out.extend([f"<span style='color:red; text-decoration:underline; font-weight:bold'>{c}</span>" for c in ref[i1:i2]])
         elif tag == 'insert':
-            out.extend([f"<span style='color:orange; background-color:yellow'>{c}</span>" for c in hyp[j1:j2]])
     return "".join(out)
 # ---------------- MAIN FUNCTION ---------------- #
 def compare_pronunciation(audio, language_choice, intended_sentence):
     """Main function to compare pronunciation"""
-    if audio is None or not intended_sentence.strip():
-        return ("❌ No audio or intended sentence provided.", "", "", "", "", "",
-                None, None, "", "")
     try:
-        print(f"Processing audio for {language_choice}")
         # Pass 1: Raw transcription
         primer_weak, _ = LANG_PRIMERS[language_choice]
-        actual_text = transcribe_with_ai4bharat(audio, language_choice, primer_weak)
-        # Pass 2: Target-biased transcription
         _, primer_strong = LANG_PRIMERS[language_choice]
-        strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
-        corrected_text = transcribe_with_ai4bharat(audio, language_choice, strict_prompt)
-        # Error metrics
         try:
             wer_val = jiwer.wer(intended_sentence, actual_text)
             cer_val = jiwer.cer(intended_sentence, actual_text)
-        except:
             wer_val, cer_val = 1.0, 1.0
-        # Transliteration
         hk_translit = transliterate_to_hk(actual_text, language_choice)
-        if not is_script(actual_text, language_choice):
-            hk_translit = f"⚠️ Script mismatch: expected {language_choice} script"
         # Visual feedback
         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)
-        # TTS synthesis
-        tts_intended = synthesize_with_ai4bharat(intended_sentence, language_choice)
-        tts_actual = synthesize_with_ai4bharat(actual_text, language_choice)
-        # Status message
-        status = f"✅ Analysis complete for {language_choice}"
-        if wer_val < 0.1:
-            status += " - Excellent pronunciation! 🎉"
-        elif wer_val < 0.3:
-            status += " - Good pronunciation! 👍"
-        elif wer_val < 0.5:
-            status += " - Needs improvement 📚"
-        else:
-            status += " - Keep practicing! 💪"
         return (
             status,
-            actual_text,
-            corrected_text,
             hk_translit,
-            f"{wer_val:.3f}",
-            f"{cer_val:.3f}",
             diff_html,
-            tts_intended,
-            tts_actual,
             char_html,
-            intended_sentence
         )
     except Exception as e:
-        error_msg = f"❌ Error during analysis: {str(e)}"
-        print(error_msg)
-        return (error_msg, "", "", "", "", "", None, None, "", "")
 # ---------------- UI ---------------- #
 def create_interface():
-    with gr.Blocks(title="🎙️ AI4Bharat Pronunciation Trainer", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
-        # 🎙️ AI4Bharat Pronunciation Trainer
-        Practice pronunciation in **Tamil, Malayalam, Hindi, Sanskrit & English** using state-of-the-art AI4Bharat models!
-        📋 **How to use:**
-        1. Select your target language
-        2. Generate a practice sentence
-        3. Record yourself reading it aloud
-        4. Get detailed feedback with error analysis
         """)
         with gr.Row():
-            with gr.Column(scale=2):
                 lang_choice = gr.Dropdown(
                     choices=list(LANG_CODES.keys()),
                     value="Tamil",
-                    label="🌍 Select Language"
                 )
             with gr.Column(scale=1):
-                gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")
         intended_display = gr.Textbox(
             label="📝 Practice Sentence (Read this aloud)",
-            placeholder="Click 'Generate Practice Sentence' to get started...",
             interactive=False,
-            lines=2
         )
-        with gr.Row():
-            audio_input = gr.Audio(
-                sources=["microphone", "upload"],
-                type="filepath",
-                label="🎤 Record Your Pronunciation"
-            )
         analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary", size="lg")
-        status_output = gr.Textbox(label="📊 Analysis Status", interactive=False)
         with gr.Row():
             with gr.Column():
-                pass1_out = gr.Textbox(label="🎯 What You Actually Said", interactive=False)
-                wer_out = gr.Textbox(label="📈 Word Error Rate (lower = better)", interactive=False)
             with gr.Column():
-                pass2_out = gr.Textbox(label="🔧 Target-Biased Output", interactive=False)
-                cer_out = gr.Textbox(label="📊 Character Error Rate (lower = better)", interactive=False)
-        hk_out = gr.Textbox(label="🔤 Romanization (Harvard-Kyoto)", interactive=False)
-        with gr.Accordion("📝 Detailed Feedback", open=True):
-            diff_html_box = gr.HTML(label="🔍 Word-Level Differences")
-            char_html_box = gr.HTML(label="🔤 Character-Level Analysis")
-        with gr.Row():
-            intended_tts_audio = gr.Audio(label="🔊 Reference Pronunciation", type="numpy")
-            actual_tts_audio = gr.Audio(label="🔊 Your Pronunciation (TTS)", type="numpy")
-        gr.Markdown("""
-        ### 🎨 Color Guide:
-        - 🟢 **Green**: Correctly pronounced
-        - 🔴 **Red**: Missing or incorrect words
-        - 🟠 **Orange**: Extra or substituted words
-        - 🟡 **Yellow background**: Inserted characters
-        """)
         # Event handlers
         gen_btn.click(
-            fn=get_random_sentence,
             inputs=[lang_choice],
-            outputs=[intended_display]
         )
         analyze_btn.click(
@@ -438,8 +553,7 @@ def create_interface():
             outputs=[
                 status_output, pass1_out, pass2_out, hk_out,
                 wer_out, cer_out, diff_html_box,
-                intended_tts_audio, actual_tts_audio,
-                char_html_box, intended_display
             ]
         )
@@ -449,26 +563,41 @@ def create_interface():
             inputs=[lang_choice],
             outputs=[intended_display]
         )
     return demo
 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
-    print("🚀 Starting AI4Bharat Pronunciation Trainer...")
-    # Pre-load English models for faster startup
-    print("📦 Pre-loading English models...")
     try:
         load_asr_model("English")
-        load_tts_model("English")
-        print("✅ English models loaded successfully")
     except Exception as e:
-        print(f"⚠️ Warning: Could not pre-load English models: {e}")
     demo = create_interface()
     demo.launch(
         share=True,
         show_error=True,
         server_name="0.0.0.0",
-        server_port=7860
     )

 from transformers import (
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
+    WhisperProcessor,
+    WhisperForConditionalGeneration
 )
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"🔧 Using device: {DEVICE}")
 LANG_CODES = {
     "English": "en",
     "Sanskrit": "sa"
 }
+# Primary ASR checkpoint per language; load_asr_model tries these first.
 ASR_MODELS = {
     "English": "openai/whisper-base.en",
+    "Tamil": "vasista22/whisper-tamil-base",  # Community model for Tamil
+    "Malayalam": "parambharat/whisper-small-ml",  # Community model for Malayalam
+    "Hindi": "vasista22/whisper-hindi-base",  # Community model for Hindi
+    "Sanskrit": "vasista22/whisper-hindi-base"  # No Sanskrit entry of its own: reuses the Hindi checkpoint
 }
+# Second-tier checkpoints: load_asr_model loads these when the primary ASR_MODELS entry for a language fails.
+FALLBACK_MODELS = {
+    "English": "openai/whisper-base.en",
+    "Tamil": "openai/whisper-small",
+    "Malayalam": "openai/whisper-small",
+    "Hindi": "openai/whisper-small",
+    "Sanskrit": "openai/whisper-small"
 }
 LANG_PRIMERS = {
         "Learning new languages opens many doors.",
         "I enjoy reading books in the evening.",
         "Technology has changed our daily lives.",
+        "Music brings people together across cultures.",
+        "Education is the key to a bright future.",
+        "The flowers bloom beautifully in spring.",
+        "Hard work always pays off in the end."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
+        "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்.",
+        "கல்வி நமது எதிர்காலத்தின் திறவுகோல்.",
+        "பறவைகள் காலையில் இனிமையாக பாடுகின்றன.",
+        "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളത്തിന്റെ പ്രകൃതി സുന്ദരമാണ്.",
+        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്.",
+        "സംഗീതം മനസ്സിന് സന്തോഷം നൽകുന്നു.",
+        "കുടുംബസമയം വളരെ വിലപ്പെട്ടതാണ്.",
+        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
     ],
     "Hindi": [
         "आज मौसम बहुत अच्छा है।",
         "मुझे हिंदी बोलना पसंद है।",
         "मैं रोज किताब पढ़ता हूँ।",
         "भारत की संस्कृति विविधतापूर्ण है।",
+        "शिक्षा हमारे भविष्य की कुंजी है।",
+        "संगीत हमारे दिल को छूता है।",
+        "परिवार के साथ समय बिताना अनमोल है।",
+        "मेहनत का फल हमेशा मीठा होता है।"
     ],
     "Sanskrit": [
         "अहं ग्रन्थं पठामि।",
         "अद्य सूर्यः तेजस्वी अस्ति।",
         "मम नाम रामः।",
         "विद्या सर्वत्र पूज्यते।",
+        "सत्यमेव जयते।",
+        "गुरुर्ब्रह्मा गुरुर्विष्णुः।",
+        "वसुधैव कुटुम्बकम्।",
+        "श्रम एव विजयते।"
     ]
 }
 # ---------------- MODEL CACHE ---------------- #
 asr_models = {}
 def load_asr_model(language):
     """Return the cached ASR components for ``language``, loading them on first use.

     Loading is attempted in up to three tiers:

     1. the primary checkpoint named in ``ASR_MODELS``;
     2. the checkpoint named in ``FALLBACK_MODELS``;
     3. for any language other than ``"English"``, the English components.

     A language missing from ``ASR_MODELS`` skips straight to tier 3.

     Args:
         language: Language name used as the key into ``ASR_MODELS`` and
             ``FALLBACK_MODELS`` (for example ``"Tamil"``).

     Returns:
         A dict with the keys ``"processor"``, ``"model"`` and
         ``"model_name"``. The same dict is stored in ``asr_models`` so that
         later calls for the language return immediately.

     Raises:
         RuntimeError: If every tier failed, i.e. not even the English model
             could be loaded. The underlying error is attached as the cause.
     """
     if language in asr_models:
         return asr_models[language]

     # One dtype decision shared by the primary and the fallback load.
     dtype = torch.float16 if DEVICE == "cuda" else torch.float32

     try:
         model_name = ASR_MODELS[language]
         print(f"🔄 Loading ASR model for {language}: {model_name}")
         try:
             processor = AutoProcessor.from_pretrained(model_name)
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_name,
                 torch_dtype=dtype,
                 low_cpu_mem_usage=True,
                 use_safetensors=True,
             ).to(DEVICE)
             asr_models[language] = {
                 "processor": processor,
                 "model": model,
                 "model_name": model_name,
             }
             print(f"✅ Primary ASR model loaded for {language}")
             return asr_models[language]
         except Exception as e:
             print(f"⚠️ Primary model failed for {language}: {e}")
             print("🔄 Trying fallback model...")
             fallback_name = FALLBACK_MODELS[language]
             processor = WhisperProcessor.from_pretrained(fallback_name)
             model = WhisperForConditionalGeneration.from_pretrained(
                 fallback_name,
                 torch_dtype=dtype,
                 low_cpu_mem_usage=True,
             ).to(DEVICE)
             asr_models[language] = {
                 "processor": processor,
                 "model": model,
                 "model_name": fallback_name,
             }
             print(f"✅ Fallback ASR model loaded for {language}")
     except Exception as e:
         print(f"❌ Failed to load any ASR model for {language}: {e}")
         if language == "English":
             # Nothing is left to fall back to. Fail with a clear message
             # instead of the bare KeyError the cache lookup below would raise.
             raise RuntimeError(
                 "Could not load any ASR model (English fallback failed)"
             ) from e
         print(f"🔄 Using English ASR as final fallback for {language}")
         # Share the English components under this language's cache key.
         asr_models[language] = load_asr_model("English")
     return asr_models[language]
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     """Get random sentence for practice"""
 def is_script(text, lang_name):
     """Tell whether ``text`` matches the script pattern stored for ``lang_name``.

     The pattern is looked up in ``SCRIPT_PATTERNS``; when the language has no
     (truthy) entry there, the text is accepted unconditionally.
     """
     script_re = SCRIPT_PATTERNS.get(lang_name)
     return bool(script_re.search(text)) if script_re else True
 def transliterate_to_hk(text, lang_choice):
     """Transliterate Indic text to Harvard-Kyoto"""
     if script and is_script(text, lang_choice):
         try:
             return transliterate(text, script, sanscript.HK)
+        except Exception as e:
+            print(f"Transliteration error: {e}")
             return text
     return text
         audio, sr = librosa.load(audio_path, sr=target_sr)
         # Normalize audio
+        if np.max(np.abs(audio)) > 0:
+            audio = audio / np.max(np.abs(audio))
+        # Remove silence from beginning and end
         audio, _ = librosa.effects.trim(audio, top_db=20)
+        # Ensure minimum length
+        if len(audio) < target_sr * 0.1:  # Less than 0.1 seconds
+            return None, None
         return audio, target_sr
     except Exception as e:
         print(f"Audio preprocessing error: {e}")
         return None, None
def transcribe_audio(audio_path, language, initial_prompt="", force_language=True):
    """Transcribe an audio file with the ASR model loaded for ``language``.

    Args:
        audio_path: Path passed on to ``preprocess_audio``.
        language: Language name; selects the model via ``load_asr_model`` and
            the language code via ``LANG_CODES`` (defaulting to ``"en"``).
        initial_prompt: Accepted for interface compatibility.
            NOTE(review): this value is not passed to the processor or to
            ``generate`` here, so calls that differ only in their prompt
            decode the same way. Wire it in or drop it.
        force_language: When true, request decoder prompt ids for the
            language with task ``"transcribe"`` and pass them to ``generate``.

    Returns:
        The stripped transcription, or a string starting with ``"Error:"``
        when preprocessing or transcription fails. Exceptions are caught and
        reported through the return value instead of propagating.
    """
    try:
        asr_components = load_asr_model(language)
        processor = asr_components["processor"]
        model = asr_components["model"]

        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Audio too short or could not be processed"

        # Use the feature extractor's default padding. Whisper models expect
        # a fixed-length feature window, which `padding=True` (pad to the
        # longest clip in the batch) does not produce for a single short clip.
        # NOTE(review): confirm against the pinned transformers version.
        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")

        # Match the model's dtype as well as its device: the model is loaded
        # in float16 on CUDA while the extractor emits float32 features.
        input_features = inputs.input_features.to(DEVICE, dtype=model.dtype)

        generate_kwargs = {
            "input_features": input_features,
            "max_length": 200,
            "num_beams": 5,
            "temperature": 0.0,
            "do_sample": False,
        }

        if force_language and hasattr(model.config, "forced_decoder_ids"):
            lang_code = LANG_CODES.get(language, "en")
            try:
                generate_kwargs["forced_decoder_ids"] = processor.get_decoder_prompt_ids(
                    language=lang_code,
                    task="transcribe",
                )
            except Exception as e:
                # Best effort: decode without language forcing, but say so
                # instead of swallowing the failure silently.
                print(f"Language forcing skipped for {language}: {e}")

        with torch.no_grad():
            predicted_ids = model.generate(**generate_kwargs)

        transcription = processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )[0]
        return transcription.strip()
    except Exception as e:
        print(f"Transcription error for {language}: {e}")
        return f"Error: Transcription failed - {str(e)[:100]}"
 def highlight_differences(ref, hyp):
+    """Highlight word-level differences with better styling"""
+    if not ref.strip() or not hyp.strip():
+        return "No text to compare"
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
+            out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'replace':
+            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
+            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>→{w}</span>" for w in hyp_words[j1:j2]])
         elif tag == 'delete':
+            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
+            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
     """Render a character-level diff of ``hyp`` against ``ref`` as HTML.

     Args:
         ref: Reference text.
         hyp: Text to compare with the reference.

     Returns:
         ``"No text to compare"`` when either string is empty or whitespace
         only. Otherwise a concatenation of ``<span>`` elements, one per
         character: green for matching reference characters, red/underlined
         for reference characters that were replaced or deleted, and orange
         for characters that appear only in ``hyp``. Every character is
         HTML-escaped so that ``<``, ``>``, ``&`` and quotes in either text
         cannot break or inject markup.
     """
     import html  # function-scope import keeps this block self-contained

     if not ref.strip() or not hyp.strip():
         return "No text to compare"

     equal_style = "color:green; background-color:#e8f5e8;"
     wrong_style = (
         "color:red; text-decoration:underline; "
         "background-color:#ffe8e8; font-weight:bold;"
     )
     extra_style = "color:orange; background-color:#fff3cd; font-weight:bold;"

     def span(style, chars):
         return [
             f"<span style='{style}'>{html.escape(c)}</span>" for c in chars
         ]

     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
             out.extend(span(equal_style, ref[i1:i2]))
         elif tag in ('replace', 'delete'):
             # Only the reference side is shown for a replacement.
             out.extend(span(wrong_style, ref[i1:i2]))
         elif tag == 'insert':
             out.extend(span(extra_style, hyp[j1:j2]))
     return "".join(out)
def get_pronunciation_score(wer_val, cer_val):
    """Turn word and character error rates into a grade label and a feedback line.

    The two rates are blended 70/30 in favour of the word error rate, and the
    blended error is matched against ascending upper bounds; the first bound
    it does not exceed decides the grade.

    Returns:
        A ``(label, feedback)`` tuple of strings.
    """
    weighted_error = (wer_val * 0.7) + (cer_val * 0.3)

    # (inclusive upper bound on the blended error, label, feedback)
    grades = (
        (0.1, "🏆 Excellent! (90%+)", "Your pronunciation is outstanding!"),
        (0.2, "🎉 Very Good! (80-90%)", "Great pronunciation with minor areas for improvement."),
        (0.4, "👍 Good! (60-80%)", "Good effort! Keep practicing for better accuracy."),
        (0.6, "📚 Needs Practice (40-60%)", "Focus on clearer pronunciation of highlighted words."),
    )
    for upper_bound, label, advice in grades:
        if weighted_error <= upper_bound:
            return label, advice

    # Anything above the last bound lands in the lowest grade.
    return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
 # ---------------- MAIN FUNCTION ---------------- #
 def compare_pronunciation(audio, language_choice, intended_sentence):
     """Score a recording against the sentence the speaker was asked to read.

     Args:
         audio: Audio path forwarded to ``transcribe_audio``; ``None`` means
             nothing was recorded.
         language_choice: Language name; must be a key of ``LANG_PRIMERS``.
         intended_sentence: The sentence the speaker was meant to read.

     Returns:
         A 10-tuple of strings, in order: status message, raw transcription,
         target-biased transcription, romanization (or a script warning),
         word-error text, character-error text, word-level diff HTML,
         character-level diff HTML, the intended sentence, and a
         ``"🎯 Target: ..."`` label. On any failure the first slot carries
         the error message and the other nine are empty strings. Exceptions
         are reported this way instead of being raised.
     """
     def failure(message):
         # Same 10-slot shape as the success tuple.
         return (message,) + ("",) * 9

     if audio is None:
         return failure("❌ Please record audio first.")
     # `not intended_sentence` also covers None, which has no .strip().
     if not intended_sentence or not intended_sentence.strip():
         return failure("❌ Please generate a practice sentence first.")

     try:
         print(f"🔍 Analyzing pronunciation for {language_choice}...")
         primer_weak, primer_strong = LANG_PRIMERS[language_choice]

         # Pass 1: raw transcription.
         actual_text = transcribe_audio(audio, language_choice, primer_weak, force_language=True)
         # Stop before the second model pass if the first one already failed.
         if actual_text.startswith("Error:"):
             return failure(f"❌ {actual_text}")

         # Pass 2: target-biased transcription with the stronger prompt.
         strict_prompt = f"{primer_strong}\nExpected: {intended_sentence}"
         corrected_text = transcribe_audio(audio, language_choice, strict_prompt, force_language=True)

         try:
             wer_val = jiwer.wer(intended_sentence, actual_text)
             cer_val = jiwer.cer(intended_sentence, actual_text)
         except Exception as e:
             print(f"Error calculating metrics: {e}")
             wer_val, cer_val = 1.0, 1.0

         score_text, feedback = get_pronunciation_score(wer_val, cer_val)

         hk_translit = transliterate_to_hk(actual_text, language_choice)
         if language_choice != "English" and not is_script(actual_text, language_choice):
             hk_translit = f"⚠️ Expected {language_choice} script, got mixed/other script"

         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)

         # Error rates can exceed 1.0 (many insertions); never show a
         # negative accuracy percentage.
         word_accuracy = max(0.0, 1 - wer_val) * 100
         char_accuracy = max(0.0, 1 - cer_val) * 100

         status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}"
         return (
             status,
             actual_text or "(No transcription)",
             corrected_text or "(No corrected transcription)",
             hk_translit,
             f"{wer_val:.3f} ({word_accuracy:.1f}% word accuracy)",
             f"{cer_val:.3f} ({char_accuracy:.1f}% character accuracy)",
             diff_html,
             char_html,
             intended_sentence,
             f"🎯 Target: {intended_sentence}"
         )
     except Exception as e:
         error_msg = f"❌ Analysis Error: {str(e)[:200]}"
         print(f"Analysis error: {e}")
         return failure(error_msg)
 # ---------------- UI ---------------- #
 def create_interface():
+    with gr.Blocks(
+        title="🎙️ Multilingual Pronunciation Trainer",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {max-width: 1200px !important}
+        .feedback-box {font-size: 18px !important; font-weight: bold !important}
+        """
+    ) as demo:
         gr.Markdown("""
+        # 🎙️ Multilingual Pronunciation Trainer
+        **Practice pronunciation in Tamil, Malayalam, Hindi, Sanskrit & English** using advanced speech recognition!
+        ### 📋 How to Use:
+        1. **Select** your target language 🌍
+        2. **Generate** a practice sentence 🎲
+        3. **Record** yourself reading it aloud 🎤
+        4. **Get** detailed feedback with accuracy metrics 📊
+        ### 🎯 Features:
+        - **Dual-pass analysis** for accurate assessment
+        - **Visual highlighting** of pronunciation errors
+        - **Romanization** for Indic scripts
+        - **Detailed metrics** (Word & Character accuracy)
         """)
         with gr.Row():
+            with gr.Column(scale=3):
                 lang_choice = gr.Dropdown(
                     choices=list(LANG_CODES.keys()),
                     value="Tamil",
+                    label="🌍 Select Language",
+                    info="Choose the language you want to practice"
                 )
             with gr.Column(scale=1):
+                gen_btn = gr.Button("🎲 Generate Sentence", variant="primary", size="lg")
         intended_display = gr.Textbox(
             label="📝 Practice Sentence (Read this aloud)",
+            placeholder="Click 'Generate Sentence' to get started...",
             interactive=False,
+            lines=3,
+            show_copy_button=True
         )
+        audio_input = gr.Audio(
+            sources=["microphone", "upload"],
+            type="filepath",
+            label="🎤 Record Your Pronunciation",
+            info="Record yourself reading the sentence above"
+        )
         analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary", size="lg")
+        status_output = gr.Textbox(
+            label="📊 Analysis Results",
+            interactive=False,
+            lines=3,
+            elem_classes=["feedback-box"]
+        )
         with gr.Row():
             with gr.Column():
+                pass1_out = gr.Textbox(
+                    label="🎯 What You Actually Said (Raw Output)",
+                    interactive=False,
+                    lines=2
+                )
+                wer_out = gr.Textbox(
+                    label="📈 Word Accuracy",
+                    interactive=False,
+                    info="Higher percentage = better pronunciation"
+                )
             with gr.Column():
+                pass2_out = gr.Textbox(
+                    label="🔧 Target-Biased Analysis",
+                    interactive=False,
+                    lines=2,
+                    info="What the model thinks you meant to say"
+                )
+                cer_out = gr.Textbox(
+                    label="📊 Character Accuracy",
+                    interactive=False,
+                    info="Character-level pronunciation accuracy"
+                )
+        hk_out = gr.Textbox(
+            label="🔤 Romanization (Harvard-Kyoto)",
+            interactive=False,
+            info="Romanized version for easier analysis",
+            show_copy_button=True
+        )
+        with gr.Accordion("📝 Detailed Visual Feedback", open=True):
+            gr.Markdown("""
+            ### 🎨 Color Guide:
+            - 🟢 **Green**: Correctly pronounced words/characters
+            - 🔴 **Red**: Missing or mispronounced (strikethrough)
+            - 🟠 **Orange**: Extra words or substitutions
+            """)
+            diff_html_box = gr.HTML(
+                label="🔍 Word-Level Analysis",
+                show_label=True
+            )
+            char_html_box = gr.HTML(
+                label="🔤 Character-Level Analysis",
+                show_label=True
+            )
+        target_display = gr.Textbox(
+            label="🎯 Reference Text",
+            interactive=False,
+            visible=False
+        )
         # Event handlers
+        def generate_and_clear(language):
+            sentence = get_random_sentence(language)
+            return sentence, "", "", "", "", "", "", "", "", ""
         gen_btn.click(
+            fn=generate_and_clear,
             inputs=[lang_choice],
+            outputs=[
+                intended_display, status_output, pass1_out, pass2_out,
+                hk_out, wer_out, cer_out, diff_html_box, char_html_box, target_display
+            ]
         )
         analyze_btn.click(
             outputs=[
                 status_output, pass1_out, pass2_out, hk_out,
                 wer_out, cer_out, diff_html_box,
+                char_html_box, intended_display, target_display
             ]
         )
             inputs=[lang_choice],
             outputs=[intended_display]
         )
+        # Footer
+        gr.Markdown("""
+        ---
+        ### 🔧 Technical Details:
+        - **ASR Models**: Community-trained Whisper models optimized for Indic languages
+        - **Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
+        - **Transliteration**: Harvard-Kyoto system for Indic scripts
+        - **Analysis**: Dual-pass approach for comprehensive feedback
+        **Note**: TTS (Text-to-Speech) reference audio will be added in future updates.
+        """)
     return demo
 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
     # Script entry point: print startup diagnostics, try to warm the English
     # ASR model, then build the Gradio UI and serve it.
     print("🚀 Starting Multilingual Pronunciation Trainer...")
     print(f"🔧 Device: {DEVICE}")
     print(f"🔧 PyTorch version: {torch.__version__}")
     # Pre-load English model for faster startup
     # NOTE(review): presumably load_asr_model caches the loaded model so later
     # requests reuse it — confirm against its definition.
     print("📦 Pre-loading English model...")
     try:
         load_asr_model("English")
         print("✅ English model loaded successfully")
     except Exception as e:
         # Deliberate best-effort at the top-level boundary: a failed warm-up
         # is reported but must not stop the UI from starting.
         print(f"⚠️ Warning: Could not pre-load English model: {e}")
     demo = create_interface()
     # `show_tips` and `enable_queue` were dropped from Blocks.launch() in
     # Gradio 4 (the version implied by `gr.Audio(sources=...)` used in this
     # file); passing them raises TypeError at startup. Queuing is enabled by
     # default in Gradio 4, so no replacement call is needed.
     # NOTE(review): share=True is kept as-is; it is likely redundant when the
     # app is hosted on Hugging Face Spaces — confirm and drop if so.
     demo.launch(
         share=True,
         show_error=True,
         server_name="0.0.0.0",
         server_port=7860,
     )