sudhanm commited on
Commit
5a75be5
·
verified ·
1 Parent(s): 9d552f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +385 -162
app.py CHANGED
@@ -4,41 +4,66 @@ import difflib
4
  import re
5
  import jiwer
6
  import torch
7
- from parler_tts import ParlerTTSForConditionalGeneration
8
- from transformers import AutoTokenizer
9
- from faster_whisper import WhisperModel
 
 
 
 
 
 
 
 
10
  from indic_transliteration import sanscript
11
  from indic_transliteration.sanscript import transliterate
12
- import soundfile as sf
 
13
 
14
  # ---------------- CONFIG ---------------- #
15
- MODEL_NAME = "large-v2"
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
  LANG_CODES = {
19
  "English": "en",
20
- "Tamil": "ta",
21
  "Malayalam": "ml",
22
  "Hindi": "hi",
23
  "Sanskrit": "sa"
24
  }
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  LANG_PRIMERS = {
27
- "English": ("The transcript should be in English only.",
28
- "Write only in English without translation. Example: This is an English sentence."),
29
- "Tamil": ("เฎจเฎ•เฎฒเฏ เฎคเฎฎเฎฟเฎดเฏ เฎŽเฎดเฏเฎคเฏเฎคเฏเฎ•เฏเฎ•เฎณเฎฟเฎฒเฏ เฎฎเฎŸเฏเฎŸเฏเฎฎเฏ เฎ‡เฎฐเฏเฎ•เฏเฎ• เฎตเฏ‡เฎฃเฏเฎŸเฏเฎฎเฏ.",
30
- "เฎคเฎฎเฎฟเฎดเฏ เฎŽเฎดเฏเฎคเฏเฎคเฏเฎ•เฏเฎ•เฎณเฎฟเฎฒเฏ เฎฎเฎŸเฏเฎŸเฏเฎฎเฏ เฎŽเฎดเฏเฎคเฎตเฏเฎฎเฏ, เฎฎเฏŠเฎดเฎฟเฎชเฏ†เฎฏเฎฐเฏเฎชเฏเฎชเฏ เฎšเฏ†เฎฏเฏเฎฏเฎ•เฏเฎ•เฏ‚เฎŸเฎพเฎคเฏ. เฎ‰เฎคเฎพเฎฐเฎฃเฎฎเฏ: เฎ‡เฎคเฏ เฎ’เฎฐเฏ เฎคเฎฎเฎฟเฎดเฏ เฎตเฎพเฎ•เฏเฎ•เฎฟเฎฏเฎฎเฏ."),
31
- "Malayalam": ("เดŸเตเดฐเดพเตปเดธเตเด–เตเดฐเดฟเดชเตเดฑเตเดฑเต เดฎเดฒเดฏเดพเดณ เดฒเดฟเดชเดฟเดฏเดฟเตฝ เด†เดฏเดฟเดฐเดฟเด•เตเด•เดฃเด‚.",
32
- "เดฎเดฒเดฏเดพเดณ เดฒเดฟเดชเดฟเดฏเดฟเตฝ เดฎเดพเดคเตเดฐเด‚ เดŽเดดเตเดคเตเด•, เดตเดฟเดตเตผเดคเตเดคเดจเด‚ เดšเต†เดฏเตเดฏเดฐเตเดคเต. เด‰เดฆเดพเดนเดฐเดฃเด‚: เด‡เดคเตŠเดฐเต เดฎเดฒเดฏเดพเดณ เดตเดพเด•เตเดฏเดฎเดพเดฃเต. เดŽเดจเดฟเด•เตเด•เต เดฎเดฒเดฏเดพเดณเด‚ เด…เดฑเดฟเดฏเดพเด‚."),
33
- "Hindi": ("เคชเฅเคฐเคคเคฟเคฒเคฟเคชเคฟ เค•เฅ‡เคตเคฒ เคฆเฅ‡เคตเคจเคพเค—เคฐเฅ€ เคฒเคฟเคชเคฟ เคฎเฅ‡เค‚ เคนเฅ‹เคจเฅ€ เคšเคพเคนเคฟเคเฅค",
34
- "เค•เฅ‡เคตเคฒ เคฆเฅ‡เคตเคจเคพเค—เคฐเฅ€ เคฒเคฟเคชเคฟ เคฎเฅ‡เค‚ เคฒเคฟเค–เฅ‡เค‚, เค…เคจเฅเคตเคพเคฆ เคจ เค•เคฐเฅ‡เค‚เฅค เค‰เคฆเคพเคนเคฐเคฃ: เคฏเคน เคเค• เคนเคฟเค‚เคฆเฅ€ เคตเคพเค•เฅเคฏ เคนเฅˆเฅค"),
35
- "Sanskrit": ("เคชเฅเคฐเคคเคฟเคฒเคฟเคชเคฟ เค•เฅ‡เคตเคฒ เคฆเฅ‡เคตเคจเคพเค—เคฐเฅ€ เคฒเคฟเคชเคฟ เคฎเฅ‡เค‚ เคนเฅ‹เคจเฅ€ เคšเคพเคนเคฟเคเฅค",
36
- "เค•เฅ‡เคตเคฒ เคฆเฅ‡เคตเคจเคพเค—เคฐเฅ€ เคฒเคฟเคชเคฟ เคฎเฅ‡เค‚ เคฒเคฟเค–เฅ‡เค‚, เค…เคจเฅเคตเคพเคฆ เคจ เค•เคฐเฅ‡เค‚เฅค เค‰เคฆเคพเคนเคฐเคฃ: เค…เคนเค‚ เคธเค‚เคธเฅเค•เฅƒเคคเค‚ เคœเคพเคจเคพเคฎเคฟเฅค")
37
  }
38
 
39
  SCRIPT_PATTERNS = {
40
  "Tamil": re.compile(r"[เฎ€-เฏฟ]"),
41
- "Malayalam": re.compile(r"[เด€-เตฟ]"),
42
  "Hindi": re.compile(r"[เค€-เฅฟ]"),
43
  "Sanskrit": re.compile(r"[เค€-เฅฟ]"),
44
  "English": re.compile(r"[A-Za-z]")
@@ -46,206 +71,404 @@ SCRIPT_PATTERNS = {
46
 
47
  SENTENCE_BANK = {
48
  "English": [
49
- "The sun sets over the horizon.",
50
- "Learning languages is fun.",
51
- "I like to drink coffee in the morning."
 
 
52
  ],
53
  "Tamil": [
54
  "เฎ‡เฎฉเฏเฎฑเฏ เฎจเฎฒเฏเฎฒ เฎตเฎพเฎฉเฎฟเฎฒเฏˆ เฎ‰เฎณเฏเฎณเฎคเฏ.",
55
- "เฎจเฎพเฎฉเฏ เฎคเฎฎเฎฟเฎดเฏ เฎ•เฎฑเฏเฎฑเฏเฎ•เฏเฎ•เฏŠเฎฃเฏเฎŸเฏ เฎ‡เฎฐเฏเฎ•เฏเฎ•เฎฟเฎฑเฏ‡เฎฉเฏ.",
56
- "เฎŽเฎฉเฎ•เฏเฎ•เฏ เฎชเฏเฎคเฏเฎคเฎ•เฎฎเฏ เฎชเฎŸเฎฟเฎ•เฏเฎ• เฎตเฎฟเฎฐเฏเฎชเฏเฎชเฎฎเฏ."
 
 
57
  ],
58
  "Malayalam": [
59
  "เดŽเดจเดฟเด•เตเด•เต เดฎเดฒเดฏเดพเดณเด‚ เดตเดณเดฐเต† เด‡เดทเตเดŸเดฎเดพเดฃเต.",
60
  "เด‡เดจเตเดจเต เดฎเดดเดชเต†เดฏเตเดฏเตเดจเตเดจเต.",
61
- "เดžเดพเตป เดชเตเดธเตเดคเด•เด‚ เดตเดพเดฏเดฟเด•เตเด•เตเดจเตเดจเต."
 
 
62
  ],
63
  "Hindi": [
64
- "เค†เคœ เคฎเฅŒเคธเคฎ เค…เคšเฅเค›เคพ เคนเฅˆเฅค",
65
- "เคฎเฅเคเฅ‡ เคนเคฟเค‚เคฆเฅ€ เคฌเฅ‹เคฒเคจเคพ เคชเคธเค‚เคฆ เคนเฅˆเฅค",
66
- "เคฎเฅˆเค‚ เค•เคฟเคคเคพเคฌ เคชเคขเคผ เคฐเคนเคพ เคนเฅ‚เคเฅค"
 
 
67
  ],
68
  "Sanskrit": [
69
  "เค…เคนเค‚ เค—เฅเคฐเคจเฅเคฅเค‚ เคชเค เคพเคฎเคฟเฅค",
70
  "เค…เคฆเฅเคฏ เคธเฅ‚เคฐเฅเคฏเคƒ เคคเฅ‡เคœเคธเฅเคตเฅ€ เค…เคธเฅเคคเคฟเฅค",
71
- "เคฎเคฎ เคจเคพเคฎ เคฐเคพเคฎเคƒเฅค"
 
 
72
  ]
73
  }
74
 
75
- VOICE_STYLE = {
76
- "English": "An English female voice with a neutral Indian accent.",
77
- "Tamil": "A female speaker with a clear Tamil accent.",
78
- "Malayalam": "A female speaker with a clear Malayali accent.",
79
- "Hindi": "A female speaker with a neutral Hindi accent.",
80
- "Sanskrit": "A female speaker reading in classical Sanskrit style."
81
- }
82
-
83
- # ---------------- LOAD MODELS ---------------- #
84
- print("Loading Whisper model...")
85
- whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
86
-
87
- print("Loading Parler-TTS model...")
88
- parler_model_id = "parler-tts/parler-tts-mini-v1" # You may switch to larger models if desired
89
- parler_tts_model = ParlerTTSForConditionalGeneration.from_pretrained(parler_model_id).to(DEVICE)
90
- parler_tts_tokenizer = AutoTokenizer.from_pretrained(parler_model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  # ---------------- HELPERS ---------------- #
93
  def get_random_sentence(language_choice):
 
94
  return random.choice(SENTENCE_BANK[language_choice])
95
 
96
  def is_script(text, lang_name):
 
97
  pattern = SCRIPT_PATTERNS.get(lang_name)
98
  return bool(pattern.search(text)) if pattern else True
99
 
100
  def transliterate_to_hk(text, lang_choice):
 
101
  mapping = {
102
  "Tamil": sanscript.TAMIL,
103
- "Malayalam": sanscript.MALAYALAM,
104
  "Hindi": sanscript.DEVANAGARI,
105
  "Sanskrit": sanscript.DEVANAGARI,
106
  "English": None
107
  }
108
- return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
109
-
110
- def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
111
- segments, _ = whisper_model.transcribe(
112
- audio_path,
113
- language=lang_code,
114
- task="transcribe",
115
- initial_prompt=initial_prompt,
116
- beam_size=beam_size,
117
- temperature=temperature,
118
- condition_on_previous_text=condition_on_previous_text,
119
- word_timestamps=False
120
- )
121
- return "".join(s.text for s in segments).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  def highlight_differences(ref, hyp):
124
- ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
 
 
 
125
  sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
126
  out_html = []
 
127
  for tag, i1, i2, j1, j2 in sm.get_opcodes():
128
  if tag == 'equal':
129
- out_html.extend([f"<span style='color:green'>{w}</span>" for w in ref_words[i1:i2]])
130
  elif tag == 'replace':
131
- out_html.extend([f"<span style='color:red'>{w}</span>" for w in ref_words[i1:i2]])
132
- out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
133
  elif tag == 'delete':
134
- out_html.extend([f"<span style='color:red;text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
135
  elif tag == 'insert':
136
- out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
 
137
  return " ".join(out_html)
138
 
139
  def char_level_highlight(ref, hyp):
 
140
  sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
141
  out = []
 
142
  for tag, i1, i2, j1, j2 in sm.get_opcodes():
143
  if tag == 'equal':
144
  out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
145
  elif tag in ('replace', 'delete'):
146
- out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
147
  elif tag == 'insert':
148
- out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
 
149
  return "".join(out)
150
 
151
- def synthesize_tts(text, lang_choice):
152
- if not text.strip():
153
- return None
154
- description = VOICE_STYLE.get(lang_choice, "")
155
- description_input = parler_tts_tokenizer(description, return_tensors='pt').to(DEVICE)
156
- prompt_input = parler_tts_tokenizer(text, return_tensors='pt').to(DEVICE)
157
- generation = parler_tts_model.generate(
158
- input_ids=description_input.input_ids,
159
- attention_mask=description_input.attention_mask,
160
- prompt_input_ids=prompt_input.input_ids,
161
- prompt_attention_mask=prompt_input.attention_mask
162
- )
163
- audio_arr = generation.cpu().numpy().squeeze()
164
- # Parler-TTS default sample rate is 24000
165
- return 24000, audio_arr
166
-
167
- # ---------------- MAIN ---------------- #
168
- def compare_pronunciation(audio, language_choice, intended_sentence,
169
- pass1_beam, pass1_temp, pass1_condition):
170
  if audio is None or not intended_sentence.strip():
171
- return ("No audio or intended sentence.", "", "", "", "", "",
172
  None, None, "", "")
173
-
174
- lang_code = LANG_CODES[language_choice]
175
- primer_weak, primer_strong = LANG_PRIMERS[language_choice]
176
-
177
- # Pass 1: raw transcription with user-configured decoding parameters
178
- actual_text = transcribe_once(audio, lang_code, primer_weak,
179
- pass1_beam, pass1_temp, pass1_condition)
180
-
181
- # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
182
- strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
183
- corrected_text = transcribe_once(audio, lang_code, strict_prompt,
184
- beam_size=5, temperature=0.0, condition_on_previous_text=False)
185
-
186
- # Compute WER and CER
187
- wer_val = jiwer.wer(intended_sentence, actual_text)
188
- cer_val = jiwer.cer(intended_sentence, actual_text)
189
-
190
- # Transliteration of Pass 1 output
191
- hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"
192
-
193
- # Highlight word-level and character-level differences
194
- diff_html = highlight_differences(intended_sentence, actual_text)
195
- char_html = char_level_highlight(intended_sentence, actual_text)
196
-
197
- # Synthesized TTS audios for intended and Pass 1 text
198
- tts_intended = synthesize_tts(intended_sentence, language_choice)
199
- tts_pass1 = synthesize_tts(actual_text, language_choice)
200
-
201
- return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
202
- diff_html, tts_intended, tts_pass1, char_html, intended_sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  # ---------------- UI ---------------- #
205
- with gr.Blocks() as demo:
206
- gr.Markdown("## ๐ŸŽ™ Pronunciation Comparator + Parler-TTS + Highlights")
207
-
208
- with gr.Row():
209
- lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
210
- gen_btn = gr.Button("๐ŸŽฒ Generate Sentence")
211
-
212
- intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)
213
-
214
- with gr.Row():
215
- audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
216
- pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
217
- pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
218
- pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")
219
-
220
- with gr.Row():
221
- pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
222
- pass2_out = gr.Textbox(label="Pass 2: Target-Biased Output")
223
- hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
224
-
225
- with gr.Row():
226
- wer_out = gr.Textbox(label="Word Error Rate")
227
- cer_out = gr.Textbox(label="Character Error Rate")
228
-
229
- diff_html_box = gr.HTML(label="Word Differences Highlighted")
230
- char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
231
-
232
- with gr.Row():
233
- intended_tts_audio = gr.Audio(label="TTS - Intended Sentence", type="numpy")
234
- pass1_tts_audio = gr.Audio(label="TTS - Pass1 Output", type="numpy")
235
-
236
- gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
237
-
238
- submit_btn = gr.Button("Analyze Pronunciation")
239
-
240
- submit_btn.click(
241
- fn=compare_pronunciation,
242
- inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
243
- outputs=[
244
- pass1_out, pass2_out, hk_out, wer_out, cer_out,
245
- diff_html_box, intended_tts_audio, pass1_tts_audio,
246
- char_html_box, intended_display
247
- ]
248
- )
249
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  if __name__ == "__main__":
251
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import re
5
  import jiwer
6
  import torch
7
+ import torchaudio
8
+ import numpy as np
9
+ from transformers import (
10
+ AutoProcessor,
11
+ AutoModelForSpeechSeq2Seq,
12
+ AutoTokenizer,
13
+ AutoModel
14
+ )
15
+ from TTS.api import TTS
16
+ import librosa
17
+ import soundfile as sf
18
  from indic_transliteration import sanscript
19
  from indic_transliteration.sanscript import transliterate
20
+ import warnings
21
+ warnings.filterwarnings("ignore")
22
 
23
  # ---------------- CONFIG ---------------- #
 
24
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
25
 
26
  LANG_CODES = {
27
  "English": "en",
28
+ "Tamil": "ta",
29
  "Malayalam": "ml",
30
  "Hindi": "hi",
31
  "Sanskrit": "sa"
32
  }
33
 
34
+ # AI4Bharat model configurations
35
+ ASR_MODELS = {
36
+ "English": "openai/whisper-base.en",
37
+ "Tamil": "ai4bharat/whisper-medium-ta",
38
+ "Malayalam": "ai4bharat/whisper-medium-ml",
39
+ "Hindi": "ai4bharat/whisper-medium-hi",
40
+ "Sanskrit": "ai4bharat/whisper-medium-hi" # Fallback to Hindi for Sanskrit
41
+ }
42
+
43
+ TTS_MODELS = {
44
+ "English": "tts_models/en/ljspeech/tacotron2-DDC",
45
+ "Tamil": "tts_models/ta/mai/tacotron2-DDC",
46
+ "Malayalam": "tts_models/ml/mai/tacotron2-DDC",
47
+ "Hindi": "tts_models/hi/mai/tacotron2-DDC",
48
+ "Sanskrit": "tts_models/hi/mai/tacotron2-DDC" # Fallback to Hindi
49
+ }
50
+
51
  LANG_PRIMERS = {
52
+ "English": ("Transcribe in English.",
53
+ "Write only in English. Example: This is an English sentence."),
54
+ "Tamil": ("เฎคเฎฎเฎฟเฎดเฎฟเฎฒเฏ เฎŽเฎดเฏเฎคเฏเฎ•.",
55
+ "เฎคเฎฎเฎฟเฎดเฏ เฎŽเฎดเฏเฎคเฏเฎคเฏเฎ•เฏเฎ•เฎณเฎฟเฎฒเฏ เฎฎเฎŸเฏเฎŸเฏเฎฎเฏ เฎŽเฎดเฏเฎคเฎตเฏเฎฎเฏ. เฎ‰เฎคเฎพเฎฐเฎฃเฎฎเฏ: เฎ‡เฎคเฏ เฎ’เฎฐเฏ เฎคเฎฎเฎฟเฎดเฏ เฎตเฎพเฎ•เฏเฎ•เฎฟเฎฏเฎฎเฏ."),
56
+ "Malayalam": ("เดฎเดฒเดฏเดพเดณเดคเตเดคเดฟเตฝ เดŽเดดเตเดคเตเด•.",
57
+ "เดฎเดฒเดฏเดพเดณ เดฒเดฟเดชเดฟเดฏเดฟเตฝ เดฎเดพเดคเตเดฐเด‚ เดŽเดดเตเดคเตเด•. เด‰เดฆเดพเดนเดฐเดฃเด‚: เด‡เดคเตŠเดฐเต เดฎเดฒเดฏเดพเดณ เดตเดพเด•เตเดฏเดฎเดพเดฃเต."),
58
+ "Hindi": ("เคนเคฟเค‚เคฆเฅ€ เคฎเฅ‡เค‚ เคฒเคฟเค–เฅ‡เค‚เฅค",
59
+ "เค•เฅ‡เคตเคฒ เคฆเฅ‡เคตเคจเคพเค—เคฐเฅ€ เคฒเคฟเคชเคฟ เคฎเฅ‡เค‚ เคฒเคฟเค–เฅ‡เค‚เฅค เค‰เคฆเคพเคนเคฐเคฃ: เคฏเคน เคเค• เคนเคฟเค‚เคฆเฅ€ เคตเคพเค•เฅเคฏ เคนเฅˆเฅค"),
60
+ "Sanskrit": ("เคธเค‚เคธเฅเค•เฅƒเคคเฅ‡ เคฒเคฟเค–เคคเฅค",
61
+ "เคฆเฅ‡เคตเคจเคพเค—เคฐเฅ€ เคฒเคฟเคชเคฟ เคฎเฅ‡เค‚ เคฒเคฟเค–เฅ‡เค‚เฅค เค‰เคฆเคพเคนเคฐเคฃ: เค…เคนเค‚ เคธเค‚เคธเฅเค•เฅƒเคคเค‚ เคœเคพเคจเคพเคฎเคฟเฅค")
62
  }
63
 
64
  SCRIPT_PATTERNS = {
65
  "Tamil": re.compile(r"[เฎ€-เฏฟ]"),
66
+ "Malayalam": re.compile(r"[เด€-เตฟ]"),
67
  "Hindi": re.compile(r"[เค€-เฅฟ]"),
68
  "Sanskrit": re.compile(r"[เค€-เฅฟ]"),
69
  "English": re.compile(r"[A-Za-z]")
 
71
 
72
  SENTENCE_BANK = {
73
  "English": [
74
+ "The sun sets over the beautiful horizon.",
75
+ "Learning new languages opens many doors.",
76
+ "I enjoy reading books in the evening.",
77
+ "Technology has changed our daily lives.",
78
+ "Music brings people together across cultures."
79
  ],
80
  "Tamil": [
81
  "เฎ‡เฎฉเฏเฎฑเฏ เฎจเฎฒเฏเฎฒ เฎตเฎพเฎฉเฎฟเฎฒเฏˆ เฎ‰เฎณเฏเฎณเฎคเฏ.",
82
+ "เฎจเฎพเฎฉเฏ เฎคเฎฎเฎฟเฎดเฏ เฎ•เฎฑเฏเฎฑเฏเฎ•เฏเฎ•เฏŠเฎฃเฏเฎŸเฏ เฎ‡เฎฐเฏเฎ•เฏเฎ•เฎฟเฎฑเฏ‡เฎฉเฏ.",
83
+ "เฎŽเฎฉเฎ•เฏเฎ•เฏ เฎชเฏเฎคเฏเฎคเฎ•เฎฎเฏ เฎชเฎŸเฎฟเฎ•เฏเฎ• เฎตเฎฟเฎฐเฏเฎชเฏเฎชเฎฎเฏ.",
84
+ "เฎคเฎฎเฎฟเฎดเฏ เฎฎเฏŠเฎดเฎฟ เฎฎเฎฟเฎ•เฎตเฏเฎฎเฏ เฎ…เฎดเฎ•เฎพเฎฉเฎคเฏ.",
85
+ "เฎ•เฏเฎŸเฏเฎฎเฏเฎชเฎคเฏเฎคเฏเฎŸเฎฉเฏ เฎจเฏ‡เฎฐเฎฎเฏ เฎšเฏ†เฎฒเฎตเฎฟเฎŸเฏเฎตเฎคเฏ เฎฎเฏเฎ•เฏเฎ•เฎฟเฎฏเฎฎเฏ."
86
  ],
87
  "Malayalam": [
88
  "เดŽเดจเดฟเด•เตเด•เต เดฎเดฒเดฏเดพเดณเด‚ เดตเดณเดฐเต† เด‡เดทเตเดŸเดฎเดพเดฃเต.",
89
  "เด‡เดจเตเดจเต เดฎเดดเดชเต†เดฏเตเดฏเตเดจเตเดจเต.",
90
+ "เดžเดพเตป เดชเตเดธเตเดคเด•เด‚ เดตเดพเดฏเดฟเด•เตเด•เตเดจเตเดจเต.",
91
+ "เด•เต‡เดฐเดณเดคเตเดคเดฟเดจเตเดฑเต† เดชเตเดฐเด•เตƒเดคเดฟ เดธเตเดจเตเดฆเดฐเดฎเดพเดฃเต.",
92
+ "เดตเดฟเดฆเตเดฏเดพเดญเตเดฏเดพเดธเด‚ เดœเต€เดตเดฟเดคเดคเตเดคเดฟเตฝ เดชเตเดฐเดงเดพเดจเดฎเดพเดฃเต."
93
  ],
94
  "Hindi": [
95
+ "เค†เคœ เคฎเฅŒเคธเคฎ เคฌเคนเฅเคค เค…เคšเฅเค›เคพ เคนเฅˆเฅค",
96
+ "เคฎเฅเคเฅ‡ เคนเคฟเค‚เคฆเฅ€ เคฌเฅ‹เคฒเคจเคพ เคชเคธเค‚เคฆ เคนเฅˆเฅค",
97
+ "เคฎเฅˆเค‚ เคฐเฅ‹เคœ เค•เคฟเคคเคพเคฌ เคชเคขเคผเคคเคพ เคนเฅ‚เคเฅค",
98
+ "เคญเคพเคฐเคค เค•เฅ€ เคธเค‚เคธเฅเค•เฅƒเคคเคฟ เคตเคฟเคตเคฟเคงเคคเคพเคชเฅ‚เคฐเฅเคฃ เคนเฅˆเฅค",
99
+ "เคถเคฟเค•เฅเคทเคพ เคนเคฎเคพเคฐเฅ‡ เคญเคตเคฟเคทเฅเคฏ เค•เฅ€ เค•เฅเค‚เคœเฅ€ เคนเฅˆเฅค"
100
  ],
101
  "Sanskrit": [
102
  "เค…เคนเค‚ เค—เฅเคฐเคจเฅเคฅเค‚ เคชเค เคพเคฎเคฟเฅค",
103
  "เค…เคฆเฅเคฏ เคธเฅ‚เคฐเฅเคฏเคƒ เคคเฅ‡เคœเคธเฅเคตเฅ€ เค…เคธเฅเคคเคฟเฅค",
104
+ "เคฎเคฎ เคจเคพเคฎ เคฐเคพเคฎเคƒเฅค",
105
+ "เคตเคฟเคฆเฅเคฏเคพ เคธเคฐเฅเคตเคคเฅเคฐ เคชเฅ‚เคœเฅเคฏเคคเฅ‡เฅค",
106
+ "เคธเคคเฅเคฏเคฎเฅ‡เคต เคœเคฏเคคเฅ‡เฅค"
107
  ]
108
  }
109
 
110
+ # ---------------- MODEL CACHE ---------------- #
111
+ asr_models = {}
112
+ tts_models = {}
113
+
114
def load_asr_model(language):
    """Load (and cache) the ASR processor/model pair for *language*.

    Models are cached in the module-level ``asr_models`` dict so each one is
    downloaded/initialized only once.  When a language-specific model cannot
    be loaded, the English model is used as a fallback.

    Returns:
        dict with keys ``"processor"`` and ``"model"``.

    Raises:
        RuntimeError: if not even the English fallback model can be loaded.
            (The old code fell through to ``asr_models["English"]`` and died
            with an unrelated ``KeyError`` in that case.)
    """
    if language not in asr_models:
        try:
            model_name = ASR_MODELS[language]
            print(f"Loading ASR model for {language}: {model_name}")

            processor = AutoProcessor.from_pretrained(model_name)
            model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(DEVICE)

            asr_models[language] = {"processor": processor, "model": model}
            print(f"✅ ASR model loaded for {language}")
        except Exception as e:
            print(f"❌ Failed to load ASR for {language}: {e}")
            # Fallback to English model
            if language != "English":
                print(f"🔄 Falling back to English ASR for {language}")
                asr_models[language] = load_asr_model("English")
            else:
                # No fallback left: fail with a clear error instead of the
                # KeyError the missing cache entry would otherwise produce.
                raise RuntimeError("Could not load any ASR model") from e

    return asr_models[language]
135
+
136
def load_tts_model(language):
    """Load (and cache) the Coqui TTS model for *language*.

    Cached in the module-level ``tts_models`` dict.  Falls back to the
    English model when the language-specific model fails to load.

    Raises:
        RuntimeError: if not even the English fallback model can be loaded.
            (The old code fell through to ``tts_models["English"]`` and died
            with an unrelated ``KeyError`` in that case.)
    """
    if language not in tts_models:
        try:
            model_name = TTS_MODELS[language]
            print(f"Loading TTS model for {language}: {model_name}")

            tts = TTS(model_name=model_name).to(DEVICE)
            tts_models[language] = tts
            print(f"✅ TTS model loaded for {language}")
        except Exception as e:
            print(f"❌ Failed to load TTS for {language}: {e}")
            # Fallback to English
            if language != "English":
                print(f"🔄 Falling back to English TTS for {language}")
                tts_models[language] = load_tts_model("English")
            else:
                # No fallback left: raise a clear error rather than KeyError.
                raise RuntimeError("Could not load any TTS model") from e

    return tts_models[language]
155
 
156
  # ---------------- HELPERS ---------------- #
157
def get_random_sentence(language_choice):
    """Pick one practice sentence at random for the chosen language."""
    candidates = SENTENCE_BANK[language_choice]
    return random.choice(candidates)
160
 
161
def is_script(text, lang_name):
    """Return True when *text* contains at least one character of the
    script expected for *lang_name*.

    Languages without a registered pattern are accepted unconditionally.
    """
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if pattern is None:
        return True
    return pattern.search(text) is not None
165
 
166
def transliterate_to_hk(text, lang_choice):
    """Transliterate Indic-script *text* to Harvard-Kyoto romanization.

    English (or any language whose script check fails) is returned
    unchanged.  Transliteration errors are swallowed and the original text
    returned, matching the previous best-effort behavior — but the old bare
    ``except:`` also caught KeyboardInterrupt/SystemExit; narrowed here to
    ``except Exception``.
    """
    mapping = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None
    }

    script = mapping.get(lang_choice)
    if script and is_script(text, lang_choice):
        try:
            return transliterate(text, script, sanscript.HK)
        except Exception:
            # Best-effort: fall back to the untransliterated text.
            return text
    return text
183
+
184
def preprocess_audio(audio_path, target_sr=16000):
    """Load, peak-normalize and silence-trim an audio file for ASR.

    Args:
        audio_path: path to the recorded audio file.
        target_sr: sample rate to resample to (Whisper expects 16 kHz).

    Returns:
        (audio, target_sr) as a float numpy array and int, or (None, None)
        on any failure.
    """
    try:
        # Load audio (resampled to target_sr, mono)
        audio, sr = librosa.load(audio_path, sr=target_sr)

        # Normalize to peak amplitude 1.0.  Guard against empty or silent
        # input: the old unconditional division produced NaNs (0/0) there.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak

        # Remove leading/trailing silence
        audio, _ = librosa.effects.trim(audio, top_db=20)

        return audio, target_sr
    except Exception as e:
        print(f"Audio preprocessing error: {e}")
        return None, None
200
+
201
def transcribe_with_ai4bharat(audio_path, language, initial_prompt=""):
    """Transcribe *audio_path* with the per-language speech model.

    NOTE(review): *initial_prompt* is accepted for API compatibility with
    the callers but is not currently passed to the model — confirm whether
    prompt biasing is actually intended here.

    Returns:
        The transcription string, or an ``"Error: ..."`` message on failure.
    """
    try:
        # Fetch cached processor/model for this language
        components = load_asr_model(language)
        processor = components["processor"]
        model = components["model"]

        # Normalize/trim the recording first
        waveform, sample_rate = preprocess_audio(audio_path)
        if waveform is None:
            return "Error: Could not process audio"

        # Build model inputs and move them onto the compute device
        features = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
        features = {name: tensor.to(DEVICE) for name, tensor in features.items()}

        # Greedy generation; no gradients needed for inference
        with torch.no_grad():
            generated_ids = model.generate(**features, max_length=200)

        # Turn token ids back into text
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return text.strip()

    except Exception as e:
        print(f"Transcription error for {language}: {e}")
        return f"Error: Transcription failed - {str(e)}"
230
+
231
def synthesize_with_ai4bharat(text, language):
    """Synthesize *text* with the per-language TTS model.

    Returns:
        (sample_rate, audio) suitable for ``gr.Audio(type="numpy")``, or
        None for empty input or on any failure.
    """
    if not text.strip():
        return None

    try:
        # Load TTS model
        tts = load_tts_model(language)

        # The old code wrote to a hard-coded "/tmp/..." path (non-portable,
        # e.g. absent on Windows) and never deleted it, leaking one WAV per
        # call.  Use a real temp file and remove it when done.
        import os
        import tempfile

        fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            tts.tts_to_file(text=text, file_path=audio_path)
            # Load generated audio at the model's nominal 22.05 kHz rate
            audio, sr = librosa.load(audio_path, sr=22050)
        finally:
            try:
                os.remove(audio_path)
            except OSError:
                pass  # best-effort cleanup

        return sr, audio

    except Exception as e:
        print(f"TTS error for {language}: {e}")
        return None
252
 
253
def highlight_differences(ref, hyp):
    """Render a word-level diff of *hyp* against *ref* as HTML.

    Matching words come out green, reference words that were replaced or
    dropped are struck through in red, and extra/substituted hypothesis
    words are orange.
    """
    ref_tokens = ref.strip().split()
    hyp_tokens = hyp.strip().split()

    matcher = difflib.SequenceMatcher(None, ref_tokens, hyp_tokens)
    pieces = []

    for op, r1, r2, h1, h2 in matcher.get_opcodes():
        if op == 'equal':
            for w in ref_tokens[r1:r2]:
                pieces.append(f"<span style='color:green; font-weight:bold'>{w}</span>")
        elif op == 'replace':
            for w in ref_tokens[r1:r2]:
                pieces.append(f"<span style='color:red; text-decoration:line-through'>{w}</span>")
            for w in hyp_tokens[h1:h2]:
                pieces.append(f"<span style='color:orange; font-weight:bold'> → {w}</span>")
        elif op == 'delete':
            for w in ref_tokens[r1:r2]:
                pieces.append(f"<span style='color:red; text-decoration:line-through'>{w}</span>")
        elif op == 'insert':
            for w in hyp_tokens[h1:h2]:
                pieces.append(f"<span style='color:orange; font-weight:bold'>+{w}</span>")

    return " ".join(pieces)
273
 
274
def char_level_highlight(ref, hyp):
    """Render a character-level diff of *hyp* against *ref* as HTML.

    Reference characters that match are green; those replaced or missing
    are red with underline; characters inserted by the speaker are orange
    on a yellow background.
    """
    matcher = difflib.SequenceMatcher(None, list(ref), list(hyp))
    spans = []

    for op, r1, r2, h1, h2 in matcher.get_opcodes():
        if op == 'equal':
            spans.extend(
                f"<span style='color:green'>{c}</span>" for c in ref[r1:r2]
            )
        elif op in ('replace', 'delete'):
            spans.extend(
                f"<span style='color:red; text-decoration:underline; font-weight:bold'>{c}</span>"
                for c in ref[r1:r2]
            )
        elif op == 'insert':
            spans.extend(
                f"<span style='color:orange; background-color:yellow'>{c}</span>"
                for c in hyp[h1:h2]
            )

    return "".join(spans)
288
 
289
+ # ---------------- MAIN FUNCTION ---------------- #
290
# ---------------- MAIN FUNCTION ---------------- #
def compare_pronunciation(audio, language_choice, intended_sentence):
    """Run the full pronunciation-analysis pipeline.

    Args:
        audio: filepath of the user's recording (or None).
        language_choice: key into LANG_PRIMERS / SENTENCE_BANK.
        intended_sentence: the sentence the user was asked to read.

    Returns:
        An 11-tuple matching the Gradio outputs:
        (status, pass-1 transcription, pass-2 transcription,
         HK transliteration, WER, CER, word-diff HTML,
         reference TTS audio, user-text TTS audio, char-diff HTML,
         intended sentence).

    BUGFIX: the early/no-input and exception returns used to be 10-tuples
    while the click handler declares 11 outputs, which made Gradio error
    out on exactly those paths; they now return 11 values.
    """
    if audio is None or not intended_sentence.strip():
        return ("❌ No audio or intended sentence provided.", "", "", "", "", "",
                "", None, None, "", "")

    try:
        print(f"Processing audio for {language_choice}")

        # Pass 1: Raw transcription
        primer_weak, _ = LANG_PRIMERS[language_choice]
        actual_text = transcribe_with_ai4bharat(audio, language_choice, primer_weak)

        # Pass 2: Target-biased transcription
        _, primer_strong = LANG_PRIMERS[language_choice]
        strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
        corrected_text = transcribe_with_ai4bharat(audio, language_choice, strict_prompt)

        # Error metrics.  jiwer can raise on degenerate input; treat that as
        # a total mismatch.  (Narrowed from the old bare ``except:``.)
        try:
            wer_val = jiwer.wer(intended_sentence, actual_text)
            cer_val = jiwer.cer(intended_sentence, actual_text)
        except Exception:
            wer_val, cer_val = 1.0, 1.0

        # Transliteration (flag transcripts that came back in the wrong script)
        hk_translit = transliterate_to_hk(actual_text, language_choice)
        if not is_script(actual_text, language_choice):
            hk_translit = f"⚠️ Script mismatch: expected {language_choice} script"

        # Visual feedback
        diff_html = highlight_differences(intended_sentence, actual_text)
        char_html = char_level_highlight(intended_sentence, actual_text)

        # TTS synthesis
        tts_intended = synthesize_with_ai4bharat(intended_sentence, language_choice)
        tts_actual = synthesize_with_ai4bharat(actual_text, language_choice)

        # Status message graded by word error rate
        status = f"✅ Analysis complete for {language_choice}"
        if wer_val < 0.1:
            status += " - Excellent pronunciation! 🎉"
        elif wer_val < 0.3:
            status += " - Good pronunciation! 👍"
        elif wer_val < 0.5:
            status += " - Needs improvement 📚"
        else:
            status += " - Keep practicing! 💪"

        return (
            status,
            actual_text,
            corrected_text,
            hk_translit,
            f"{wer_val:.3f}",
            f"{cer_val:.3f}",
            diff_html,
            tts_intended,
            tts_actual,
            char_html,
            intended_sentence
        )

    except Exception as e:
        error_msg = f"❌ Error during analysis: {str(e)}"
        print(error_msg)
        return (error_msg, "", "", "", "", "", "", None, None, "", "")
357
 
358
  # ---------------- UI ---------------- #
359
def create_interface():
    """Build and return the Gradio Blocks UI for the pronunciation trainer."""
    with gr.Blocks(title="🎙️ AI4Bharat Pronunciation Trainer", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎙️ AI4Bharat Pronunciation Trainer

        Practice pronunciation in **Tamil, Malayalam, Hindi, Sanskrit & English** using state-of-the-art AI4Bharat models!

        📋 **How to use:**
        1. Select your target language
        2. Generate a practice sentence
        3. Record yourself reading it aloud
        4. Get detailed feedback with error analysis
        """)

        # Language picker + sentence generator side by side
        with gr.Row():
            with gr.Column(scale=2):
                lang_choice = gr.Dropdown(
                    choices=list(LANG_CODES.keys()),
                    value="Tamil",
                    label="🌍 Select Language"
                )
            with gr.Column(scale=1):
                gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

        # Read-only display of the sentence the user should read aloud
        intended_display = gr.Textbox(
            label="📝 Practice Sentence (Read this aloud)",
            placeholder="Click 'Generate Practice Sentence' to get started...",
            interactive=False,
            lines=2
        )

        with gr.Row():
            # Recording passed to compare_pronunciation as a filepath
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎤 Record Your Pronunciation"
            )

        analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary", size="lg")

        status_output = gr.Textbox(label="📊 Analysis Status", interactive=False)

        # Transcriptions and error metrics, two columns
        with gr.Row():
            with gr.Column():
                pass1_out = gr.Textbox(label="🎯 What You Actually Said", interactive=False)
                wer_out = gr.Textbox(label="📈 Word Error Rate (lower = better)", interactive=False)

            with gr.Column():
                pass2_out = gr.Textbox(label="🔧 Target-Biased Output", interactive=False)
                cer_out = gr.Textbox(label="📊 Character Error Rate (lower = better)", interactive=False)

        hk_out = gr.Textbox(label="🔤 Romanization (Harvard-Kyoto)", interactive=False)

        # HTML diff views produced by highlight_differences / char_level_highlight
        with gr.Accordion("📝 Detailed Feedback", open=True):
            diff_html_box = gr.HTML(label="🔍 Word-Level Differences")
            char_html_box = gr.HTML(label="🔤 Character-Level Analysis")

        # TTS playback: reference sentence vs. what the ASR heard
        with gr.Row():
            intended_tts_audio = gr.Audio(label="🔊 Reference Pronunciation", type="numpy")
            actual_tts_audio = gr.Audio(label="🔊 Your Pronunciation (TTS)", type="numpy")

        gr.Markdown("""
        ### 🎨 Color Guide:
        - 🟢 **Green**: Correctly pronounced
        - 🔴 **Red**: Missing or incorrect words
        - 🟠 **Orange**: Extra or substituted words
        - 🟡 **Yellow background**: Inserted characters
        """)

        # Event handlers
        gen_btn.click(
            fn=get_random_sentence,
            inputs=[lang_choice],
            outputs=[intended_display]
        )

        # Order of outputs must match the tuple compare_pronunciation returns
        analyze_btn.click(
            fn=compare_pronunciation,
            inputs=[audio_input, lang_choice, intended_display],
            outputs=[
                status_output, pass1_out, pass2_out, hk_out,
                wer_out, cer_out, diff_html_box,
                intended_tts_audio, actual_tts_audio,
                char_html_box, intended_display
            ]
        )

        # Auto-generate sentence on language change
        lang_choice.change(
            fn=get_random_sentence,
            inputs=[lang_choice],
            outputs=[intended_display]
        )

    return demo
454
+
455
+ # ---------------- LAUNCH ---------------- #
456
# ---------------- LAUNCH ---------------- #
if __name__ == "__main__":
    print("🚀 Starting AI4Bharat Pronunciation Trainer...")

    # Pre-load English models for faster startup; other languages are
    # loaded lazily on first use.  Failure here is non-fatal — the app
    # still starts and retries on demand.
    print("📦 Pre-loading English models...")
    try:
        load_asr_model("English")
        load_tts_model("English")
        print("✅ English models loaded successfully")
    except Exception as e:
        print(f"⚠️ Warning: Could not pre-load English models: {e}")

    demo = create_interface()
    # share=True creates a public Gradio link; 0.0.0.0/7860 is the
    # standard Hugging Face Spaces binding.
    demo.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )