Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Sleeping

App Files Files Community

sudhanm committed 7 days ago

Commit

eecaaa5

verified ·

1 Parent(s): e8f391d

Update app.py

Browse files

Files changed (1) hide show

app.py +427 -99

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ import torch
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -45,35 +47,90 @@ SENTENCE_BANK = {
         "Learning languages is fun.",
         "I like to drink coffee in the morning.",
         "Technology helps us communicate better.",
-        "Reading books expands our knowledge."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
-        "நான் தினமும் பள்ளிக்கு செல்கிறேன்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
-        "ഞാൻ മലയാളം പഠിക്കുന്നു."
     ]
 }
-# ---------------- LOAD MODELS ---------------- #
-print("Loading Whisper models...")
-whisper_models = {}
-whisper_processors = {}
-for lang, model_id in MODEL_CONFIGS.items():
-    print(f"Loading {lang} model: {model_id}")
-    whisper_models[lang] = WhisperForConditionalGeneration.from_pretrained(model_id).to(DEVICE)
-    whisper_processors[lang] = WhisperProcessor.from_pretrained(model_id)
-print("All models loaded successfully!")
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
@@ -84,17 +141,55 @@ def is_script(text, lang_name):
     return bool(pattern.search(text)) if pattern else True
 def transliterate_to_hk(text, lang_choice):
     mapping = {
         "Tamil": sanscript.TAMIL,
         "Malayalam": sanscript.MALAYALAM,
         "English": None
     }
-    return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
-def transcribe_once(audio_path, language_choice, initial_prompt, beam_size, temperature, condition_on_previous_text):
     # Get the appropriate model and processor for the language
-    model = whisper_models[language_choice]
-    processor = whisper_processors[language_choice]
     lang_code = LANG_CODES[language_choice]
     # Load and process audio
@@ -126,110 +221,343 @@ def transcribe_once(audio_path, language_choice, initial_prompt, beam_size, temp
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription.strip()
-def highlight_differences(ref, hyp):
-    ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
-    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
-    out_html = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out_html.extend([f"<span style='color:green'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'replace':
-            out_html.extend([f"<span style='color:red'>{w}</span>" for w in ref_words[i1:i2]])
-            out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
         elif tag == 'delete':
-            out_html.extend([f"<span style='color:red;text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
-            out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
-    return " ".join(out_html)
-def char_level_highlight(ref, hyp):
-    sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
-    out = []
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == 'equal':
-            out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
-        elif tag in ('replace', 'delete'):
-            out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
-        elif tag == 'insert':
-            out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
-    return "".join(out)
 # ---------------- MAIN ---------------- #
-def compare_pronunciation(audio, language_choice, intended_sentence,
-                          pass1_beam, pass1_temp, pass1_condition):
     if audio is None or not intended_sentence.strip():
-        return ("No audio or intended sentence.", "", "", "", "", "", "", "")
-    primer_weak, primer_strong = LANG_PRIMERS[language_choice]
-    # Pass 1: raw transcription with user-configured decoding parameters
-    actual_text = transcribe_once(audio, language_choice, primer_weak,
-                                  pass1_beam, pass1_temp, pass1_condition)
-    # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
-    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
-    corrected_text = transcribe_once(audio, language_choice, strict_prompt,
-                                     beam_size=5, temperature=0.0, condition_on_previous_text=False)
-    # Compute WER and CER
-    wer_val = jiwer.wer(intended_sentence, actual_text)
-    cer_val = jiwer.cer(intended_sentence, actual_text)
-    # Transliteration of Pass 1 output
-    hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"
-    # Highlight word-level and character-level differences
-    diff_html = highlight_differences(intended_sentence, actual_text)
-    char_html = char_level_highlight(intended_sentence, actual_text)
-    return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
-            diff_html, char_html, intended_sentence)
 # ---------------- UI ---------------- #
-with gr.Blocks(title="Pronunciation Comparator") as demo:
-    gr.Markdown("## 🎙 Pronunciation Comparator - English, Tamil & Malayalam")
-    gr.Markdown("Practice pronunciation with specialized Whisper models for each language!")
     with gr.Row():
-        lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
-        gen_btn = gr.Button("🎲 Generate Sentence")
-    intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)
     with gr.Row():
-        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record your pronunciation")
-    with gr.Column():
-        gr.Markdown("### Transcription Parameters")
-        pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
-        pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
-        pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")
-    submit_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary")
     with gr.Row():
-        pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
-        pass2_out = gr.Textbox(label="Pass 2: Target-Biased Output")
-    with gr.Row():
-        hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
-        wer_out = gr.Textbox(label="Word Error Rate")
-        cer_out = gr.Textbox(label="Character Error Rate")
-    gr.Markdown("### Visual Feedback")
-    diff_html_box = gr.HTML(label="Word Differences Highlighted")
-    char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
     # Event handlers
-    gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
-    submit_btn.click(
         fn=compare_pronunciation,
-        inputs=[audio_input, language_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
-        outputs=[
-            pass1_out, pass2_out, hk_translit, wer_out, cer_out,
-            diff_html_box, char_html_box, intended_display
-        ]
     )
 if __name__ == "__main__":

 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
+import spaces
+import gc
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
         "Learning languages is fun.",
         "I like to drink coffee in the morning.",
         "Technology helps us communicate better.",
+        "Reading books expands our knowledge.",
+        "Music brings people together.",
+        "Exercise keeps us healthy and strong.",
+        "Cooking is both art and science."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
+        "நான் தினமும் பள்ளிக்கு செல்கிறேன்.",
+        "எனக்கு இசை கேட்க மிகவும் பிடிக்கும்.",
+        "அன்னை தமிழ் எங்கள் தாய்மொழி.",
+        "நல்ல உணவு உடல் நலத்திற்கு அவசியம்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
+        "ഞാൻ മലയാളം പഠിക്കുന്നു.",
+        "സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്.",
+        "നല്ല ആരോഗ്യം വളരെ പ്രധാനമാണ്.",
+        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ അത്യാവശ്യമാണ്."
     ]
 }
+# ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
+# Store only currently loaded model to save memory
+current_model = {"language": None, "model": None, "processor": None}
+def load_model_for_language(language_choice):
+    """Return the ``(model, processor)`` pair for ``language_choice``, loading it on demand.
+
+    Only one model is cached in ``current_model`` at a time. When a different
+    language is requested, the cached model is released before the new one is
+    loaded. If loading ``MODEL_CONFIGS[language_choice]`` fails, the function
+    falls back to ``openai/whisper-base`` and caches that under the requested
+    language instead.
+
+    Args:
+        language_choice: Key into ``MODEL_CONFIGS``.
+
+    Returns:
+        Tuple ``(model, processor)``.
+
+    Raises:
+        KeyError: If ``language_choice`` is not a key of ``MODEL_CONFIGS``.
+        Exception: Whatever the fallback load raises if it fails as well.
+    """
+    global current_model
+    # If same language is already loaded, return current model
+    if current_model["language"] == language_choice and current_model["model"] is not None:
+        return current_model["model"], current_model["processor"]
+    # Clear previous model from memory
+    if current_model["model"] is not None:
+        # Rebind to an all-None cache instead of deleting keys: if both loads
+        # below raise, the cache must still have every key so the next call
+        # does not die with KeyError on the lookups above.
+        current_model = {"language": None, "model": None, "processor": None}
+        gc.collect()
+        if DEVICE == "cuda":
+            torch.cuda.empty_cache()
+    # Load new model
+    model_id = MODEL_CONFIGS[language_choice]
+    print(f"Loading {language_choice} model: {model_id}")
+    try:
+        model = WhisperForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float32
+        ).to(DEVICE)
+        processor = WhisperProcessor.from_pretrained(model_id)
+        print(f"✓ {language_choice} model loaded successfully")
+    except Exception as e:
+        # Deliberate best-effort: any load failure switches to the base model
+        print(f"✗ Error loading {language_choice} model: {e}")
+        # Fallback to base whisper model
+        print(f"Falling back to openai/whisper-base for {language_choice}")
+        model = WhisperForConditionalGeneration.from_pretrained(
+            "openai/whisper-base",
+            torch_dtype=torch.float32
+        ).to(DEVICE)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+    # Cache whichever model was loaded under the requested language
+    current_model = {
+        "language": language_choice,
+        "model": model,
+        "processor": processor
+    }
+    return model, processor
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return bool(pattern.search(text)) if pattern else True
 def transliterate_to_hk(text, lang_choice):
+    """Transliterate ``text`` into Harvard-Kyoto for the given language.
+
+    Blank input yields ``""``. English text is returned unchanged. If the
+    transliteration raises or produces an empty result, the original text is
+    returned instead.
+    """
+    if not (text and text.strip()):
+        return ""
     mapping = {
         "Tamil": sanscript.TAMIL,
         "Malayalam": sanscript.MALAYALAM,
         "English": None
     }
+    source_scheme = mapping[lang_choice]
+    if source_scheme is None:
+        # English maps to no source scheme: hand the text back untouched
+        return text
+    try:
+        converted = transliterate(text.strip(), source_scheme, sanscript.HK)
+    except Exception as e:
+        print(f"Transliteration error: {e}")
+        return text
+    # An empty transliteration also falls back to the original text
+    return converted or text
+def transliterate_to_roman(text, lang_choice):
+    """Transliterate ``text`` into IAST Roman script for the given language.
+
+    Blank input yields ``""``. English text is returned unchanged. If the
+    transliteration raises or produces an empty result, the original text is
+    returned instead.
+    """
+    if not (text and text.strip()):
+        return ""
+    source_scheme = {
+        "Tamil": sanscript.TAMIL,
+        "Malayalam": sanscript.MALAYALAM,
+        "English": None
+    }[lang_choice]
+    if source_scheme is None:
+        # English maps to no source scheme: hand the text back untouched
+        return text
+    try:
+        # Strip surrounding whitespace, then convert to IAST
+        converted = transliterate(text.strip(), source_scheme, sanscript.IAST)
+    except Exception as e:
+        print(f"Transliteration error: {e}")
+        return text
+    # An empty transliteration also falls back to the original text
+    return converted or text
+@spaces.GPU
+def transcribe_once(audio_path, language_choice, beam_size, temperature):
     # Get the appropriate model and processor for the language
+    model, processor = load_model_for_language(language_choice)
     lang_code = LANG_CODES[language_choice]
     # Load and process audio
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription.strip()
+def create_tabular_feedback(intended, actual, lang_choice):
+    """Build the HTML pronunciation report comparing ``intended`` with ``actual``.
+
+    The report contains a text-comparison table (original script and IAST),
+    a word-by-word table driven by ``difflib.SequenceMatcher`` opcodes, a
+    summary with word accuracy, and (for Tamil/Malayalam) a Harvard-Kyoto
+    reference section. All text taken from the arguments is HTML-escaped
+    before being placed into the markup.
+
+    Args:
+        intended: Target sentence.
+        actual: Transcribed sentence.
+        lang_choice: Language name passed to the transliteration helpers.
+
+    Returns:
+        Tuple ``(feedback_html, accuracy)`` where ``accuracy`` is the
+        percentage of target words matched exactly (0 for an empty target).
+    """
+    # Function-scope import keeps this block self-contained
+    import html
+    esc = html.escape
+    def word_at(words, index, stop):
+        """Return ``words[index]`` if it lies inside both the opcode range and the list, else ""."""
+        return words[index] if index < min(stop, len(words)) else ""
+    # Get transliterations
+    intended_roman = transliterate_to_roman(intended, lang_choice)
+    actual_roman = transliterate_to_roman(actual, lang_choice)
+    intended_hk = esc(transliterate_to_hk(intended, lang_choice))
+    actual_hk = esc(transliterate_to_hk(actual, lang_choice))
+    # Split into words for comparison
+    intended_words = intended.strip().split()
+    actual_words = actual.strip().split()
+    intended_roman_words = intended_roman.strip().split()
+    actual_roman_words = actual_roman.strip().split()
+    # Calculate accuracy
+    correct_words = 0
+    total_words = len(intended_words)
+    # Create word-by-word comparison table
+    feedback_html = """
+    <div style='font-family: Arial, sans-serif; padding: 20px; background: #f8f9fa; border-radius: 12px; margin: 10px 0;'>
+        <h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Pronunciation Analysis</h3>
+    """
+    # Overview table
+    feedback_html += """
+    <div style='margin-bottom: 25px;'>
+        <h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison</h4>
+        <table style='width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
+            <thead>
+                <tr style='background: #3498db; color: white;'>
+                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Type</th>
+                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Original Script</th>
+                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Roman/IAST</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr style='background: #e8f5e8;'>
+                    <td style='padding: 12px; font-weight: bold; color: #27ae60;'>🎯 Target</td>
+                    <td style='padding: 12px; font-family: monospace;'>{}</td>
+                    <td style='padding: 12px; font-family: monospace; font-style: italic;'>{}</td>
+                </tr>
+                <tr style='background: #fff3e0;'>
+                    <td style='padding: 12px; font-weight: bold; color: #e67e22;'>🗣️ You Said</td>
+                    <td style='padding: 12px; font-family: monospace;'>{}</td>
+                    <td style='padding: 12px; font-family: monospace; font-style: italic;'>{}</td>
+                </tr>
+            </tbody>
+        </table>
+    </div>
+    """.format(esc(intended), esc(intended_roman), esc(actual), esc(actual_roman))
+    # Word-by-word analysis
+    feedback_html += """
+    <div style='margin-bottom: 25px;'>
+        <h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Word-by-Word Analysis</h4>
+        <table style='width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
+            <thead>
+                <tr style='background: #9b59b6; color: white;'>
+                    <th style='padding: 12px; text-align: center; font-weight: bold;'>#</th>
+                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Expected</th>
+                    <th style='padding: 12px; text-align: left; font-weight: bold;'>You Said</th>
+                    <th style='padding: 12px; text-align: center; font-weight: bold;'>Status</th>
+                </tr>
+            </thead>
+            <tbody>
+    """
+    # Compare words using difflib
+    sm = difflib.SequenceMatcher(None, intended_words, actual_words)
+    word_index = 0
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
+            # Correct words
+            for idx, word in enumerate(intended_words[i1:i2]):
+                word_index += 1
+                correct_words += 1
+                roman_word = esc(word_at(intended_roman_words, i1 + idx, i2))
+                actual_word = esc(word_at(actual_words, j1 + idx, j2))
+                actual_roman_word = esc(word_at(actual_roman_words, j1 + idx, j2))
+                word = esc(word)
+                feedback_html += f"""
+                <tr style='background: #d4f6d4;'>
+                    <td style='padding: 10px; text-align: center; font-weight: bold;'>{word_index}</td>
+                    <td style='padding: 10px;'>
+                        <div style='font-family: monospace; font-size: 16px;'>{word}</div>
+                        <div style='font-size: 12px; color: #666; font-style: italic;'>{roman_word}</div>
+                    </td>
+                    <td style='padding: 10px;'>
+                        <div style='font-family: monospace; font-size: 16px; color: #27ae60;'>{actual_word}</div>
+                        <div style='font-size: 12px; color: #666; font-style: italic;'>{actual_roman_word}</div>
+                    </td>
+                    <td style='padding: 10px; text-align: center;'>
+                        <span style='background: #27ae60; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px;'>✓ Correct</span>
+                    </td>
+                </tr>
+                """
         elif tag == 'replace':
+            # Incorrect words. The two sides of a replacement may differ in
+            # length, so every lookup (script AND Roman) is bounded by its own
+            # opcode range; otherwise the shorter side would borrow a Roman
+            # word belonging to the next opcode.
+            max_words = max(i2-i1, j2-j1)
+            for idx in range(max_words):
+                word_index += 1
+                expected_word = esc(word_at(intended_words, i1 + idx, i2))
+                expected_roman = esc(word_at(intended_roman_words, i1 + idx, i2))
+                actual_word = esc(word_at(actual_words, j1 + idx, j2))
+                actual_roman_word = esc(word_at(actual_roman_words, j1 + idx, j2))
+                feedback_html += f"""
+                <tr style='background: #ffebee;'>
+                    <td style='padding: 10px; text-align: center; font-weight: bold;'>{word_index}</td>
+                    <td style='padding: 10px;'>
+                        <div style='font-family: monospace; font-size: 16px;'>{expected_word}</div>
+                        <div style='font-size: 12px; color: #666; font-style: italic;'>{expected_roman}</div>
+                    </td>
+                    <td style='padding: 10px;'>
+                        <div style='font-family: monospace; font-size: 16px; color: #e74c3c;'>{actual_word}</div>
+                        <div style='font-size: 12px; color: #666; font-style: italic;'>{actual_roman_word}</div>
+                    </td>
+                    <td style='padding: 10px; text-align: center;'>
+                        <span style='background: #e74c3c; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px;'>✗ Different</span>
+                    </td>
+                </tr>
+                """
         elif tag == 'delete':
+            # Missing words
+            for idx, word in enumerate(intended_words[i1:i2]):
+                word_index += 1
+                roman_word = esc(word_at(intended_roman_words, i1 + idx, i2))
+                word = esc(word)
+                feedback_html += f"""
+                <tr style='background: #ffeaa7;'>
+                    <td style='padding: 10px; text-align: center; font-weight: bold;'>{word_index}</td>
+                    <td style='padding: 10px;'>
+                        <div style='font-family: monospace; font-size: 16px;'>{word}</div>
+                        <div style='font-size: 12px; color: #666; font-style: italic;'>{roman_word}</div>
+                    </td>
+                    <td style='padding: 10px; color: #e17055; font-style: italic;'>
+                        <em>Not spoken</em>
+                    </td>
+                    <td style='padding: 10px; text-align: center;'>
+                        <span style='background: #fdcb6e; color: #2d3436; padding: 4px 8px; border-radius: 12px; font-size: 12px;'>⚠ Missing</span>
+                    </td>
+                </tr>
+                """
         elif tag == 'insert':
+            # Extra words
+            for idx, word in enumerate(actual_words[j1:j2]):
+                actual_roman_word = esc(word_at(actual_roman_words, j1 + idx, j2))
+                word = esc(word)
+                feedback_html += f"""
+                <tr style='background: #fab1a0;'>
+                    <td style='padding: 10px; text-align: center; font-weight: bold;'>+</td>
+                    <td style='padding: 10px; color: #636e72; font-style: italic;'>
+                        <em>Not expected</em>
+                    </td>
+                    <td style='padding: 10px;'>
+                        <div style='font-family: monospace; font-size: 16px; color: #e17055;'>{word}</div>
+                        <div style='font-size: 12px; color: #666; font-style: italic;'>{actual_roman_word}</div>
+                    </td>
+                    <td style='padding: 10px; text-align: center;'>
+                        <span style='background: #fd79a8; color: white; padding: 4px 8px; border-radius: 12px; font-size: 12px;'>+ Extra</span>
+                    </td>
+                </tr>
+                """
+    feedback_html += """
+            </tbody>
+        </table>
+    </div>
+    """
+    # Calculate accuracy
+    accuracy = (correct_words / total_words * 100) if total_words > 0 else 0
+    # Summary section
+    feedback_html += f"""
+    <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 12px; text-align: center;'>
+        <h4 style='margin: 0 0 15px 0; font-size: 24px;'>🎯 Performance Summary</h4>
+        <div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 15px;'>
+            <div style='background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; min-width: 150px;'>
+                <div style='font-size: 32px; font-weight: bold;'>{accuracy:.1f}%</div>
+                <div style='font-size: 14px; opacity: 0.9;'>Word Accuracy</div>
+            </div>
+            <div style='background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; min-width: 150px;'>
+                <div style='font-size: 32px; font-weight: bold;'>{correct_words}/{total_words}</div>
+                <div style='font-size: 14px; opacity: 0.9;'>Correct Words</div>
+            </div>
+        </div>
+        <div style='margin-top: 15px; font-size: 18px;'>
+    """
+    # Motivational message
+    if accuracy >= 95:
+        feedback_html += "<span>🎉 Outstanding! Perfect pronunciation!</span>"
+    elif accuracy >= 85:
+        feedback_html += "<span>🌟 Excellent work! Very clear pronunciation!</span>"
+    elif accuracy >= 70:
+        feedback_html += "<span>👍 Good job! Keep practicing those tricky words!</span>"
+    elif accuracy >= 50:
+        feedback_html += "<span>📚 Making progress! Focus on the highlighted words!</span>"
+    else:
+        feedback_html += "<span>💪 Keep going! Practice makes perfect!</span>"
+    feedback_html += """
+        </div>
+    </div>
+    """
+    # Add HK transliteration section for reference
+    if lang_choice in ["Tamil", "Malayalam"]:
+        feedback_html += f"""
+        <div style='margin-top: 20px; padding: 15px; background: #ecf0f1; border-radius: 8px;'>
+            <h4 style='color: #2c3e50; margin-bottom: 10px;'>🔤 Harvard-Kyoto Transliteration (for reference)</h4>
+            <div style='display: grid; grid-template-columns: 1fr 1fr; gap: 15px;'>
+                <div>
+                    <strong>Expected:</strong><br>
+                    <span style='font-family: monospace; background: white; padding: 8px; border-radius: 4px; display: block; margin-top: 5px;'>{intended_hk}</span>
+                </div>
+                <div>
+                    <strong>You said:</strong><br>
+                    <span style='font-family: monospace; background: white; padding: 8px; border-radius: 4px; display: block; margin-top: 5px;'>{actual_hk}</span>
+                </div>
+            </div>
+        </div>
+        """
+    feedback_html += "</div>"
+    return feedback_html, accuracy
 # ---------------- MAIN ---------------- #
+@spaces.GPU
+def compare_pronunciation(audio, lang_choice, intended_sentence, pass1_beam, pass1_temp):
+    """Transcribe ``audio`` and score it against ``intended_sentence``.
+
+    Args:
+        audio: Recording handed to ``transcribe_once``; ``None`` if nothing was recorded.
+        lang_choice: Language name used for transcription and transliteration.
+        intended_sentence: Target sentence the user was asked to read.
+        pass1_beam: Beam size forwarded to ``transcribe_once``.
+        pass1_temp: Temperature forwarded to ``transcribe_once``.
+
+    Returns:
+        5-tuple of strings: the transcription, its Roman transliteration,
+        word error rate and character error rate (both formatted as
+        percentages), and the HTML report. On missing input, empty
+        transcription, or any exception, the first element is a
+        warning/error message and the other four are ``""``.
+    """
     if audio is None or not intended_sentence.strip():
+        return ("⚠️ Please record audio and generate a sentence first.", "", "", "", "")
+    try:
+        # Single transcription pass with user settings
+        actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)
+        if not actual_text.strip():
+            return ("⚠️ No speech detected. Please try recording again.", "", "", "", "")
+        # Compute metrics
+        wer_val = jiwer.wer(intended_sentence, actual_text)
+        cer_val = jiwer.cer(intended_sentence, actual_text)
+        # Only the transcription's transliteration is displayed on its own;
+        # the target's transliteration is produced inside the report.
+        actual_roman = transliterate_to_roman(actual_text, lang_choice)
+        # Create comprehensive tabular feedback (the accuracy figure is
+        # already embedded in the HTML, so the second value is not needed)
+        feedback_html, _ = create_tabular_feedback(intended_sentence, actual_text, lang_choice)
+        return (
+            actual_text,
+            actual_roman,
+            f"{wer_val:.1%}",
+            f"{cer_val:.1%}",
+            feedback_html
+        )
+    except Exception as e:
+        # UI boundary: report the failure in the first output instead of crashing
+        error_msg = f"❌ Error during transcription: {str(e)}"
+        print(error_msg)
+        return (error_msg, "", "", "", "")
 # ---------------- UI ---------------- #
+with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🎙️ AI Pronunciation Coach
+    ### Practice English, Tamil & Malayalam with AI feedback
+    **How to use:**
+    1. Select your language
+    2. Generate a practice sentence
+    3. Record yourself reading it aloud
+    4. Get instant feedback on your pronunciation!
+    """)
+    # Language selector next to the button that requests a practice sentence.
     with gr.Row():
+        with gr.Column(scale=2):
+            lang_choice = gr.Dropdown(
+                choices=list(LANG_CODES.keys()),
+                value="Malayalam",
+                label="🌍 Choose Language"
+            )
+        with gr.Column(scale=1):
+            gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")
+    # Read-only: filled by get_random_sentence through gen_btn below.
+    intended_display = gr.Textbox(
+        label="📝 Practice Sentence (Read this aloud)",
+        interactive=False,
+        placeholder="Click 'Generate Practice Sentence' to get started..."
+    )
+    # Microphone recorder beside the decoding controls passed to compare_pronunciation.
     with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="filepath",
+                label="🎤 Record Your Pronunciation"
+            )
+        with gr.Column():
+            gr.Markdown("### ⚙️ Advanced Settings")
+            pass1_beam = gr.Slider(1, 10, value=5, step=1, label="Beam Size (accuracy vs speed)")
+            pass1_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature (creativity)")
+    analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")
+    # Result fields populated by the analyze button.
     with gr.Row():
+        with gr.Column():
+            pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
+            actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Roman)", interactive=False)
+        with gr.Column():
+            wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
+            cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
+    gr.Markdown("### 📋 Detailed Analysis")
+    feedback_display = gr.HTML()
     # Event handlers
+    gen_btn.click(
+        fn=get_random_sentence,
+        inputs=[lang_choice],
+        outputs=[intended_display]
+    )
+    # The five outputs receive, in order, the five values returned by compare_pronunciation.
+    analyze_btn.click(
         fn=compare_pronunciation,
+        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
+        outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display]
     )
 if __name__ == "__main__":