Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Sleeping

App Files Files Community

sudhanm committed on 6 days ago

Commit

35b317d

verified ·

1 Parent(s): 57ea064

Update app.py

Browse files

Files changed (1) hide show

app.py +391 -203

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import random
 import difflib
 import re
 import jiwer
 import torch
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
@@ -74,6 +75,266 @@ SENTENCE_BANK = {
     ]
 }
 # ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
 # Store only currently loaded model to save memory
 current_model = {"language": None, "model": None, "processor": None}
@@ -139,8 +400,11 @@ def get_random_sentence(language_choice):
 def get_random_sentence_with_transliteration(language_choice):
     sentence = random.choice(SENTENCE_BANK[language_choice])
     if language_choice in ["Tamil", "Malayalam"]:
-        transliteration = transliterate_to_simple_roman(sentence, language_choice)
-        return sentence, transliteration
     else:
         return sentence, ""
@@ -171,88 +435,12 @@ def transliterate_to_hk(text, lang_choice):
         print(f"Transliteration error: {e}")
         return text
 def transliterate_to_simple_roman(text, lang_choice):
-    """Transliterate to Thanglish/Manglish - natural romanization used by speakers"""
-    if not text or not text.strip():
-        return ""
-    if lang_choice == "English":
-        return text  # Return as-is for English
-    try:
-        # First get IAST, then convert to natural romanization
-        if lang_choice == "Tamil":
-            iast_text = transliterate(text, sanscript.TAMIL, sanscript.IAST)
-        elif lang_choice == "Malayalam":
-            iast_text = transliterate(text, sanscript.MALAYALAM, sanscript.IAST)
-        else:
-            return text
-        # Comprehensive cleanup to remove ALL diacritics and make it natural
-        natural_map = {
-            # Vowels with diacritics
-            'ā': 'a', 'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
-            'ī': 'i', 'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
-            'ū': 'u', 'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
-            'ē': 'e', 'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
-            'ō': 'o', 'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
-            # Consonants with diacritics
-            'ṅ': 'ng', 'ň': 'n', 'ñ': 'nj', 'ń': 'n',
-            'ṭ': 't', 'ť': 't', 'ţ': 't',
-            'ḍ': 'd', 'ď': 'd', 'ḏ': 'd',
-            'ṇ': 'n', 'ņ': 'n', 'ṉ': 'n',
-            'ṟ': 'r', 'ř': 'r', 'ŕ': 'r', 'ṛ': 'ru',
-            'ḷ': 'l', 'ľ': 'l', 'ļ': 'l', 'ḻ': 'zh',
-            'ś': 'sh', 'š': 'sh', 'ṣ': 'sh', 'ş': 's',
-            'ḥ': 'h', 'ḫ': 'h', 'ħ': 'h',
-            'ṃ': 'm', 'ṁ': 'm', 'ḿ': 'm',
-            'ç': 'ch', 'č': 'ch',
-            # Vocalic consonants
-            'r̥': 'ri', 'r̥̄': 'ri',
-            'l̥': 'li', 'l̥̄': 'li',
-            # Common combinations
-            'kṣ': 'ksh', 'jñ': 'gn', 'śr': 'shr',
-            # Remove virama and other marks
-            '·': '', '̥': '', '̄': '', '̃': '', '̂': '', '̀': '', '́': '',
-            # Double letters cleanup
-            'aa': 'a', 'ii': 'i', 'uu': 'u', 'ee': 'e', 'oo': 'o'
-        }
-        natural_text = iast_text
-        # Apply all mappings
-        for iast, natural in natural_map.items():
-            natural_text = natural_text.replace(iast, natural)
-        # Additional cleanup passes for any remaining diacritics
-        import unicodedata
-        # Remove all combining diacritical marks
-        natural_text = ''.join(c for c in unicodedata.normalize('NFD', natural_text)
-                              if unicodedata.category(c) != 'Mn')
-        # Fix common Malayalam/Tamil patterns
-        natural_text = natural_text.replace('zhz', 'zh')  # Double zh fix
-        natural_text = natural_text.replace('nnn', 'nn')  # Triple n fix
-        natural_text = natural_text.replace('lll', 'll')  # Triple l fix
-        natural_text = natural_text.replace('tth', 'th')  # Simplify aspirated
-        natural_text = natural_text.replace('ddh', 'dh')  # Simplify aspirated
-        # Make it more natural for Manglish/Thanglish
-        if lang_choice == "Malayalam":
-            natural_text = natural_text.replace('samgitam', 'sangeetham')
-            natural_text = natural_text.replace('jivitattinre', 'jeevitathinte')
-            natural_text = natural_text.replace('bhagaman', 'bhagamaanu')
-        return natural_text if natural_text else text
-    except Exception as e:
-        print(f"Transliteration error: {e}")
-        return text
 @spaces.GPU
 def transcribe_once(audio_path, language_choice, beam_size, temperature):
@@ -305,14 +493,14 @@ def normalize_word(word):
     # Remove punctuation and whitespace
     return word.strip().translate(str.maketrans('', '', string.punctuation)).lower()
-def create_tabular_feedback(intended, actual, lang_choice):
-    """Create clean, readable tabular feedback without background colors"""
-    # Get simple transliterations
-    intended_roman = transliterate_to_simple_roman(intended, lang_choice)
-    actual_roman = transliterate_to_simple_roman(actual, lang_choice)
-    intended_hk = transliterate_to_hk(intended, lang_choice)
-    actual_hk = transliterate_to_hk(actual, lang_choice)
     # Split into words for comparison
     intended_words = intended.strip().split()
@@ -320,38 +508,26 @@ def create_tabular_feedback(intended, actual, lang_choice):
     intended_roman_words = intended_roman.strip().split()
     actual_roman_words = actual_roman.strip().split()
-    # Calculate accuracy
     correct_words = 0
     total_words = len(intended_words)
     # Create word-by-word comparison table
     feedback_html = """
     <div style='font-family: Arial, sans-serif; padding: 20px; margin: 10px 0;'>
-        <h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Pronunciation Analysis</h3>
     """
-    # Show simple transliteration of target sentence for easier reading
-    if lang_choice in ["Tamil", "Malayalam"]:
-        feedback_html += f"""
-        <div style='margin-bottom: 25px; padding: 15px; border: 2px solid #3498db; border-radius: 8px; background: #f8f9fa;'>
-            <h4 style='color: #3498db; margin-bottom: 10px;'>🎯 Target Sentence (Reading Guide)</h4>
-            <div style='font-size: 20px; font-family: monospace; color: #2c3e50; line-height: 1.4;'>
-                <strong>Original:</strong> {intended}<br>
-                <strong>Romanized:</strong> <span style='color: #e67e22; font-weight: bold;'>{intended_roman}</span>
-            </div>
-        </div>
-        """
-    # Overview table - completely clean
     feedback_html += """
     <div style='margin-bottom: 25px;'>
-        <h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison</h4>
         <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
             <thead>
                 <tr style='border-bottom: 2px solid #ddd;'>
                     <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
                     <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
-                    <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Romanized</th>
                 </tr>
             </thead>
             <tbody>
@@ -370,31 +546,30 @@ def create_tabular_feedback(intended, actual, lang_choice):
     </div>
     """.format(intended, intended_roman, actual, actual_roman)
-    # Word-by-word analysis - clean table
     feedback_html += """
     <div style='margin-bottom: 25px;'>
-        <h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Word-by-Word Check</h4>
         <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
             <thead>
                 <tr style='border-bottom: 2px solid #ddd;'>
                     <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>#</th>
                     <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Expected Word</th>
                     <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>What You Said</th>
                     <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50;'>Result</th>
                 </tr>
             </thead>
             <tbody>
     """
-    # Compare words using difflib with normalized comparison
-    normalized_intended = [normalize_word(w) for w in intended_words]
-    normalized_actual = [normalize_word(w) for w in actual_words]
-    sm = difflib.SequenceMatcher(None, normalized_intended, normalized_actual)
     word_index = 0
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            # Correct words - clean white background
             for idx, word in enumerate(intended_words[i1:i2]):
                 word_index += 1
                 correct_words += 1
@@ -413,15 +588,18 @@ def create_tabular_feedback(intended, actual, lang_choice):
                         <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #27ae60;'>{actual_word}</div>
                         <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
                     </td>
                     <td style='padding: 12px; text-align: center;'>
                         <span style='color: #27ae60; font-weight: bold; font-size: 20px;'>✓</span>
-                        <div style='font-size: 12px; color: #27ae60; margin-top: 2px;'>Correct</div>
                     </td>
                 </tr>
                 """
         elif tag == 'replace':
-            # Incorrect words - clean white with colored text only
             max_words = max(i2-i1, j2-j1)
             for idx in range(max_words):
                 word_index += 1
@@ -430,6 +608,34 @@ def create_tabular_feedback(intended, actual, lang_choice):
                 actual_word = actual_words[j1 + idx] if (j1 + idx) < j2 else ""
                 actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
                 feedback_html += f"""
                 <tr style='border-bottom: 1px solid #eee;'>
                     <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
@@ -438,12 +644,15 @@ def create_tabular_feedback(intended, actual, lang_choice):
                         <div style='font-size: 13px; color: #888;'>({expected_roman})</div>
                     </td>
                     <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #e74c3c;'>{actual_word}</div>
                         <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
                     </td>
                     <td style='padding: 12px; text-align: center;'>
-                        <span style='color: #e74c3c; font-weight: bold; font-size: 20px;'>✗</span>
-                        <div style='font-size: 12px; color: #e74c3c; margin-top: 2px;'>Different</div>
                     </td>
                 </tr>
                 """
@@ -463,6 +672,9 @@ def create_tabular_feedback(intended, actual, lang_choice):
                     <td style='padding: 12px; color: #f39c12; font-style: italic; border-right: 1px solid #ddd;'>
                         <em>Not spoken</em>
                     </td>
                     <td style='padding: 12px; text-align: center;'>
                         <span style='color: #f39c12; font-weight: bold; font-size: 20px;'>⚠</span>
                         <div style='font-size: 12px; color: #f39c12; margin-top: 2px;'>Missing</div>
@@ -484,6 +696,9 @@ def create_tabular_feedback(intended, actual, lang_choice):
                         <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #9b59b6;'>{word}</div>
                         <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
                     </td>
                     <td style='padding: 12px; text-align: center;'>
                         <span style='color: #9b59b6; font-weight: bold; font-size: 20px;'>+</span>
                         <div style='font-size: 12px; color: #9b59b6; margin-top: 2px;'>Extra</div>
@@ -497,74 +712,57 @@ def create_tabular_feedback(intended, actual, lang_choice):
     </div>
     """
-    # Calculate accuracy
     accuracy = (correct_words / total_words * 100) if total_words > 0 else 0
-    # Clean summary section
     feedback_html += f"""
     <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 25px; border-radius: 12px; text-align: center; margin-top: 20px;'>
-        <h4 style='margin: 0 0 20px 0; font-size: 24px;'>🎯 Your Score</h4>
         <div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 20px;'>
             <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
                 <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{accuracy:.0f}%</div>
-                <div style='font-size: 16px; opacity: 0.9;'>Accuracy</div>
             </div>
             <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
-                <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{correct_words}/{total_words}</div>
-                <div style='font-size: 16px; opacity: 0.9;'>Words Correct</div>
             </div>
         </div>
-        <div style='margin-top: 20px; font-size: 18px;'>
     """
-    # Simple motivational message
     if accuracy >= 95:
-        feedback_html += "<span>🎉 Perfect! Outstanding pronunciation!</span>"
     elif accuracy >= 85:
-        feedback_html += "<span>🌟 Excellent! Very clear speaking!</span>"
     elif accuracy >= 70:
-        feedback_html += "<span>👍 Good job! Keep practicing!</span>"
     elif accuracy >= 50:
-        feedback_html += "<span>📚 Getting better! Focus on the red words!</span>"
     else:
-        feedback_html += "<span>💪 Keep going! Practice makes perfect!</span>"
-    feedback_html += """
-        </div>
-    </div>
-    """
-    # Optional technical section (collapsed)
-    if lang_choice in ["Tamil", "Malayalam"]:
-        feedback_html += f"""
-        <details style='margin-top: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 8px;'>
-            <summary style='cursor: pointer; font-weight: bold; color: #2c3e50; padding: 5px;'>🔧 Technical Details (for experts)</summary>
-            <div style='margin-top: 15px; display: grid; grid-template-columns: 1fr 1fr; gap: 15px;'>
-                <div>
-                    <strong>Expected (Harvard-Kyoto):</strong><br>
-                    <span style='font-family: monospace; background: #f5f5f5; padding: 8px; border-radius: 4px; display: block; margin-top: 5px;'>{intended_hk}</span>
-                </div>
-                <div>
-                    <strong>You said (Harvard-Kyoto):</strong><br>
-                    <span style='font-family: monospace; background: #f5f5f5; padding: 8px; border-radius: 4px; display: block; margin-top: 5px;'>{actual_hk}</span>
-                </div>
-            </div>
-        </details>
-        """
-    feedback_html += "</div>"
     return feedback_html, accuracy
 # ---------------- MAIN ---------------- #
 @spaces.GPU
-def compare_pronunciation(audio, lang_choice, intended_sentence, pass1_beam, pass1_temp):
-    if audio is None or not intended_sentence.strip():
         return ("⚠️ Please record audio and generate a sentence first.", "", "", "", "")
     try:
         # Single transcription pass with user settings
         actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)
@@ -575,12 +773,12 @@ def compare_pronunciation(audio, lang_choice, intended_sentence, pass1_beam, pas
         wer_val = jiwer.wer(intended_sentence, actual_text)
         cer_val = jiwer.cer(intended_sentence, actual_text)
-        # Get transliterations for both texts
-        intended_roman = transliterate_to_simple_roman(intended_sentence, lang_choice)
-        actual_roman = transliterate_to_simple_roman(actual_text, lang_choice)
-        # Create comprehensive tabular feedback
-        feedback_html, accuracy = create_tabular_feedback(intended_sentence, actual_text, lang_choice)
         return (
             actual_text,
@@ -598,14 +796,19 @@ def compare_pronunciation(audio, lang_choice, intended_sentence, pass1_beam, pas
 # ---------------- UI ---------------- #
 with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎙️ AI Pronunciation Coach
     ### Practice English, Tamil & Malayalam with AI feedback
     **How to use:**
     1. Select your language
-    2. Generate a practice sentence
     3. Record yourself reading it aloud
-    4. Get instant feedback on your pronunciation!
     """)
     with gr.Row():
@@ -621,15 +824,8 @@ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo
     intended_display = gr.Textbox(
         label="📝 Practice Sentence (Read this aloud)",
         interactive=False,
-        placeholder="Click 'Generate Practice Sentence' to get started..."
-    )
-    intended_transliteration = gr.Textbox(
-        label="🔤 Pronunciation Guide",
-        interactive=False,
-        placeholder="Pronunciation guide will appear here...",
-        visible=False,
-        lines=1
     )
     with gr.Row():
@@ -649,38 +845,30 @@ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo
     with gr.Row():
         with gr.Column():
             pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
-            actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
         with gr.Column():
             wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
             cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
-    gr.Markdown("### 📋 Detailed Analysis")
     feedback_display = gr.HTML()
-    # Event handlers
-    def update_transliteration_visibility(language_choice):
-        if language_choice in ["Tamil", "Malayalam"]:
-            return gr.update(visible=True)
-        else:
-            return gr.update(visible=False, value="")
-    lang_choice.change(
-        fn=update_transliteration_visibility,
-        inputs=[lang_choice],
-        outputs=[intended_transliteration]
-    )
-    gen_btn.click(
-        fn=get_random_sentence_with_transliteration,
-        inputs=[lang_choice],
-        outputs=[intended_display, intended_transliteration]
-    )
-    analyze_btn.click(
-        fn=compare_pronunciation,
-        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
-        outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display]
-    )
 if __name__ == "__main__":
     demo.launch()

 import random
 import difflib
 import re
+import unicodedata
 import jiwer
 import torch
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
     ]
 }
+# ---------------- IMPROVED TRANSLITERATION SYSTEM ---------------- #
def transliterate_to_natural_roman(text, lang_choice):
    """
    Romanize Tamil or Malayalam text into informal Latin spelling
    (Thanglish/Manglish).

    Empty or whitespace-only input yields "". Text for any language other
    than Tamil or Malayalam (English included) is returned unchanged. If
    any stage raises, the error is printed and the original text is
    returned, so the caller always receives a displayable string.
    """
    if not text or not text.strip():
        return ""
    if lang_choice not in ("Tamil", "Malayalam"):
        return text

    try:
        source_scheme = (
            sanscript.TAMIL if lang_choice == "Tamil" else sanscript.MALAYALAM
        )
        # Stage 1: native script -> sanscript's ISO scheme.
        iso_text = transliterate(text, source_scheme, sanscript.ISO)
        # Stages 2-4: ASCII phonetics, per-language tweaks, final cleanup.
        romanized = optimize_natural_flow(
            apply_natural_language_patterns(
                apply_systematic_phonetic_rules(iso_text),
                lang_choice,
            )
        )
        # Never hand back an empty guide for non-empty input.
        return romanized or text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
# ISO-romanization token -> plain ASCII spelling. Keys are written with
# explicit code points so the table does not depend on how this source file
# happens to be Unicode-normalized. Multi-character keys are matched before
# their single-character prefixes (see _ISO_TOKEN_RE below).
_ISO_TO_ASCII = {
    # Long vowels are doubled.
    "\u0101": "aa",          # ā
    "\u012b": "ii",          # ī
    "\u016b": "uu",          # ū
    "\u0113": "ee",          # ē
    "\u014d": "oo",          # ō
    # Vocalic r / l (base letter + combining ring below [+ macron]).
    "r\u0325\u0304": "ruu",  # long vocalic r
    "r\u0325": "ru",         # vocalic r
    "l\u0325\u0304": "luu",  # long vocalic l
    "l\u0325": "lu",         # vocalic l
    # Nasals.
    "\u1e45": "ng",          # ṅ velar nasal
    "\u00f1": "nj",          # ñ palatal nasal
    "\u1e47": "n",           # ṇ retroflex nasal
    "n\u0306": "n",          # n + combining breve
    # The jñ cluster must win over the plain ñ rule.
    "j\u00f1": "gn",         # jñ
    # Retroflex stops lose the dot but keep voicing, aspirated or not.
    "\u1e6dh": "th",         # ṭh
    "\u1e0dh": "dh",         # ḍh
    "\u1e6d": "t",           # ṭ
    "\u1e0d": "d",           # ḍ
    # Liquids.
    "\u1e5f": "r",           # ṟ
    "\u1e5b": "r",           # ṛ
    "\u1e37": "l",           # ḷ
    "\u1e3b": "zh",          # ḻ
    # Sibilants (kṣ and śr need no entries: k/r pass through unchanged).
    "\u015b": "sh",          # ś
    "\u1e63": "sh",          # ṣ
    # h variants and a stray sign the old rule set also removed.
    "\u1e25": "h",           # ḥ visarga
    "\u1e2b": "h",           # ḫ
    "\u00d7": "",            # × multiplication sign
}

# Longest alternatives first so that e.g. "jñ" and "r̥̄" are consumed as a
# unit instead of being split by the shorter "ñ" / "r̥" rules.
_ISO_TOKEN_RE = re.compile(
    "|".join(
        re.escape(token)
        for token in sorted(_ISO_TO_ASCII, key=len, reverse=True)
    )
)


def apply_systematic_phonetic_rules(iso_text):
    """
    Convert ISO-style romanization (with diacritics) to plain ASCII phonetics.

    The input is first normalized to NFC so precomposed and decomposed
    spellings of the same letter are treated alike. Every known token is then
    replaced in a single left-to-right pass, preferring the longest match.
    Characters without a rule are passed through untouched.

    Args:
        iso_text: romanized string, e.g. the output of the ISO scheme.

    Returns:
        The string with all mapped tokens replaced by ASCII spellings.
    """
    composed = unicodedata.normalize("NFC", iso_text)
    return _ISO_TOKEN_RE.sub(
        lambda match: _ISO_TO_ASCII[match.group(0)], composed
    )
def apply_natural_language_patterns(text, lang_choice):
    """
    Route romanized text to the language-specific adjustment step.

    Malayalam and Tamil each have their own helper; text for any other
    language is handed back untouched.
    """
    if lang_choice == "Malayalam":
        return apply_malayalam_natural_patterns(text)
    if lang_choice == "Tamil":
        return apply_tamil_natural_patterns(text)
    return text
def apply_tamil_natural_patterns(text):
    """
    Apply Tamil-specific romanization fixes to already-romanized text.

    Only two substitutions are performed, for letters that may still carry
    their ISO diacritic at this point:
      * "ḻ" (U+1E3B) -> "zh"
      * "ṟ" (U+1E5F) -> "r"

    The earlier rule set also contained a "vowel harmony" rule that rewrote
    vowel + "u" + m/n/l/r as vowel + m/n/l/r + "u". After long vowels are
    doubled this transposed letters in ordinary words ("nuul" -> "nulu",
    "uur" -> "uru"), so it has been removed. The remaining old rules replaced
    a match with itself and had no effect.
    """
    return text.replace("\u1e3b", "zh").replace("\u1e5f", "r")
# Per-character replacements for letters that may still carry an ISO
# diacritic when the Malayalam step runs.
_MALAYALAM_LETTER_MAP = str.maketrans({
    "\u1e3b": "zh",  # ḻ -> zh
    "\u1e5f": "r",   # ṟ -> r
})


def apply_malayalam_natural_patterns(text):
    """
    Apply Malayalam-specific romanization fixes to already-romanized text.

    Rewrites "ḻ" as "zh" and "ṟ" as "r". Geminated consonants, the
    -aanu / -unnu / -aam endings and the "ngh" / "mph" clusters are
    deliberately left exactly as they arrive.
    """
    return text.translate(_MALAYALAM_LETTER_MAP)
def optimize_natural_flow(text):
    """
    Final cleanup pass for romanized text.

    Steps, in order:
      1. Decompose to NFD and drop every combining mark (category "Mn"),
         which strips any diacritic the earlier stages did not map.
      2. Collapse runs of three or more identical lowercase ASCII letters
         to exactly two ("kaaaal" -> "kaal", "kannnu" -> "kannu").
      3. Turn a "y" squeezed between two consonants into "i".
      4. Collapse whitespace runs to single spaces and trim both ends.

    An "h" between two vowels is kept: it is a real consonant, and deleting
    it (as an earlier rule did) produced spellings such as "maaan" for
    "mahaan".

    Args:
        text: romanized string, possibly still containing diacritics.

    Returns:
        The cleaned-up string.
    """
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )

    # Vowels and consonants share one rule: at most two in a row.
    text = re.sub(r'([a-z])\1{2,}', r'\1\1', text)

    # "y" acting as a vowel inside a consonant cluster reads better as "i".
    text = re.sub(
        r'([bcdfghjklmnpqrstvwxyz])y([bcdfghjklmnpqrstvwxyz])',
        r'\1i\2',
        text,
    )

    # split()/join normalizes interior whitespace and trims in one step.
    return ' '.join(text.split())
def enhanced_phonetic_similarity_check(intended_roman, actual_roman):
    """
    Bring two romanized strings to a shared canonical spelling.

    Both strings are passed, intended first, through
    normalize_for_comparison with the same table of interchangeable
    spellings, and the pair of results is returned as
    (intended_normalized, actual_normalized).
    """
    # canonical spelling -> spellings treated as the same sound
    spelling_classes = {
        'aa': ['a', 'aa'],
        'ii': ['i', 'ii'],
        'uu': ['u', 'uu'],
        'ee': ['e', 'ee'],
        'oo': ['o', 'oo'],
        'zh': ['zh', 'z', 'l'],
        'sh': ['sh', 's'],
        'ch': ['ch', 'c'],
        'th': ['th', 't'],
        'dh': ['dh', 'd'],
        'ksh': ['ksh', 'ksh', 'ks'],
        'gn': ['gn', 'ny', 'nj'],
    }
    return tuple(
        normalize_for_comparison(romanized, spelling_classes)
        for romanized in (intended_roman, actual_roman)
    )
def normalize_for_comparison(text, equivalents):
    """
    Rewrite every interchangeable spelling in *text* to its canonical form.

    The text is lower-cased and stripped, then scanned once from left to
    right. At each position the longest known spelling wins, and replaced
    text is never rescanned. This keeps already-canonical spellings stable
    ("aa" stays "aa" rather than growing to "aaaa") and makes the function
    idempotent.

    Args:
        text: romanized string to normalize.
        equivalents: mapping of canonical spelling -> iterable of variant
            spellings. A canonical spelling always maps to itself. If the
            same spelling appears under several canonical forms, the first
            one in mapping order is used. Empty spellings are ignored.

    Returns:
        The normalized string.
    """
    text = text.lower().strip()

    canonical_of = {}
    for canonical, variants in equivalents.items():
        for spelling in (canonical, *variants):
            if spelling:
                canonical_of.setdefault(spelling, canonical)
    if not canonical_of:
        return text

    # Longest alternatives first so "ksh" is not split into "k" + "sh".
    longest_first = sorted(canonical_of, key=len, reverse=True)
    matcher = re.compile("|".join(map(re.escape, longest_first)))
    return matcher.sub(lambda match: canonical_of[match.group(0)], text)
 # ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
 # Store only currently loaded model to save memory
 current_model = {"language": None, "model": None, "processor": None}
 def get_random_sentence_with_transliteration(language_choice):
     sentence = random.choice(SENTENCE_BANK[language_choice])
     if language_choice in ["Tamil", "Malayalam"]:
+        # Use the new improved transliteration system
+        transliteration = transliterate_to_natural_roman(sentence, language_choice)
+        # Combine sentence with transliteration in the same box
+        combined_sentence = f"{sentence}\n\n🔤 {transliteration}"
+        return combined_sentence, transliteration
     else:
         return sentence, ""
         print(f"Transliteration error: {e}")
         return text
+# Updated function that uses the new transliteration system
 def transliterate_to_simple_roman(text, lang_choice):
+    """
+    IMPROVED VERSION: Natural transliteration using systematic phonetic rules
+    """
+    return transliterate_to_natural_roman(text, lang_choice)
 @spaces.GPU
 def transcribe_once(audio_path, language_choice, beam_size, temperature):
     # Remove punctuation and whitespace
     return word.strip().translate(str.maketrans('', '', string.punctuation)).lower()
+def create_enhanced_tabular_feedback(intended, actual, lang_choice):
+    """
+    Enhanced feedback system with better phonetic comparison
+    """
+    # Get natural transliterations using the new system
+    intended_roman = transliterate_to_natural_roman(intended, lang_choice)
+    actual_roman = transliterate_to_natural_roman(actual, lang_choice)
     # Split into words for comparison
     intended_words = intended.strip().split()
     intended_roman_words = intended_roman.strip().split()
     actual_roman_words = actual_roman.strip().split()
+    # Calculate accuracy with phonetic awareness
     correct_words = 0
     total_words = len(intended_words)
     # Create word-by-word comparison table
     feedback_html = """
     <div style='font-family: Arial, sans-serif; padding: 20px; margin: 10px 0;'>
+        <h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Enhanced Pronunciation Analysis</h3>
     """
+    # Overview table with improved romanization
     feedback_html += """
     <div style='margin-bottom: 25px;'>
+        <h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison (Improved Natural Romanization)</h4>
         <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
             <thead>
                 <tr style='border-bottom: 2px solid #ddd;'>
                     <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
                     <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
+                    <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Natural Romanization</th>
                 </tr>
             </thead>
             <tbody>
     </div>
     """.format(intended, intended_roman, actual, actual_roman)
+    # Enhanced word-by-word analysis with phonetic awareness
     feedback_html += """
     <div style='margin-bottom: 25px;'>
+        <h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Enhanced Word-by-Word Analysis</h4>
         <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
             <thead>
                 <tr style='border-bottom: 2px solid #ddd;'>
                     <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>#</th>
                     <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Expected Word</th>
                     <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>What You Said</th>
+                    <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Phonetic Match</th>
                     <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50;'>Result</th>
                 </tr>
             </thead>
             <tbody>
     """
+    # Enhanced word comparison with phonetic similarity
+    sm = difflib.SequenceMatcher(None, intended_words, actual_words)
     word_index = 0
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
+            # Correct words
             for idx, word in enumerate(intended_words[i1:i2]):
                 word_index += 1
                 correct_words += 1
                         <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #27ae60;'>{actual_word}</div>
                         <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
                     </td>
+                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
+                        <span style='color: #27ae60; font-weight: bold;'>Perfect</span>
+                    </td>
                     <td style='padding: 12px; text-align: center;'>
                         <span style='color: #27ae60; font-weight: bold; font-size: 20px;'>✓</span>
+                        <div style='font-size: 12px; color: #27ae60; margin-top: 2px;'>Exact</div>
                     </td>
                 </tr>
                 """
         elif tag == 'replace':
+            # Check for phonetic similarity in replacements
             max_words = max(i2-i1, j2-j1)
             for idx in range(max_words):
                 word_index += 1
                 actual_word = actual_words[j1 + idx] if (j1 + idx) < j2 else ""
                 actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
+                # Check phonetic similarity
+                if expected_roman and actual_roman_word:
+                    norm_expected, norm_actual = enhanced_phonetic_similarity_check(expected_roman, actual_roman_word)
+                    similarity_ratio = difflib.SequenceMatcher(None, norm_expected, norm_actual).ratio()
+                    if similarity_ratio > 0.8:  # High phonetic similarity
+                        phonetic_match = "Very Close"
+                        phonetic_color = "#f39c12"
+                        result_icon = "≈"
+                        result_text = "Similar"
+                        correct_words += 0.8  # Partial credit
+                    elif similarity_ratio > 0.6:  # Moderate similarity
+                        phonetic_match = "Close"
+                        phonetic_color = "#e67e22"
+                        result_icon = "~"
+                        result_text = "Close"
+                        correct_words += 0.5  # Partial credit
+                    else:
+                        phonetic_match = "Different"
+                        phonetic_color = "#e74c3c"
+                        result_icon = "✗"
+                        result_text = "Different"
+                else:
+                    phonetic_match = "Different"
+                    phonetic_color = "#e74c3c"
+                    result_icon = "✗"
+                    result_text = "Different"
                 feedback_html += f"""
                 <tr style='border-bottom: 1px solid #eee;'>
                     <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
                         <div style='font-size: 13px; color: #888;'>({expected_roman})</div>
                     </td>
                     <td style='padding: 12px; border-right: 1px solid #ddd;'>
+                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: {phonetic_color};'>{actual_word}</div>
                         <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
                     </td>
+                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
+                        <span style='color: {phonetic_color}; font-weight: bold;'>{phonetic_match}</span>
+                    </td>
                     <td style='padding: 12px; text-align: center;'>
+                        <span style='color: {phonetic_color}; font-weight: bold; font-size: 20px;'>{result_icon}</span>
+                        <div style='font-size: 12px; color: {phonetic_color}; margin-top: 2px;'>{result_text}</div>
                     </td>
                 </tr>
                 """
                     <td style='padding: 12px; color: #f39c12; font-style: italic; border-right: 1px solid #ddd;'>
                         <em>Not spoken</em>
                     </td>
+                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
+                        <span style='color: #f39c12; font-weight: bold;'>Missing</span>
+                    </td>
                     <td style='padding: 12px; text-align: center;'>
                         <span style='color: #f39c12; font-weight: bold; font-size: 20px;'>⚠</span>
                         <div style='font-size: 12px; color: #f39c12; margin-top: 2px;'>Missing</div>
                         <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #9b59b6;'>{word}</div>
                         <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
                     </td>
+                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
+                        <span style='color: #9b59b6; font-weight: bold;'>Extra</span>
+                    </td>
                     <td style='padding: 12px; text-align: center;'>
                         <span style='color: #9b59b6; font-weight: bold; font-size: 20px;'>+</span>
                         <div style='font-size: 12px; color: #9b59b6; margin-top: 2px;'>Extra</div>
     </div>
     """
+    # Calculate enhanced accuracy
     accuracy = (correct_words / total_words * 100) if total_words > 0 else 0
+    # Enhanced summary section
     feedback_html += f"""
     <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 25px; border-radius: 12px; text-align: center; margin-top: 20px;'>
+        <h4 style='margin: 0 0 20px 0; font-size: 24px;'>🎯 Enhanced Pronunciation Score</h4>
         <div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 20px;'>
             <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
                 <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{accuracy:.0f}%</div>
+                <div style='font-size: 16px; opacity: 0.9;'>Phonetic Accuracy</div>
             </div>
             <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
+                <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{correct_words:.1f}/{total_words}</div>
+                <div style='font-size: 16px; opacity: 0.9;'>Words Matched</div>
             </div>
         </div>
+        <div style='margin-top: 15px; font-size: 14px; opacity: 0.8;'>
+            ✨ Now with enhanced phonetic matching for better accuracy!
+        </div>
     """
+    # Enhanced motivational message
     if accuracy >= 95:
+        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>🎉 Outstanding! Perfect natural pronunciation!</span></div>"
     elif accuracy >= 85:
+        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>🌟 Excellent! Very natural sounding!</span></div>"
     elif accuracy >= 70:
+        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>👍 Good job! Your pronunciation is improving!</span></div>"
     elif accuracy >= 50:
+        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>📚 Getting there! Focus on the highlighted sounds!</span></div>"
     else:
+        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>💪 Keep practicing! Every attempt makes you better!</span></div>"
+    feedback_html += "</div></div>"
     return feedback_html, accuracy
 # ---------------- MAIN ---------------- #
 @spaces.GPU
+def compare_pronunciation(audio, lang_choice, intended_display_text, pass1_beam, pass1_temp):
+    if audio is None or not intended_display_text.strip():
         return ("⚠️ Please record audio and generate a sentence first.", "", "", "", "")
     try:
+        # Extract just the original sentence (before the transliteration part)
+        if "🔤" in intended_display_text:
+            intended_sentence = intended_display_text.split("🔤")[0].strip()
+        else:
+            intended_sentence = intended_display_text.strip()
         # Single transcription pass with user settings
         actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)
         wer_val = jiwer.wer(intended_sentence, actual_text)
         cer_val = jiwer.cer(intended_sentence, actual_text)
+        # Get improved transliterations for both texts
+        intended_roman = transliterate_to_natural_roman(intended_sentence, lang_choice)
+        actual_roman = transliterate_to_natural_roman(actual_text, lang_choice)
+        # Create enhanced tabular feedback with phonetic awareness
+        feedback_html, accuracy = create_enhanced_tabular_feedback(intended_sentence, actual_text, lang_choice)
         return (
             actual_text,
 # ---------------- UI ---------------- #
 with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 🎙️ AI Pronunciation Coach (Enhanced)
     ### Practice English, Tamil & Malayalam with AI feedback
+    **New Features:**
+    - ✨ **Natural Romanization**: Improved Thanglish/Manglish that looks like how you actually type
+    - 🎯 **Phonetic Matching**: Gives partial credit for sounds that are close (zh/z/l variations)
+    - 📊 **Enhanced Feedback**: More accurate scoring with linguistic awareness
     **How to use:**
     1. Select your language
+    2. Generate a practice sentence
     3. Record yourself reading it aloud
+    4. Get instant enhanced feedback on your pronunciation!
     """)
     with gr.Row():
     intended_display = gr.Textbox(
         label="📝 Practice Sentence (Read this aloud)",
         interactive=False,
+        placeholder="Click 'Generate Practice Sentence' to get started...",
+        lines=3
     )
     with gr.Row():
     with gr.Row():
         with gr.Column():
             pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
+            actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Natural Romanized)", interactive=False)
         with gr.Column():
             wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
             cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
+    gr.Markdown("### 📋 Enhanced Detailed Analysis")
     feedback_display = gr.HTML()
+def get_sentence_for_display(language_choice):
+    sentence, transliteration = get_random_sentence_with_transliteration(language_choice)
+    return sentence
+# Event handlers
+gen_btn.click(
+    fn=get_sentence_for_display,
+    inputs=[lang_choice],
+    outputs=[intended_display]
+)
+analyze_btn.click(
+    fn=compare_pronunciation,
+    inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
+    outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display]
+)
 if __name__ == "__main__":
     demo.launch()