Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Running on Zero

App Files Community

sudhanm committed on 4 days ago

Commit

be6893d

verified ·

1 Parent(s): 59775e3

Update app.py

Browse files

Files changed (1) hide show

app.py +215 -698

app.py CHANGED Viewed

@@ -1,20 +1,20 @@
 import gradio as gr
 import random
 import difflib
-import re
-import unicodedata
 import jiwer
 import torch
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
-from indic_transliteration import sanscript
-from indic_transliteration.sanscript import transliterate
 import spaces
 import gc
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Updated model configurations for each language
 MODEL_CONFIGS = {
     "English": "openai/whisper-large-v2",
     "Tamil": "vasista22/whisper-tamil-large-v2",
@@ -27,847 +27,364 @@ LANG_CODES = {
     "Malayalam": "ml"
 }
-LANG_PRIMERS = {
-    "English": ("The transcript should be in English only.",
-                "Write only in English without translation. Example: This is an English sentence."),
-    "Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
-              "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
-    "Malayalam": ("ട്രാൻസ്ഖ്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
-                  "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം.")
-}
-SCRIPT_PATTERNS = {
-    "Tamil": re.compile(r"[஀-௿]"),
-    "Malayalam": re.compile(r"[ഀ-ൿ]"),
-    "English": re.compile(r"[A-Za-z]")
-}
 SENTENCE_BANK = {
     "English": [
         "The sun sets over the horizon.",
         "Learning languages is fun.",
         "I like to drink coffee in the morning.",
         "Technology helps us communicate better.",
-        "Reading books expands our knowledge.",
-        "Music brings people together.",
-        "Exercise keeps us healthy and strong.",
-        "Cooking is both art and science."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
-        "நான் தினமும் பள்ளிக்கு செல்கிறேன்.",
-        "எனக்கு இசை கேட்க மிகவும் பிடிக்கும்.",
-        "அன்னை தமிழ் எங்கள் தாய்மொழி.",
-        "நல்ல உணவு உடல் நலத்திற்கு அவசியம்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
-        "ഞാൻ മലയാളം പഠിക്കുന്നു.",
-        "സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്.",
-        "നല്ല ആരോഗ്യം വളരെ പ്രധാനമാണ്.",
-        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ അത്യാവശ്യമാണ്."
     ]
 }
-# ---------------- IMPROVED TRANSLITERATION SYSTEM ---------------- #
-def transliterate_to_natural_roman(text, lang_choice):
-    """
-    Generalizable transliteration to natural romanization (Thanglish/Manglish)
-    using systematic phonetic rules instead of manual dictionaries
-    """
-    if not text or not text.strip():
-        return ""
-    if lang_choice == "English":
-        return text
-    try:
-        # Step 1: Convert to ISO 15919 (more systematic than IAST)
-        if lang_choice == "Tamil":
-            iso_text = transliterate(text, sanscript.TAMIL, sanscript.ISO)
-        elif lang_choice == "Malayalam":
-            iso_text = transliterate(text, sanscript.MALAYALAM, sanscript.ISO)
-        else:
-            return text
-        # Step 2: Apply systematic phonetic conversion
-        romanized = apply_systematic_phonetic_rules(iso_text)
-        # Step 3: Apply language-specific natural patterns
-        romanized = apply_natural_language_patterns(romanized, lang_choice)
-        # Step 4: Final phonetic cleanup and flow optimization
-        romanized = optimize_natural_flow(romanized)
-        return romanized if romanized else text
-    except Exception as e:
-        print(f"Transliteration error: {e}")
-        return text
-def apply_systematic_phonetic_rules(iso_text):
-    """
-    Apply systematic phonetic rules based on linguistic principles
-    rather than manual character mappings
-    """
-    result = iso_text
-    # === VOWEL SYSTEM ===
-    # Long vowels -> natural doubling (how native speakers type)
-    vowel_rules = [
-        (r'ā', 'aa'),   # long a
-        (r'ī', 'ii'),   # long i
-        (r'ū', 'uu'),   # long u
-        (r'ē', 'ee'),   # long e (some prefer 'e', but 'ee' is clearer)
-        (r'ō', 'oo'),   # long o (some prefer 'o', but 'oo' is clearer)
-        (r'ai', 'ai'),  # diphthong ai
-        (r'au', 'au'),  # diphthong au
-        (r'r̥', 'ru'),   # vocalic r
-        (r'r̥̄', 'ruu'), # long vocalic r
-        (r'l̥', 'lu'),   # vocalic l
-        (r'l̥̄', 'luu'), # long vocalic l
-    ]
-    # === CONSONANT SYSTEM ===
-    # Systematic consonant conversion based on phonetic properties
-    consonant_rules = [
-        # Nasals - context-sensitive
-        (r'ṅ', 'ng'),    # velar nasal
-        (r'ñ', 'nj'),    # palatal nasal (natural in South Indian typing)
-        (r'ṇ', 'n'),     # retroflex nasal -> dental (natural simplification)
-        (r'n̆', 'n'),     # any other nasal variants
-        # Stops - systematic by place of articulation
-        (r'([kg])h', r'\1h'),  # keep aspirated velars
-        (r'([cj])h', r'\1h'),  # keep aspirated palatals
-        (r'([ṭḍ])h', r'th'),   # retroflex aspirated -> dental aspirated (natural)
-        (r'([td])h', r'\1h'),  # keep dental aspirated
-        (r'([pb])h', r'\1h'),  # keep labial aspirated
-        # Retroflex simplification (how native speakers naturally type)
-        (r'ṭ', 't'),     # retroflex t -> dental t
-        (r'ḍ', 'd'),     # retroflex d -> dental d
-        (r'ṇ', 'n'),     # retroflex n -> dental n (already covered above)
-        # Liquids and approximants
-        (r'ṟ', 'r'),     # Tamil/Malayalam retroflex r -> simple r
-        (r'ṛ', 'r'),     # any other retroflex r -> simple r
-        (r'ḷ', 'l'),     # retroflex l -> simple l (except for special cases)
-        (r'ḻ', 'zh'),    # Tamil/Malayalam special l -> zh (important!)
-        # Sibilants - systematic
-        (r'ś', 'sh'),    # palatal sibilant
-        (r'ṣ', 'sh'),    # retroflex sibilant
-        (r's', 's'),     # dental sibilant (unchanged)
-        # Fricatives and others
-        (r'ḥ', 'h'),     # visarga -> simple h
-        (r'ḫ', 'h'),     # any other h variants
-        (r'×', ''),      # multiplication sign sometimes appears
-        # Common combinations (compound consonants)
-        (r'kṣ', 'ksh'),  # kṣa combination
-        (r'jñ', 'gn'),   # jña combination (natural pronunciation)
-        (r'śr', 'shr'),  # śra combination
-    ]
-    # Apply vowel rules first
-    for pattern, replacement in vowel_rules:
-        result = re.sub(pattern, replacement, result)
-    # Apply consonant rules
-    for pattern, replacement in consonant_rules:
-        result = re.sub(pattern, replacement, result)
-    return result
-def apply_natural_language_patterns(text, lang_choice):
-    """
-    Apply language-specific patterns that reflect how native speakers
-    naturally romanize their languages
-    """
-    if lang_choice == "Tamil":
-        return apply_tamil_natural_patterns(text)
-    elif lang_choice == "Malayalam":
-        return apply_malayalam_natural_patterns(text)
-    return text
-def apply_tamil_natural_patterns(text):
-    """Tamil-specific natural romanization patterns"""
-    tamil_patterns = [
-        # Tamil-specific sounds
-        (r'ḻ', 'zh'),           # Tamil zh sound (crucial)
-        (r'ṟ', 'r'),            # Tamil r sound
-        # Natural doubling patterns in Tamil
-        (r'([kgcjṭḍtdpb])\1', r'\1\1'),  # Keep natural gemination
-        # Tamil word-final patterns
-        (r'um$', 'um'),         # Tamil suffix -um
-        (r'an$', 'an'),         # Tamil suffix -an
-        (r'al$', 'al'),         # Tamil suffix -al
-        # Natural vowel harmony adjustments
-        (r'([aeiou])u([mnlr])', r'\1\2u'),  # Vowel + u + liquid/nasal
-    ]
-    for pattern, replacement in tamil_patterns:
-        text = re.sub(pattern, replacement, text)
-    return text
-def apply_malayalam_natural_patterns(text):
-    """Malayalam-specific natural romanization patterns"""
-    malayalam_patterns = [
-        # Malayalam-specific sounds
-        (r'ḻ', 'zh'),           # Malayalam zh sound (very important!)
-        (r'ṟ', 'r'),            # Malayalam r sound
-        # Natural gemination in Malayalam
-        (r'([kgcjṭḍtdpb])\1', r'\1\1'),  # Keep natural gemination
-        # Malayalam word patterns
-        (r'aanu$', 'aanu'),     # Malayalam copula ending
-        (r'unnu$', 'unnu'),     # Malayalam verb ending
-        (r'aam$', 'aam'),       # Malayalam suffix
-        # Natural flow adjustments for Malayalam
-        (r'([aeiou])([mnlr])([aeiou])', r'\1\2\3'),  # Vowel-liquid-vowel unchanged
-        # Handle Malayalam specific consonant clusters
-        (r'ngh', 'ngh'),        # Keep ngh clusters
-        (r'mph', 'mph'),        # Keep mph clusters
-    ]
-    for pattern, replacement in malayalam_patterns:
-        text = re.sub(pattern, replacement, text)
-    return text
-def optimize_natural_flow(text):
-    """
-    Final optimization for natural reading flow -
-    how native speakers would actually type/read
-    """
-    # Remove any remaining diacritical marks using Unicode normalization
-    text = ''.join(c for c in unicodedata.normalize('NFD', text)
-                   if unicodedata.category(c) != 'Mn')
-    # Natural flow optimization rules
-    flow_rules = [
-        # Vowel optimization for readability
-        (r'([aeiou])\1{2,}', r'\1\1'),        # Max 2 repeated vowels
-        (r'aaa+', 'aa'),                       # Multiple a's -> aa
-        (r'iii+', 'ii'),                       # Multiple i's -> ii
-        (r'uuu+', 'uu'),                       # Multiple u's -> uu
-        (r'eee+', 'ee'),                       # Multiple e's -> ee
-        (r'ooo+', 'oo'),                       # Multiple o's -> oo
-        # Consonant cluster optimization
-        (r'([bcdfghjklmnpqrstvwxyz])\1{2,}', r'\1\1'),  # Max 2 repeated consonants
-        # Natural word boundaries and spacing
-        (r'\s+', ' '),                         # Normalize spaces
-        (r'^\s+|\s+$', ''),                    # Trim leading/trailing spaces
-        # Handle common awkward sequences
-        (r'([aeiou])h([aeiou])', r'\1\2'),     # Remove h between vowels if awkward
-        (r'([bcdfghjklmnpqrstvwxyz])y([bcdfghjklmnpqrstvwxyz])', r'\1i\2'),  # y->i in consonant clusters
-        # Ensure readability of common endings
-        (r'([mnlr])u$', r'\1u'),               # Keep natural endings
-        (r'([kgt])u$', r'\1u'),                # Keep natural endings
-    ]
-    for pattern, replacement in flow_rules:
-        text = re.sub(pattern, replacement, text)
-    return text
-def enhanced_phonetic_similarity_check(intended_roman, actual_roman):
-    """
-    Enhanced similarity check that accounts for natural variations
-    in how people might romanize the same sounds
-    """
-    # Define phonetically equivalent mappings
-    phonetic_equivalents = {
-        'aa': ['a', 'aa'],
-        'ii': ['i', 'ii'],
-        'uu': ['u', 'uu'],
-        'ee': ['e', 'ee'],
-        'oo': ['o', 'oo'],
-        'zh': ['zh', 'z', 'l'],  # Common variations for zh sound
-        'sh': ['sh', 's'],       # sh vs s variations
-        'ch': ['ch', 'c'],       # ch vs c variations
-        'th': ['th', 't'],       # th vs t variations
-        'dh': ['dh', 'd'],       # dh vs d variations
-        'ksh': ['ksh', 'ksh', 'ks'],  # ksh variations
-        'gn': ['gn', 'ny', 'nj'],     # gn/ny/nj variations
-    }
-    # Normalize both strings for comparison
-    intended_normalized = normalize_for_comparison(intended_roman, phonetic_equivalents)
-    actual_normalized = normalize_for_comparison(actual_roman, phonetic_equivalents)
-    return intended_normalized, actual_normalized
-def normalize_for_comparison(text, equivalents):
-    """Normalize text for phonetic comparison"""
-    text = text.lower().strip()
-    # Replace equivalents with canonical forms
-    for canonical, variants in equivalents.items():
-        for variant in variants:
-            text = text.replace(variant, canonical)
-    return text
-# ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
-# Store only currently loaded model to save memory
-current_model = {"language": None, "model": None, "processor": None}
-def load_model_for_language(language_choice):
-    """Load model on-demand and clear previous model from memory"""
-    global current_model
-    # If same language is already loaded, return current model
-    if current_model["language"] == language_choice and current_model["model"] is not None:
-        return current_model["model"], current_model["processor"]
-    # Clear previous model from memory
-    if current_model["model"] is not None:
-        del current_model["model"]
-        del current_model["processor"]
         gc.collect()
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
     # Load new model
     model_id = MODEL_CONFIGS[language_choice]
-    print(f"Loading {language_choice} model: {model_id}")
     try:
         model = WhisperForConditionalGeneration.from_pretrained(
-            model_id,
-            torch_dtype=torch.float32
         ).to(DEVICE)
         processor = WhisperProcessor.from_pretrained(model_id)
-        current_model = {
             "language": language_choice,
             "model": model,
             "processor": processor
         }
-        print(f"✓ {language_choice} model loaded successfully")
         return model, processor
     except Exception as e:
-        print(f"✗ Error loading {language_choice} model: {e}")
-        # Fallback to base whisper model
-        print(f"Falling back to openai/whisper-base for {language_choice}")
         model = WhisperForConditionalGeneration.from_pretrained(
-            "openai/whisper-base",
-            torch_dtype=torch.float32
         ).to(DEVICE)
         processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-        current_model = {
             "language": language_choice,
             "model": model,
             "processor": processor
         }
         return model, processor
-# ---------------- HELPERS ---------------- #
-def get_random_sentence(language_choice):
-    return random.choice(SENTENCE_BANK[language_choice])
-def get_random_sentence_with_transliteration(language_choice):
-    sentence = random.choice(SENTENCE_BANK[language_choice])
-    if language_choice in ["Tamil", "Malayalam"]:
-        # Use the new improved transliteration system
-        transliteration = transliterate_to_natural_roman(sentence, language_choice)
-        # Combine sentence with transliteration in the same box
-        combined_sentence = f"{sentence}\n\n🔤 {transliteration}"
-        return combined_sentence, transliteration
-    else:
-        return sentence, ""
-def is_script(text, lang_name):
-    pattern = SCRIPT_PATTERNS.get(lang_name)
-    return bool(pattern.search(text)) if pattern else True
-def transliterate_to_hk(text, lang_choice):
-    """Improved transliteration with better handling"""
-    if not text or not text.strip():
-        return ""
-    mapping = {
-        "Tamil": sanscript.TAMIL,
-        "Malayalam": sanscript.MALAYALAM,
-        "English": None
-    }
-    if mapping[lang_choice] is None:
-        return text  # Return as-is for English
     try:
-        # Clean the text and transliterate
-        cleaned_text = text.strip()
-        transliterated = transliterate(cleaned_text, mapping[lang_choice], sanscript.HK)
-        return transliterated if transliterated else text
     except Exception as e:
-        print(f"Transliteration error: {e}")
         return text
-# Updated function that uses the new transliteration system
-def transliterate_to_simple_roman(text, lang_choice):
-    """
-    IMPROVED VERSION: Natural transliteration using systematic phonetic rules
-    """
-    return transliterate_to_natural_roman(text, lang_choice)
 @spaces.GPU
-def transcribe_once(audio_path, language_choice, beam_size, temperature):
-    # Get the appropriate model and processor for the language
-    model, processor = load_model_for_language(language_choice)
     lang_code = LANG_CODES[language_choice]
-    # Load and process audio
     import librosa
     audio, sr = librosa.load(audio_path, sr=16000)
-    # Process audio with the specific model's processor
     input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
-    # Ensure input tensor matches model dtype
-    model_dtype = next(model.parameters()).dtype
-    input_features = input_features.to(device=DEVICE, dtype=model_dtype)
-    # Generate transcription with fallback for different model capabilities
     with torch.no_grad():
         try:
-            # Try with forced decoder ids first (standard Whisper models)
             forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
             predicted_ids = model.generate(
                 input_features,
                 forced_decoder_ids=forced_decoder_ids,
                 max_length=448,
-                num_beams=beam_size,
-                temperature=temperature if temperature > 0 else None,
-                do_sample=temperature > 0,
             )
-        except (TypeError, ValueError) as e:
-            # Fallback for models that don't support forced_decoder_ids (like some fine-tuned models)
-            print(f"Fallback generation for {language_choice}: {e}")
             predicted_ids = model.generate(
                 input_features,
                 max_length=448,
-                num_beams=beam_size,
-                temperature=temperature if temperature > 0 else None,
-                do_sample=temperature > 0,
             )
-    # Decode the transcription
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription.strip()
-def normalize_word(word):
-    """Remove punctuation and normalize word for comparison"""
-    import string
-    # Remove punctuation and whitespace
-    return word.strip().translate(str.maketrans('', '', string.punctuation)).lower()
-def create_enhanced_tabular_feedback(intended, actual, lang_choice):
-    """
-    Enhanced feedback system with better phonetic comparison
-    """
-    # Get natural transliterations using the new system
-    intended_roman = transliterate_to_natural_roman(intended, lang_choice)
-    actual_roman = transliterate_to_natural_roman(actual, lang_choice)
-    # Split into words for comparison
     intended_words = intended.strip().split()
     actual_words = actual.strip().split()
-    intended_roman_words = intended_roman.strip().split()
-    actual_roman_words = actual_roman.strip().split()
-    # Calculate accuracy with phonetic awareness
-    correct_words = 0
-    total_words = len(intended_words)
-    # Create word-by-word comparison table
-    feedback_html = """
-    <div style='font-family: Arial, sans-serif; padding: 20px; margin: 10px 0;'>
-        <h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Enhanced Pronunciation Analysis</h3>
-    """
-    # Overview table with improved romanization
-    feedback_html += """
-    <div style='margin-bottom: 25px;'>
-        <h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison (Improved Natural Romanization)</h4>
-        <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
-            <thead>
-                <tr style='border-bottom: 2px solid #ddd;'>
-                    <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
-                    <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
-                    <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Natural Romanization</th>
-                </tr>
-            </thead>
-            <tbody>
-                <tr style='border-bottom: 1px solid #ddd;'>
-                    <td style='padding: 15px; font-weight: bold; color: #27ae60; border-right: 1px solid #ddd;'>🎯 Target</td>
-                    <td style='padding: 15px; font-family: monospace; font-size: 18px; border-right: 1px solid #ddd;'>{}</td>
-                    <td style='padding: 15px; font-family: monospace; font-size: 16px; color: #555;'>{}</td>
-                </tr>
-                <tr>
-                    <td style='padding: 15px; font-weight: bold; color: #e67e22; border-right: 1px solid #ddd;'>🗣️ You Said</td>
-                    <td style='padding: 15px; font-family: monospace; font-size: 18px; border-right: 1px solid #ddd;'>{}</td>
-                    <td style='padding: 15px; font-family: monospace; font-size: 16px; color: #555;'>{}</td>
-                </tr>
-            </tbody>
-        </table>
-    </div>
-    """.format(intended, intended_roman, actual, actual_roman)
-    # Enhanced word-by-word analysis with phonetic awareness
-    feedback_html += """
-    <div style='margin-bottom: 25px;'>
-        <h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Enhanced Word-by-Word Analysis</h4>
-        <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
-            <thead>
-                <tr style='border-bottom: 2px solid #ddd;'>
-                    <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>#</th>
-                    <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Expected Word</th>
-                    <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>What You Said</th>
-                    <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Phonetic Match</th>
-                    <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50;'>Result</th>
-                </tr>
-            </thead>
-            <tbody>
-    """
-    # Enhanced word comparison with phonetic similarity
     sm = difflib.SequenceMatcher(None, intended_words, actual_words)
-    word_index = 0
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == 'equal':
-            # Correct words
-            for idx, word in enumerate(intended_words[i1:i2]):
-                word_index += 1
-                correct_words += 1
-                roman_word = intended_roman_words[i1 + idx] if (i1 + idx) < len(intended_roman_words) else ""
-                actual_word = actual_words[j1 + idx] if (j1 + idx) < len(actual_words) else ""
-                actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
-                feedback_html += f"""
-                <tr style='border-bottom: 1px solid #eee;'>
-                    <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
-                    <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px;'>{word}</div>
-                        <div style='font-size: 13px; color: #888;'>({roman_word})</div>
-                    </td>
-                    <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #27ae60;'>{actual_word}</div>
-                        <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
-                    </td>
-                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
-                        <span style='color: #27ae60; font-weight: bold;'>Perfect</span>
-                    </td>
-                    <td style='padding: 12px; text-align: center;'>
-                        <span style='color: #27ae60; font-weight: bold; font-size: 20px;'>✓</span>
-                        <div style='font-size: 12px; color: #27ae60; margin-top: 2px;'>Exact</div>
-                    </td>
-                </tr>
-                """
-        elif tag == 'replace':
-            # Check for phonetic similarity in replacements
-            max_words = max(i2-i1, j2-j1)
-            for idx in range(max_words):
-                word_index += 1
-                expected_word = intended_words[i1 + idx] if (i1 + idx) < i2 else ""
-                expected_roman = intended_roman_words[i1 + idx] if (i1 + idx) < len(intended_roman_words) else ""
-                actual_word = actual_words[j1 + idx] if (j1 + idx) < j2 else ""
-                actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
-                # Check phonetic similarity
-                if expected_roman and actual_roman_word:
-                    norm_expected, norm_actual = enhanced_phonetic_similarity_check(expected_roman, actual_roman_word)
-                    similarity_ratio = difflib.SequenceMatcher(None, norm_expected, norm_actual).ratio()
-                    if similarity_ratio > 0.8:  # High phonetic similarity
-                        phonetic_match = "Very Close"
-                        phonetic_color = "#f39c12"
-                        result_icon = "≈"
-                        result_text = "Similar"
-                        correct_words += 0.8  # Partial credit
-                    elif similarity_ratio > 0.6:  # Moderate similarity
-                        phonetic_match = "Close"
-                        phonetic_color = "#e67e22"
-                        result_icon = "~"
-                        result_text = "Close"
-                        correct_words += 0.5  # Partial credit
-                    else:
-                        phonetic_match = "Different"
-                        phonetic_color = "#e74c3c"
-                        result_icon = "✗"
-                        result_text = "Different"
-                else:
-                    phonetic_match = "Different"
-                    phonetic_color = "#e74c3c"
-                    result_icon = "✗"
-                    result_text = "Different"
-                feedback_html += f"""
-                <tr style='border-bottom: 1px solid #eee;'>
-                    <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
-                    <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px;'>{expected_word}</div>
-                        <div style='font-size: 13px; color: #888;'>({expected_roman})</div>
-                    </td>
-                    <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: {phonetic_color};'>{actual_word}</div>
-                        <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
-                    </td>
-                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
-                        <span style='color: {phonetic_color}; font-weight: bold;'>{phonetic_match}</span>
-                    </td>
-                    <td style='padding: 12px; text-align: center;'>
-                        <span style='color: {phonetic_color}; font-weight: bold; font-size: 20px;'>{result_icon}</span>
-                        <div style='font-size: 12px; color: {phonetic_color}; margin-top: 2px;'>{result_text}</div>
-                    </td>
-                </tr>
-                """
-        elif tag == 'delete':
-            # Missing words
-            for idx, word in enumerate(intended_words[i1:i2]):
-                word_index += 1
-                roman_word = intended_roman_words[i1 + idx] if (i1 + idx) < len(intended_roman_words) else ""
-                feedback_html += f"""
-                <tr style='border-bottom: 1px solid #eee;'>
-                    <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
-                    <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px;'>{word}</div>
-                        <div style='font-size: 13px; color: #888;'>({roman_word})</div>
-                    </td>
-                    <td style='padding: 12px; color: #f39c12; font-style: italic; border-right: 1px solid #ddd;'>
-                        <em>Not spoken</em>
-                    </td>
-                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
-                        <span style='color: #f39c12; font-weight: bold;'>Missing</span>
-                    </td>
-                    <td style='padding: 12px; text-align: center;'>
-                        <span style='color: #f39c12; font-weight: bold; font-size: 20px;'>⚠</span>
-                        <div style='font-size: 12px; color: #f39c12; margin-top: 2px;'>Missing</div>
-                    </td>
-                </tr>
-                """
-        elif tag == 'insert':
-            # Extra words
-            for idx, word in enumerate(actual_words[j1:j2]):
-                actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
-                feedback_html += f"""
-                <tr style='border-bottom: 1px solid #eee;'>
-                    <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>+</td>
-                    <td style='padding: 12px; color: #9b59b6; font-style: italic; border-right: 1px solid #ddd;'>
-                        <em>Not expected</em>
-                    </td>
-                    <td style='padding: 12px; border-right: 1px solid #ddd;'>
-                        <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #9b59b6;'>{word}</div>
-                        <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
-                    </td>
-                    <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
-                        <span style='color: #9b59b6; font-weight: bold;'>Extra</span>
-                    </td>
-                    <td style='padding: 12px; text-align: center;'>
-                        <span style='color: #9b59b6; font-weight: bold; font-size: 20px;'>+</span>
-                        <div style='font-size: 12px; color: #9b59b6; margin-top: 2px;'>Extra</div>
-                    </td>
-                </tr>
-                """
-    feedback_html += """
-            </tbody>
         </table>
-    </div>
-    """
-    # Calculate enhanced accuracy
-    accuracy = (correct_words / total_words * 100) if total_words > 0 else 0
-    # Enhanced summary section
-    feedback_html += f"""
-    <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 25px; border-radius: 12px; text-align: center; margin-top: 20px;'>
-        <h4 style='margin: 0 0 20px 0; font-size: 24px;'>🎯 Enhanced Pronunciation Score</h4>
-        <div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 20px;'>
-            <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
-                <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{accuracy:.0f}%</div>
-                <div style='font-size: 16px; opacity: 0.9;'>Phonetic Accuracy</div>
-            </div>
-            <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
-                <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{correct_words:.1f}/{total_words}</div>
-                <div style='font-size: 16px; opacity: 0.9;'>Words Matched</div>
             </div>
         </div>
-        <div style='margin-top: 15px; font-size: 14px; opacity: 0.8;'>
-            ✨ Now with enhanced phonetic matching for better accuracy!
-        </div>
     """
-    # Enhanced motivational message
-    if accuracy >= 95:
-        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>🎉 Outstanding! Perfect natural pronunciation!</span></div>"
-    elif accuracy >= 85:
-        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>🌟 Excellent! Very natural sounding!</span></div>"
-    elif accuracy >= 70:
-        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>👍 Good job! Your pronunciation is improving!</span></div>"
-    elif accuracy >= 50:
-        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>📚 Getting there! Focus on the highlighted sounds!</span></div>"
-    else:
-        feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>💪 Keep practicing! Every attempt makes you better!</span></div>"
-    feedback_html += "</div></div>"
     return feedback_html, accuracy
-# ---------------- MAIN ---------------- #
 @spaces.GPU
-def compare_pronunciation(audio, lang_choice, intended_display_text, pass1_beam, pass1_temp):
-    if audio is None or not intended_display_text.strip():
-        return ("⚠️ Please record audio and generate a sentence first.", "", "", "", "")
     try:
-        # Extract just the original sentence (before the transliteration part)
-        if "🔤" in intended_display_text:
-            intended_sentence = intended_display_text.split("🔤")[0].strip()
         else:
-            intended_sentence = intended_display_text.strip()
-        # Single transcription pass with user settings
-        actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)
         if not actual_text.strip():
-            return ("⚠️ No speech detected. Please try recording again.", "", "", "", "")
-        # Compute metrics
         wer_val = jiwer.wer(intended_sentence, actual_text)
         cer_val = jiwer.cer(intended_sentence, actual_text)
-        # Get improved transliterations for both texts
-        intended_roman = transliterate_to_natural_roman(intended_sentence, lang_choice)
-        actual_roman = transliterate_to_natural_roman(actual_text, lang_choice)
-        # Create enhanced tabular feedback with phonetic awareness
-        feedback_html, accuracy = create_enhanced_tabular_feedback(intended_sentence, actual_text, lang_choice)
-        return (
-            actual_text,
-            actual_roman,
-            f"{wer_val:.1%}",
-            f"{cer_val:.1%}",
-            feedback_html
-        )
     except Exception as e:
-        error_msg = f"❌ Error during transcription: {str(e)}"
-        print(error_msg)
-        return (error_msg, "", "", "", "")
def get_sentence_for_display(language_choice):
    """Return only the sentence from a random (sentence, transliteration) pair."""
    picked, _romanized = get_random_sentence_with_transliteration(language_choice)
    return picked
 # ---------------- UI ---------------- #
with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
    # Intro banner with feature list and usage steps.
    gr.Markdown("""
    # 🎙️ AI Pronunciation Coach (Enhanced)
    ### Practice English, Tamil & Malayalam with AI feedback
    **New Features:**
    - ✨ **Natural Romanization**: Improved Thanglish/Manglish that looks like how you actually type
    - 🎯 **Phonetic Matching**: Gives partial credit for sounds that are close (zh/z/l variations)
    - 📊 **Enhanced Feedback**: More accurate scoring with linguistic awareness
    **How to use:**
    1. Select your language
    2. Generate a practice sentence
    3. Record yourself reading it aloud
    4. Get instant enhanced feedback on your pronunciation!
    """)

    # Language picker next to the sentence generator button.
    with gr.Row():
        with gr.Column(scale=2):
            lang_choice = gr.Dropdown(
                label="🌍 Choose Language",
                value="Malayalam",
                choices=list(LANG_CODES),
            )
        with gr.Column(scale=1):
            gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

    # Read-only box holding the sentence the learner should read aloud.
    intended_display = gr.Textbox(
        label="📝 Practice Sentence (Read this aloud)",
        placeholder="Click 'Generate Practice Sentence' to get started...",
        lines=3,
        interactive=False,
    )

    # Microphone recorder on the left, decoding knobs on the right.
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="🎤 Record Your Pronunciation",
                type="filepath",
                sources=["microphone"],
            )
        with gr.Column():
            gr.Markdown("### ⚙️ Advanced Settings")
            pass1_beam = gr.Slider(1, 10, value=5, step=1, label="Beam Size (accuracy vs speed)")
            pass1_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature (creativity)")

    analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")

    # Result boxes: transcript + romanization, then the two error rates.
    with gr.Row():
        with gr.Column():
            pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
            actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Natural Romanized)", interactive=False)
        with gr.Column():
            wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
            cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)

    gr.Markdown("### 📋 Enhanced Detailed Analysis")
    feedback_display = gr.HTML()

    # Event handlers
    gen_btn.click(fn=get_sentence_for_display, inputs=[lang_choice], outputs=[intended_display])
    analyze_btn.click(
        fn=compare_pronunciation,
        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
        outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display],
    )
 if __name__ == "__main__":

 import gradio as gr
 import random
 import difflib
 import jiwer
 import torch
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    AutoModelForCausalLM,
+    AutoTokenizer
+)
 import spaces
 import gc
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_CONFIGS = {
     "English": "openai/whisper-large-v2",
     "Tamil": "vasista22/whisper-tamil-large-v2",
     "Malayalam": "ml"
 }
 SENTENCE_BANK = {
     "English": [
         "The sun sets over the horizon.",
         "Learning languages is fun.",
         "I like to drink coffee in the morning.",
         "Technology helps us communicate better.",
+        "Reading books expands our knowledge."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
+        "அன்னை தமிழ் எங்கள் தாய்மொழி."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
+        "സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്."
     ]
 }
+# ---------------- MODELS ---------------- #
# Whisper model/processor currently held in memory, tagged with the language
# it was loaded for; all slots are None until the first load.
current_whisper_model = dict(language=None, model=None, processor=None)
# Qwen model/tokenizer pair; both slots are None until the first load.
qwen_model = dict(model=None, tokenizer=None)
def _load_whisper_checkpoint(checkpoint):
    """Load one Whisper checkpoint in float32 on DEVICE, with its processor."""
    model = WhisperForConditionalGeneration.from_pretrained(
        checkpoint, torch_dtype=torch.float32
    ).to(DEVICE)
    processor = WhisperProcessor.from_pretrained(checkpoint)
    return model, processor


def load_whisper_model(language_choice):
    """Return the Whisper model and processor for ``language_choice``.

    Only one Whisper model is kept in memory. A request for the language that
    is already loaded returns the cached pair; any other request evicts the
    cached model first and then loads the checkpoint named in
    ``MODEL_CONFIGS``. If that load fails, ``openai/whisper-base`` is loaded
    instead and cached under the requested language.

    Args:
        language_choice: Key into ``MODEL_CONFIGS``.

    Returns:
        ``(model, processor)``.

    Raises:
        KeyError: If ``language_choice`` is not in ``MODEL_CONFIGS``. The
            cached model is left untouched in that case.
        Exception: Whatever the fallback load raises if it fails as well.
    """
    global current_whisper_model

    if (
        current_whisper_model.get("language") == language_choice
        and current_whisper_model.get("model") is not None
    ):
        return current_whisper_model["model"], current_whisper_model.get("processor")

    # Resolve the checkpoint before evicting anything, so an unknown language
    # does not throw away a perfectly good cached model.
    model_id = MODEL_CONFIGS[language_choice]

    # Clear previous model. The state is rebound to a complete, empty record
    # (instead of deleting keys) so that a failed load below cannot leave the
    # cache without its "model"/"processor" keys.
    if current_whisper_model.get("model") is not None:
        current_whisper_model = {"language": None, "model": None, "processor": None}
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    # Load new model
    print(f"Loading Whisper model: {model_id}")
    try:
        model, processor = _load_whisper_checkpoint(model_id)
        print("✓ Whisper model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading Whisper model: {e}")
        # Fallback to base model
        model, processor = _load_whisper_checkpoint("openai/whisper-base")

    current_whisper_model = {
        "language": language_choice,
        "model": model,
        "processor": processor,
    }
    return model, processor
def load_qwen_model():
    """Return the cached Qwen2.5-1.5B-Instruct model and tokenizer, loading on first use.

    The model is loaded in float16 with ``device_map="auto"`` when ``DEVICE``
    is ``"cuda"``, otherwise in float32, and is switched to eval mode.

    Returns:
        ``(model, tokenizer)``, or ``(None, None)`` when loading raised; the
        failure is printed and nothing is cached, so a later call tries again.
    """
    global qwen_model

    cached = qwen_model["model"]
    if cached is not None:
        return cached, qwen_model["tokenizer"]

    try:
        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
        print(f"Loading Qwen model: {model_name}")

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        on_gpu = DEVICE == "cuda"
        load_options = {
            "trust_remote_code": True,
            "torch_dtype": torch.float16 if on_gpu else torch.float32,
            "device_map": "auto" if on_gpu else None,
        }
        model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
        if DEVICE == "cpu":
            model = model.to(DEVICE)
        model.eval()

        qwen_model = {"model": model, "tokenizer": tokenizer}
        print("✓ Qwen model loaded successfully")
        return model, tokenizer
    except Exception as exc:
        print(f"✗ Failed to load Qwen model: {exc}")
        return None, None
+# ---------------- TRANSLITERATION ---------------- #
def transliterate_with_qwen(text, source_lang):
    """Romanize Tamil or Malayalam text with the Qwen instruct model.

    Args:
        text: Text to romanize.
        source_lang: ``"English"``, ``"Tamil"`` or ``"Malayalam"``. Any value
            other than ``"English"``/``"Tamil"`` is prompted as Malayalam.

    Returns:
        The first line of the model's reply. ``text`` is returned unchanged
        when the language is English, the text is blank, the model could not
        be loaded, the reply is empty, or generation raised (the error is
        printed).
    """
    if source_lang == "English" or not text.strip():
        return text
    model, tokenizer = load_qwen_model()
    if model is None or tokenizer is None:
        return text  # Return original if model fails
    try:
        # Create prompts
        if source_lang == "Tamil":
            system_prompt = "Convert Tamil text to natural Thanglish (how Tamil people type on phones). Only output the romanized text."
            user_prompt = f"Tamil: {text}\nThanglish:"
        else:  # Malayalam
            system_prompt = "Convert Malayalam text to natural Manglish (how Malayalam people type on phones). Only output the romanized text."
            user_prompt = f"Malayalam: {text}\nManglish:"
        # Format for Qwen
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        inputs = inputs.to(DEVICE)
        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Extract response: drop the prompt by TOKEN count. The decoded text
        # has its special tokens stripped, so it is shorter than `prompt`, and
        # slicing it by len(prompt) characters would cut into (or completely
        # remove) the generated reply.
        prompt_token_count = inputs["input_ids"].shape[1]
        reply_ids = outputs[0][prompt_token_count:]
        response = tokenizer.decode(reply_ids, skip_special_tokens=True).strip()
        # Clean response
        response = response.split("\n")[0].strip()  # Take first line only
        return response if response else text
    except Exception as e:
        print(f"Qwen transliteration error: {e}")
        return text
+# ---------------- SPEECH RECOGNITION ---------------- #
 @spaces.GPU
+def transcribe_audio(audio_path, language_choice):
+    """Transcribe audio using Whisper"""
+    model, processor = load_whisper_model(language_choice)
     lang_code = LANG_CODES[language_choice]
+    # Load audio
     import librosa
     audio, sr = librosa.load(audio_path, sr=16000)
+    # Process audio
     input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+    input_features = input_features.to(DEVICE, dtype=next(model.parameters()).dtype)
+    # Generate transcription
     with torch.no_grad():
         try:
             forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
             predicted_ids = model.generate(
                 input_features,
                 forced_decoder_ids=forced_decoder_ids,
                 max_length=448,
+                num_beams=5,
+                temperature=0.0
             )
+        except:
             predicted_ids = model.generate(
                 input_features,
                 max_length=448,
+                num_beams=5,
+                temperature=0.0
             )
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription.strip()
+# ---------------- FEEDBACK SYSTEM ---------------- #
def create_feedback(intended, actual, lang_choice):
    """Build the HTML comparison card and a word-level similarity score.

    Args:
        intended: The sentence the learner was asked to read.
        actual: The transcription of what the learner said.
        lang_choice: Language name passed to ``transliterate_with_qwen`` for
            both texts.

    Returns:
        ``(feedback_html, accuracy)`` where ``accuracy`` is
        ``difflib.SequenceMatcher.ratio()`` over the whitespace-split words of
        the two texts, scaled to 0-100.
    """
    from html import escape  # local import keeps this block self-contained

    # Get transliterations
    intended_roman = transliterate_with_qwen(intended, lang_choice)
    actual_roman = transliterate_with_qwen(actual, lang_choice)
    # Calculate accuracy
    intended_words = intended.strip().split()
    actual_words = actual.strip().split()
    # Simple word-level accuracy
    sm = difflib.SequenceMatcher(None, intended_words, actual_words)
    accuracy = sm.ratio() * 100

    if accuracy >= 90:
        verdict = '🎉 Excellent!'
    elif accuracy >= 70:
        verdict = '👍 Good job!'
    else:
        verdict = '📚 Keep practicing!'

    # The transcript and both romanizations are model output, so every
    # dynamic value is HTML-escaped before being placed in the markup.
    feedback_html = f"""
    <div style='font-family: Arial, sans-serif; padding: 20px;'>
        <h3 style='color: #2c3e50; text-align: center;'>📊 Pronunciation Analysis</h3>
        <table style='width: 100%; border-collapse: collapse; margin: 20px 0;'>
            <tr style='background: #f8f9fa;'>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Target</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace;'>{escape(intended)}</td>
            </tr>
            <tr style='background: #f8f9fa;'>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Romanized</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace; color: #666;'>{escape(intended_roman)}</td>
            </tr>
            <tr>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>You Said</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace;'>{escape(actual)}</td>
            </tr>
            <tr>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Your Romanized</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace; color: #666;'>{escape(actual_roman)}</td>
            </tr>
        </table>
        <div style='text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px;'>
            <h4 style='margin: 0 0 10px 0;'>Accuracy Score</h4>
            <div style='font-size: 36px; font-weight: bold;'>{accuracy:.0f}%</div>
            <div style='margin-top: 10px;'>
                {verdict}
            </div>
        </div>
    </div>
    """
    return feedback_html, accuracy
+# ---------------- MAIN FUNCTION ---------------- #
 @spaces.GPU
+def analyze_pronunciation(audio, lang_choice, intended_text):
+    """Main function to analyze pronunciation"""
+    if audio is None or not intended_text.strip():
+        return "⚠️ Please record audio and generate a sentence first.", "", "", ""
     try:
+        # Extract original sentence (remove romanization if present)
+        if "🔤" in intended_text:
+            intended_sentence = intended_text.split("🔤")[0].strip()
         else:
+            intended_sentence = intended_text.strip()
+        # Transcribe audio
+        actual_text = transcribe_audio(audio, lang_choice)
         if not actual_text.strip():
+            return "⚠️ No speech detected. Please try recording again.", "", "", ""
+        # Calculate metrics
         wer_val = jiwer.wer(intended_sentence, actual_text)
         cer_val = jiwer.cer(intended_sentence, actual_text)
+        # Get romanizations
+        actual_roman = transliterate_with_qwen(actual_text, lang_choice)
+        # Create feedback
+        feedback_html, accuracy = create_feedback(intended_sentence, actual_text, lang_choice)
+        return actual_text, actual_roman, f"{wer_val:.1%}", feedback_html
     except Exception as e:
+        return f"❌ Error: {str(e)}", "", "", ""
+# ---------------- HELPERS ---------------- #
def get_random_sentence_with_transliteration(language_choice):
    """Pick a random practice sentence, adding a romanized line for Tamil/Malayalam.

    Returns the bare sentence for other languages. For Tamil and Malayalam the
    result is the sentence, a blank line, then "🔤 " followed by the output of
    ``transliterate_with_qwen``.
    """
    chosen = random.choice(SENTENCE_BANK[language_choice])
    if language_choice not in ("Tamil", "Malayalam"):
        return chosen
    romanized = transliterate_with_qwen(chosen, language_choice)
    return f"{chosen}\n\n🔤 {romanized}"
 # ---------------- UI ---------------- #
with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
    # Intro banner with feature list and usage steps.
    gr.Markdown("""
    # 🎙️ AI Pronunciation Coach
    ### Practice English, Tamil & Malayalam with AI feedback powered by Qwen2.5
    **Features:**
    - ✨ **Smart Transliteration**: Natural Thanglish/Manglish using Qwen2.5-1.5B-Instruct
    - 🎯 **Accurate Recognition**: Language-specific Whisper models
    - 📊 **Instant Feedback**: Real-time pronunciation analysis
    **How to use:**
    1. Select your language
    2. Generate a practice sentence
    3. Record yourself reading it aloud
    4. Get instant feedback!
    """)

    # Language picker next to the sentence generator button.
    with gr.Row():
        lang_choice = gr.Dropdown(
            label="🌍 Choose Language",
            value="Malayalam",
            choices=list(LANG_CODES),
        )
        gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

    # Read-only box holding the sentence (and its romanization, if any).
    intended_display = gr.Textbox(
        label="📝 Practice Sentence",
        placeholder="Click 'Generate Practice Sentence' to get started...",
        lines=3,
        interactive=False,
    )

    audio_input = gr.Audio(
        label="🎤 Record Your Pronunciation",
        type="filepath",
        sources=["microphone"],
    )
    analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")

    # Result boxes: transcript, its romanization, and the word error rate.
    with gr.Row():
        actual_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
        actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
        wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
    feedback_display = gr.HTML()

    # Event handlers
    gen_btn.click(
        fn=get_random_sentence_with_transliteration,
        inputs=[lang_choice],
        outputs=[intended_display],
    )
    analyze_btn.click(
        fn=analyze_pronunciation,
        inputs=[audio_input, lang_choice, intended_display],
        outputs=[actual_out, actual_roman_out, wer_out, feedback_display],
    )
 if __name__ == "__main__":