Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,45 +1,35 @@
|
|
1 |
import gradio as gr
|
2 |
import random
|
3 |
import difflib
|
|
|
|
|
|
|
|
|
4 |
from faster_whisper import WhisperModel
|
5 |
from indic_transliteration import sanscript
|
6 |
from indic_transliteration.sanscript import transliterate
|
7 |
-
import
|
8 |
-
import jiwer
|
9 |
|
10 |
# ---------------- CONFIG ---------------- #
|
11 |
MODEL_NAME = "large-v2"
|
12 |
DEVICE = "cpu"
|
13 |
|
14 |
LANG_CODES = {
|
15 |
-
"English": "en",
|
16 |
-
"
|
17 |
-
"Malayalam": "ml",
|
18 |
-
"Hindi": "hi",
|
19 |
-
"Sanskrit": "sa"
|
20 |
}
|
21 |
|
22 |
LANG_PRIMERS = {
|
23 |
-
"English": (
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
"
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
"
|
32 |
-
|
33 |
-
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."
|
34 |
-
),
|
35 |
-
"Hindi": (
|
36 |
-
"प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
37 |
-
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"
|
38 |
-
),
|
39 |
-
"Sanskrit": (
|
40 |
-
"प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
41 |
-
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।"
|
42 |
-
)
|
43 |
}
|
44 |
|
45 |
SCRIPT_PATTERNS = {
|
@@ -51,72 +41,69 @@ SCRIPT_PATTERNS = {
|
|
51 |
}
|
52 |
|
53 |
SENTENCE_BANK = {
|
54 |
-
"English": [
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
"Hindi": [
|
70 |
-
"आज मौसम अच्छा है।",
|
71 |
-
"मुझे हिंदी बोलना पसंद है।",
|
72 |
-
"मैं किताब पढ़ रहा हूँ।"
|
73 |
-
],
|
74 |
-
"Sanskrit": [
|
75 |
-
"अहं ग्रन्थं पठामि।",
|
76 |
-
"अद्य सूर्यः तेजस्वी अस्ति।",
|
77 |
-
"मम नाम रामः।"
|
78 |
-
]
|
79 |
}
|
80 |
|
81 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
print("Loading Whisper model...")
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
# ---------------- HELPERS ---------------- #
|
|
|
|
|
|
|
86 |
def is_script(text, lang_name):
|
87 |
-
|
88 |
-
return bool(
|
89 |
|
90 |
def transliterate_to_hk(text, lang_choice):
|
91 |
mapping = {
|
92 |
-
"Tamil": sanscript.TAMIL,
|
93 |
-
"
|
94 |
-
"Hindi": sanscript.DEVANAGARI,
|
95 |
-
"Sanskrit": sanscript.DEVANAGARI,
|
96 |
"English": None
|
97 |
}
|
98 |
return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
|
99 |
|
100 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
|
101 |
-
segments, _ =
|
102 |
-
audio_path,
|
103 |
-
|
104 |
-
|
105 |
-
initial_prompt=initial_prompt,
|
106 |
-
beam_size=beam_size,
|
107 |
-
temperature=temperature,
|
108 |
-
condition_on_previous_text=condition_on_previous_text,
|
109 |
word_timestamps=False
|
110 |
)
|
111 |
return "".join(s.text for s in segments).strip()
|
112 |
|
113 |
-
def get_random_sentence(language_choice):
|
114 |
-
return random.choice(SENTENCE_BANK[language_choice])
|
115 |
-
|
116 |
def highlight_differences(ref, hyp):
|
117 |
-
|
118 |
-
ref_words = ref.strip().split()
|
119 |
-
hyp_words = hyp.strip().split()
|
120 |
sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
|
121 |
out_html = []
|
122 |
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
@@ -131,57 +118,73 @@ def highlight_differences(ref, hyp):
|
|
131 |
out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
|
132 |
return " ".join(out_html)
|
133 |
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
if audio is None or not intended_sentence.strip():
|
137 |
-
return "No audio or intended sentence
|
138 |
|
139 |
lang_code = LANG_CODES[language_choice]
|
140 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
141 |
|
142 |
-
# Pass 1
|
143 |
-
actual_text = transcribe_once(
|
144 |
-
|
145 |
-
lang_code=lang_code,
|
146 |
-
initial_prompt=primer_weak,
|
147 |
-
beam_size=pass1_beam,
|
148 |
-
temperature=pass1_temp,
|
149 |
-
condition_on_previous_text=pass1_condition
|
150 |
-
)
|
151 |
|
152 |
-
# Pass 2 (fixed
|
153 |
strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
|
154 |
-
corrected_text = transcribe_once(
|
155 |
-
|
156 |
-
lang_code=lang_code,
|
157 |
-
initial_prompt=strict_prompt,
|
158 |
-
beam_size=5,
|
159 |
-
temperature=0.0,
|
160 |
-
condition_on_previous_text=False
|
161 |
-
)
|
162 |
|
163 |
# Scores
|
164 |
wer_val = jiwer.wer(intended_sentence, actual_text)
|
165 |
cer_val = jiwer.cer(intended_sentence, actual_text)
|
166 |
|
167 |
-
#
|
168 |
-
hk_translit = transliterate_to_hk(actual_text, language_choice)
|
|
|
|
|
169 |
|
170 |
-
# Highlighted diff
|
171 |
diff_html = highlight_differences(intended_sentence, actual_text)
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
-
return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html
|
174 |
|
175 |
# ---------------- UI ---------------- #
|
176 |
with gr.Blocks() as demo:
|
177 |
-
gr.Markdown("
|
178 |
-
"Generate
|
179 |
|
180 |
with gr.Row():
|
181 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
182 |
gen_btn = gr.Button("🎲 Generate Sentence")
|
183 |
|
184 |
-
intended_display = gr.Textbox(label="Generated Sentence (Read
|
185 |
|
186 |
with gr.Row():
|
187 |
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
@@ -195,10 +198,15 @@ with gr.Blocks() as demo:
|
|
195 |
hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
|
196 |
|
197 |
with gr.Row():
|
198 |
-
wer_out = gr.Textbox(label="Word Error Rate
|
199 |
-
cer_out = gr.Textbox(label="Character Error Rate
|
200 |
|
201 |
-
diff_html_box = gr.HTML(label="Differences Highlighted")
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
204 |
|
@@ -206,7 +214,11 @@ with gr.Blocks() as demo:
|
|
206 |
submit_btn.click(
|
207 |
fn=compare_pronunciation,
|
208 |
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|
209 |
-
outputs=[
|
|
|
|
|
|
|
|
|
210 |
)
|
211 |
|
212 |
if __name__ == "__main__":
|
|
|
1 |
import gradio as gr
|
2 |
import random
|
3 |
import difflib
|
4 |
+
import re
|
5 |
+
import jiwer
|
6 |
+
import torch
|
7 |
+
import soundfile as sf
|
8 |
from faster_whisper import WhisperModel
|
9 |
from indic_transliteration import sanscript
|
10 |
from indic_transliteration.sanscript import transliterate
|
11 |
+
from transformers import AutoModelForTextToSpeech, AutoTokenizer, pipeline
|
|
|
12 |
|
13 |
# ---------------- CONFIG ---------------- #
|
14 |
MODEL_NAME = "large-v2"
|
15 |
DEVICE = "cpu"
|
16 |
|
17 |
LANG_CODES = {
|
18 |
+
"English": "en", "Tamil": "ta", "Malayalam": "ml",
|
19 |
+
"Hindi": "hi", "Sanskrit": "sa"
|
|
|
|
|
|
|
20 |
}
|
21 |
|
22 |
LANG_PRIMERS = {
|
23 |
+
"English": ("The transcript should be in English only.",
|
24 |
+
"Write only in English without translation. Example: This is an English sentence."),
|
25 |
+
"Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
|
26 |
+
"தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
|
27 |
+
"Malayalam": ("ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
|
28 |
+
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
|
29 |
+
"Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
30 |
+
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
|
31 |
+
"Sanskrit": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
32 |
+
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
}
|
34 |
|
35 |
SCRIPT_PATTERNS = {
|
|
|
41 |
}
|
42 |
|
43 |
SENTENCE_BANK = {
|
44 |
+
"English": ["The sun sets over the horizon.",
|
45 |
+
"Learning languages is fun.",
|
46 |
+
"I like to drink coffee in the morning."],
|
47 |
+
"Tamil": ["இன்று நல்ல வானிலை உள்ளது.",
|
48 |
+
"நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
|
49 |
+
"எனக்கு புத்தகம் படிக்க விருப்பம்."],
|
50 |
+
"Malayalam": ["എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
|
51 |
+
"ഇന്ന് മഴപെയ്യുന്നു.",
|
52 |
+
"ഞാൻ പുസ്തകം വായിക്കുന്നു."],
|
53 |
+
"Hindi": ["आज मौसम अच्छा है।",
|
54 |
+
"मुझे हिंदी बोलना पसंद है।",
|
55 |
+
"मैं किताब पढ़ रहा हूँ।"],
|
56 |
+
"Sanskrit": ["अहं ग्रन्थं पठामि।",
|
57 |
+
"अद्य सूर्यः तेजस्वी अस्ति।",
|
58 |
+
"मम नाम रामः।"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
}
|
60 |
|
61 |
+
# Voice/style mapping for IndicParler-TTS
|
62 |
+
VOICE_STYLE = {
|
63 |
+
"English": "An English female voice with a neutral Indian accent.",
|
64 |
+
"Tamil": "A female speaker with a clear Tamil accent.",
|
65 |
+
"Malayalam": "A female speaker with a clear Malayali accent.",
|
66 |
+
"Hindi": "A female speaker with a neutral Hindi accent.",
|
67 |
+
"Sanskrit": "A female speaker reading in classical Sanskrit style."
|
68 |
+
}
|
69 |
+
|
70 |
+
# ---------------- LOAD MODELS ---------------- #
|
71 |
print("Loading Whisper model...")
|
72 |
+
whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
|
73 |
+
|
74 |
+
print("Loading IndicParler-TTS...")
|
75 |
+
TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
|
76 |
+
tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_ID)
|
77 |
+
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
|
78 |
+
tts_pipe = pipeline("text-to-speech", model=tts_model, tokenizer=tts_tokenizer)
|
79 |
|
80 |
# ---------------- HELPERS ---------------- #
|
81 |
+
def get_random_sentence(language_choice):
    """Pick a random practice sentence for the chosen language.

    Parameters
    ----------
    language_choice : str
        A display-name key of SENTENCE_BANK (e.g. "Hindi", "Tamil").

    Returns
    -------
    str
        One sentence drawn uniformly at random from that language's bank.
    """
    sentences = SENTENCE_BANK[language_choice]
    return random.choice(sentences)
|
83 |
+
|
84 |
def is_script(text, lang_name):
    """Check whether *text* contains the script expected for *lang_name*.

    Looks up the compiled regex for the language in SCRIPT_PATTERNS and
    reports whether it matches anywhere in the text. Languages without a
    registered pattern (e.g. English) are accepted unconditionally.

    Parameters
    ----------
    text : str
        Candidate transcription text.
    lang_name : str
        Display-name key of SCRIPT_PATTERNS.

    Returns
    -------
    bool
        True when the script matches (or no pattern exists), else False.
    """
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if not pattern:
        return True
    return pattern.search(text) is not None
|
87 |
|
88 |
def transliterate_to_hk(text, lang_choice):
    """Transliterate *text* from its native script to Harvard-Kyoto.

    Parameters
    ----------
    text : str
        Text written in the native script of *lang_choice*.
    lang_choice : str
        Display name of the language ("English", "Tamil", "Malayalam",
        "Hindi", "Sanskrit").

    Returns
    -------
    str
        The Harvard-Kyoto transliteration, or *text* unchanged for
        English and for any language with no registered source script.
    """
    mapping = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None,
    }
    # Use .get() so an unrecognized language falls back to returning the
    # text untouched instead of raising KeyError (same path as English).
    src_script = mapping.get(lang_choice)
    return transliterate(text, src_script, sanscript.HK) if src_script else text
|
95 |
|
96 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run one faster-whisper transcription pass and return the joined text.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to transcribe.
    lang_code : str
        Whisper language code (e.g. "ml", "hi").
    initial_prompt : str
        Prompt used to prime the decoder.
    beam_size : int
        Beam width for decoding.
    temperature : float
        Sampling temperature.
    condition_on_previous_text : bool
        Whether the decoder conditions on previously generated text.

    Returns
    -------
    str
        Concatenated segment texts, stripped of surrounding whitespace.
    """
    segments, _info = whisper_model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        initial_prompt=initial_prompt,
        beam_size=beam_size,
        temperature=temperature,
        condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False,
    )
    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip()
|
104 |
|
|
|
|
|
|
|
105 |
def highlight_differences(ref, hyp):
|
106 |
+
ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
|
|
|
|
|
107 |
sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
|
108 |
out_html = []
|
109 |
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
|
|
118 |
out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
|
119 |
return " ".join(out_html)
|
120 |
|
121 |
+
def char_level_highlight(ref, hyp):
    """Build an HTML character diff of *hyp* against the reference *ref*.

    Matching reference characters are rendered green; reference characters
    that were replaced or missing are red with an underline; characters
    present only in the hypothesis are orange.

    Parameters
    ----------
    ref : str
        The intended (reference) sentence.
    hyp : str
        The transcribed (hypothesis) sentence.

    Returns
    -------
    str
        Concatenated ``<span>`` elements, one per highlighted character.
    """
    matcher = difflib.SequenceMatcher(None, list(ref), list(hyp))
    pieces = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            for ch in ref[i1:i2]:
                pieces.append(f"<span style='color:green'>{ch}</span>")
        elif tag == 'insert':
            # Characters only in hyp - show orange
            for ch in hyp[j1:j2]:
                pieces.append(f"<span style='color:orange'>{ch}</span>")
        else:  # 'replace' or 'delete': reference characters not spoken correctly
            for ch in ref[i1:i2]:
                pieces.append(f"<span style='color:red;text-decoration:underline'>{ch}</span>")
    return "".join(pieces)
|
134 |
+
|
135 |
+
def synthesize_tts(text, lang_choice):
    """Generate speech for *text* with the IndicParler-TTS pipeline.

    Parameters
    ----------
    text : str
        Text to speak; blank/whitespace-only input produces no audio.
    lang_choice : str
        Display name used to pick a voice description from VOICE_STYLE.

    Returns
    -------
    tuple[int, object] | None
        ``(sampling_rate, audio)`` as returned by the pipeline, suitable
        for a Gradio numpy Audio component, or None for empty input.
    """
    if not text.strip():
        return None
    description = VOICE_STYLE.get(lang_choice, "")
    result = tts_pipe(text, forward_params={"description": description})
    return (result["sampling_rate"], result["audio"])
|
141 |
+
|
142 |
+
# ---------------- MAIN ---------------- #
|
143 |
+
def compare_pronunciation(audio, language_choice, intended_sentence,
                          pass1_beam, pass1_temp, pass1_condition):
    """Score a learner's recording of *intended_sentence* and build feedback.

    Runs two Whisper passes (a lightly-primed "what was said" pass and a
    strongly target-biased pass), computes WER/CER, produces word- and
    character-level diff HTML, a Harvard-Kyoto transliteration of pass 1,
    and TTS renderings of both the target and the pass-1 output.

    Parameters
    ----------
    audio : str | None
        Filepath of the recorded audio, or None when nothing was recorded.
    language_choice : str
        Display-name key into LANG_CODES / LANG_PRIMERS.
    intended_sentence : str
        The sentence the learner was asked to read.
    pass1_beam : int
        Beam size for the first transcription pass.
    pass1_temp : float
        Temperature for the first transcription pass.
    pass1_condition : bool
        Whether pass 1 conditions on previous text.

    Returns
    -------
    tuple
        (actual_text, corrected_text, hk_translit, wer_str, cer_str,
         diff_html, tts_intended, tts_pass1, char_html, intended_sentence)
        — one value per UI output component.
    """
    # Guard: nothing to score without audio and a target sentence.
    if audio is None or not intended_sentence.strip():
        return "No audio or intended sentence.", "", "", "", "", "", None, None, "", ""

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1 - actual speech, decoded with only a weak language primer.
    actual_text = transcribe_once(
        audio, lang_code, primer_weak, pass1_beam, pass1_temp, pass1_condition
    )

    # Pass 2 - decoding biased toward the target sentence (fixed settings).
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
    corrected_text = transcribe_once(
        audio, lang_code, strict_prompt,
        beam_size=5, temperature=0.0, condition_on_previous_text=False
    )

    # Error-rate scores of pass 1 against the target.
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

    # Harvard-Kyoto transliteration of pass 1, only if the script matches.
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"

    # Word- and character-level highlighted diffs.
    diff_html = highlight_differences(intended_sentence, actual_text)
    char_html = char_level_highlight(intended_sentence, actual_text)

    # TTS for the intended sentence and for what was actually heard.
    tts_intended = synthesize_tts(intended_sentence, language_choice)
    tts_pass1 = synthesize_tts(actual_text, language_choice)

    return (actual_text, corrected_text, hk_translit,
            f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html,
            tts_intended, tts_pass1, char_html, intended_sentence)
|
177 |
|
178 |
# ---------------- UI ---------------- #
|
179 |
with gr.Blocks() as demo:
|
180 |
+
gr.Markdown("## 🎙 Pronunciation Comparator + IndicParler‑TTS + Error Highlighting\n"
|
181 |
+
"Generate sentence → Listen to TTS → Read aloud → See errors → Listen to your transcription")
|
182 |
|
183 |
with gr.Row():
|
184 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
185 |
gen_btn = gr.Button("🎲 Generate Sentence")
|
186 |
|
187 |
+
intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)
|
188 |
|
189 |
with gr.Row():
|
190 |
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
|
|
198 |
hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
|
199 |
|
200 |
with gr.Row():
|
201 |
+
wer_out = gr.Textbox(label="Word Error Rate")
|
202 |
+
cer_out = gr.Textbox(label="Character Error Rate")
|
203 |
|
204 |
+
diff_html_box = gr.HTML(label="Word Differences Highlighted")
|
205 |
+
char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
|
206 |
+
|
207 |
+
with gr.Row():
|
208 |
+
intended_tts_audio = gr.Audio(label="TTS - Intended Sentence", type="numpy")
|
209 |
+
pass1_tts_audio = gr.Audio(label="TTS - Pass1 Output", type="numpy")
|
210 |
|
211 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
212 |
|
|
|
214 |
submit_btn.click(
|
215 |
fn=compare_pronunciation,
|
216 |
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|
217 |
+
outputs=[
|
218 |
+
pass1_out, pass2_out, hk_out, wer_out, cer_out,
|
219 |
+
diff_html_box, intended_tts_audio, pass1_tts_audio,
|
220 |
+
char_html_box, intended_display
|
221 |
+
]
|
222 |
)
|
223 |
|
224 |
if __name__ == "__main__":
|