Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
import gradio as gr
|
2 |
import random
|
|
|
3 |
from faster_whisper import WhisperModel
|
4 |
from indic_transliteration import sanscript
|
5 |
from indic_transliteration.sanscript import transliterate
|
6 |
import re
|
7 |
-
import jiwer
|
8 |
|
9 |
# ---------------- CONFIG ---------------- #
|
10 |
MODEL_NAME = "large-v2"
|
@@ -49,7 +50,6 @@ SCRIPT_PATTERNS = {
|
|
49 |
"English": re.compile(r"[A-Za-z]")
|
50 |
}
|
51 |
|
52 |
-
# Example sentence bank for random generation
|
53 |
SENTENCE_BANK = {
|
54 |
"English": [
|
55 |
"The sun sets over the horizon.",
|
@@ -85,9 +85,7 @@ model = WhisperModel(MODEL_NAME, device=DEVICE)
|
|
85 |
# ---------------- HELPERS ---------------- #
|
86 |
def is_script(text, lang_name):
|
87 |
pattern = SCRIPT_PATTERNS.get(lang_name)
|
88 |
-
if
|
89 |
-
return True
|
90 |
-
return bool(pattern.search(text))
|
91 |
|
92 |
def transliterate_to_hk(text, lang_choice):
|
93 |
mapping = {
|
@@ -97,13 +95,10 @@ def transliterate_to_hk(text, lang_choice):
|
|
97 |
"Sanskrit": sanscript.DEVANAGARI,
|
98 |
"English": None
|
99 |
}
|
100 |
-
if mapping[lang_choice]
|
101 |
-
return transliterate(text, mapping[lang_choice], sanscript.HK)
|
102 |
-
else:
|
103 |
-
return text
|
104 |
|
105 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
|
106 |
-
segments,
|
107 |
audio_path,
|
108 |
language=lang_code,
|
109 |
task="transcribe",
|
@@ -118,50 +113,69 @@ def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperatur
|
|
118 |
def get_random_sentence(language_choice):
|
119 |
return random.choice(SENTENCE_BANK[language_choice])
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
# ---------------- MAIN PIPELINE ---------------- #
|
122 |
-
def compare_pronunciation(audio, language_choice, intended_sentence,
|
123 |
if audio is None or not intended_sentence.strip():
|
124 |
-
return "No audio or intended sentence provided.", "", "", "", ""
|
125 |
|
126 |
lang_code = LANG_CODES[language_choice]
|
127 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
128 |
|
129 |
-
# Pass 1
|
130 |
actual_text = transcribe_once(
|
131 |
audio_path=audio,
|
132 |
lang_code=lang_code,
|
133 |
initial_prompt=primer_weak,
|
134 |
-
beam_size=
|
135 |
-
temperature=
|
136 |
-
condition_on_previous_text=
|
137 |
)
|
138 |
|
139 |
-
# Pass 2
|
140 |
strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
|
141 |
corrected_text = transcribe_once(
|
142 |
audio_path=audio,
|
143 |
lang_code=lang_code,
|
144 |
initial_prompt=strict_prompt,
|
145 |
-
beam_size=
|
146 |
-
temperature=
|
147 |
-
condition_on_previous_text=
|
148 |
)
|
149 |
|
150 |
-
#
|
151 |
wer_val = jiwer.wer(intended_sentence, actual_text)
|
152 |
cer_val = jiwer.cer(intended_sentence, actual_text)
|
153 |
|
154 |
-
#
|
155 |
-
if is_script(actual_text, language_choice):
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
|
160 |
-
return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}"
|
161 |
|
162 |
# ---------------- UI ---------------- #
|
163 |
with gr.Blocks() as demo:
|
164 |
-
gr.Markdown("#
|
|
|
165 |
|
166 |
with gr.Row():
|
167 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
@@ -171,9 +185,9 @@ with gr.Blocks() as demo:
|
|
171 |
|
172 |
with gr.Row():
|
173 |
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
|
178 |
with gr.Row():
|
179 |
pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
|
@@ -184,14 +198,15 @@ with gr.Blocks() as demo:
|
|
184 |
wer_out = gr.Textbox(label="Word Error Rate vs Intended")
|
185 |
cer_out = gr.Textbox(label="Character Error Rate vs Intended")
|
186 |
|
|
|
|
|
187 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
188 |
|
189 |
submit_btn = gr.Button("Analyze Pronunciation")
|
190 |
-
|
191 |
submit_btn.click(
|
192 |
fn=compare_pronunciation,
|
193 |
-
inputs=[audio_input, lang_choice, intended_display,
|
194 |
-
outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out]
|
195 |
)
|
196 |
|
197 |
if __name__ == "__main__":
|
|
|
1 |
import gradio as gr
|
2 |
import random
|
3 |
+
import difflib
|
4 |
from faster_whisper import WhisperModel
|
5 |
from indic_transliteration import sanscript
|
6 |
from indic_transliteration.sanscript import transliterate
|
7 |
import re
|
8 |
+
import jiwer
|
9 |
|
10 |
# ---------------- CONFIG ---------------- #
|
11 |
MODEL_NAME = "large-v2"
|
|
|
50 |
"English": re.compile(r"[A-Za-z]")
|
51 |
}
|
52 |
|
|
|
53 |
SENTENCE_BANK = {
|
54 |
"English": [
|
55 |
"The sun sets over the horizon.",
|
|
|
85 |
# ---------------- HELPERS ---------------- #
|
86 |
def is_script(text, lang_name):
|
87 |
pattern = SCRIPT_PATTERNS.get(lang_name)
|
88 |
+
return bool(pattern.search(text)) if pattern else True
|
|
|
|
|
89 |
|
90 |
def transliterate_to_hk(text, lang_choice):
|
91 |
mapping = {
|
|
|
95 |
"Sanskrit": sanscript.DEVANAGARI,
|
96 |
"English": None
|
97 |
}
|
98 |
+
return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
|
|
|
|
|
|
|
99 |
|
100 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
|
101 |
+
segments, _ = model.transcribe(
|
102 |
audio_path,
|
103 |
language=lang_code,
|
104 |
task="transcribe",
|
|
|
113 |
def get_random_sentence(language_choice):
    """Return one randomly chosen practice sentence for *language_choice* from SENTENCE_BANK."""
    sentences = SENTENCE_BANK[language_choice]
    return random.choice(sentences)
|
115 |
|
116 |
+
def highlight_differences(ref, hyp):
    """Render a word-level diff of *hyp* against *ref* as an HTML string.

    Words are wrapped in <span> tags: green for matching words, red for
    reference words that were substituted, red strikethrough for reference
    words the hypothesis dropped, and orange for words the hypothesis added
    or used as substitutes.
    """
    def _wrap(words, css):
        # One colored <span> per word.
        return [f"<span style='{css}'>{word}</span>" for word in words]

    expected = ref.strip().split()
    observed = hyp.strip().split()
    matcher = difflib.SequenceMatcher(None, expected, observed)

    pieces = []
    for op, r0, r1, h0, h1 in matcher.get_opcodes():
        if op == 'equal':
            pieces += _wrap(expected[r0:r1], "color:green")
        elif op == 'replace':
            pieces += _wrap(expected[r0:r1], "color:red")
            pieces += _wrap(observed[h0:h1], "color:orange")
        elif op == 'delete':
            pieces += _wrap(expected[r0:r1], "color:red;text-decoration:line-through")
        elif op == 'insert':
            pieces += _wrap(observed[h0:h1], "color:orange")
    return " ".join(pieces)
|
133 |
+
|
134 |
# ---------------- MAIN PIPELINE ---------------- #
|
135 |
+
def compare_pronunciation(audio, language_choice, intended_sentence, pass1_beam, pass1_temp, pass1_condition):
    """Transcribe *audio* twice and score it against *intended_sentence*.

    Pass 1 decodes with the caller-supplied beam/temperature/conditioning
    settings and a weak language primer; pass 2 re-decodes with fixed
    settings and a strict prompt that embeds the target sentence.

    Returns a 6-tuple: (pass-1 transcript, pass-2 transcript,
    Harvard-Kyoto transliteration or a script-mismatch note,
    WER string, CER string, HTML word-level diff).
    """
    if audio is None or not intended_sentence.strip():
        # Nothing to analyze — keep the output arity the UI wiring expects.
        return "No audio or intended sentence provided.", "", "", "", "", ""

    code = LANG_CODES[language_choice]
    weak_primer, strong_primer = LANG_PRIMERS[language_choice]

    # First decode: exploratory, driven by the user's slider/checkbox settings.
    actual_text = transcribe_once(
        audio_path=audio,
        lang_code=code,
        initial_prompt=weak_primer,
        beam_size=pass1_beam,
        temperature=pass1_temp,
        condition_on_previous_text=pass1_condition
    )

    # Second decode: deterministic settings plus a prompt naming the target.
    corrected_text = transcribe_once(
        audio_path=audio,
        lang_code=code,
        initial_prompt=f"{strong_primer}\nTarget: {intended_sentence}",
        beam_size=5,
        temperature=0.0,
        condition_on_previous_text=False
    )

    # Error rates of the exploratory pass against the intended sentence.
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

    # Transliterate only when the transcript is in the expected script.
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"

    diff_html = highlight_differences(intended_sentence, actual_text)

    return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html
|
174 |
|
175 |
# ---------------- UI ---------------- #
|
176 |
with gr.Blocks() as demo:
|
177 |
+
gr.Markdown("# 🎙 Pronunciation Comparator with Random Sentence & Word Highlighting\n"
|
178 |
+
"Generate a sentence, read it aloud, and see exactly which words differ from the target.")
|
179 |
|
180 |
with gr.Row():
|
181 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
|
|
185 |
|
186 |
with gr.Row():
|
187 |
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
188 |
+
pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
|
189 |
+
pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
|
190 |
+
pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")
|
191 |
|
192 |
with gr.Row():
|
193 |
pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
|
|
|
198 |
wer_out = gr.Textbox(label="Word Error Rate vs Intended")
|
199 |
cer_out = gr.Textbox(label="Character Error Rate vs Intended")
|
200 |
|
201 |
+
diff_html_box = gr.HTML(label="Differences Highlighted")
|
202 |
+
|
203 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
204 |
|
205 |
submit_btn = gr.Button("Analyze Pronunciation")
|
|
|
206 |
submit_btn.click(
|
207 |
fn=compare_pronunciation,
|
208 |
+
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|
209 |
+
outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out, diff_html_box]
|
210 |
)
|
211 |
|
212 |
if __name__ == "__main__":
|