Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Running on Zero

App Files Files Community

sudhanm committed 12 days ago

Commit

bc807f8

verified ·

1 Parent(s): 73253af

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -15

app.py CHANGED Viewed

@@ -3,12 +3,10 @@ import random
 import difflib
 import re
 import jiwer
-import torch
-import soundfile as sf
 from faster_whisper import WhisperModel
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
-from transformers import AutoModelForTextToSpeech, AutoTokenizer, pipeline
 # ---------------- CONFIG ---------------- #
 MODEL_NAME = "large-v2"
@@ -58,7 +56,6 @@ SENTENCE_BANK = {
                  "मम नाम रामः।"]
 }
-# Voice/style mapping for IndicParler-TTS
 VOICE_STYLE = {
     "English": "An English female voice with a neutral Indian accent.",
     "Tamil": "A female speaker with a clear Tamil accent.",
@@ -71,11 +68,9 @@ VOICE_STYLE = {
 print("Loading Whisper model...")
 whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
-print("Loading IndicParler-TTS...")
 TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
-tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_ID)
-tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
-tts_pipe = pipeline("text-to-speech", model=tts_model, tokenizer=tts_tokenizer)
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
@@ -119,7 +114,6 @@ def highlight_differences(ref, hyp):
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
-    # Highlight correct in green, incorrect in red underline
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
@@ -128,7 +122,6 @@ def char_level_highlight(ref, hyp):
         elif tag in ('replace', 'delete'):
             out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
         elif tag == 'insert':
-            # Characters only in hyp - show orange
             out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
     return "".join(out)
@@ -148,11 +141,11 @@ def compare_pronunciation(audio, language_choice, intended_sentence,
     lang_code = LANG_CODES[language_choice]
     primer_weak, primer_strong = LANG_PRIMERS[language_choice]
-    # Pass 1 - actual speech
     actual_text = transcribe_once(audio, lang_code, primer_weak,
                                   pass1_beam, pass1_temp, pass1_condition)
-    # Pass 2 - target biased (fixed)
     strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
     corrected_text = transcribe_once(audio, lang_code, strict_prompt,
                                      beam_size=5, temperature=0.0, condition_on_previous_text=False)
@@ -161,11 +154,12 @@ def compare_pronunciation(audio, language_choice, intended_sentence,
     wer_val = jiwer.wer(intended_sentence, actual_text)
     cer_val = jiwer.cer(intended_sentence, actual_text)
-    # Transliteration - pass1
     hk_translit = transliterate_to_hk(actual_text, language_choice) \
                   if is_script(actual_text, language_choice) \
                   else f"[Script mismatch: expected {language_choice}]"
     diff_html = highlight_differences(intended_sentence, actual_text)
     char_html = char_level_highlight(intended_sentence, actual_text)
@@ -177,8 +171,7 @@ def compare_pronunciation(audio, language_choice, intended_sentence,
 # ---------------- UI ---------------- #
 with gr.Blocks() as demo:
-    gr.Markdown("## 🎙 Pronunciation Comparator + IndicParler‑TTS + Error Highlighting\n"
-                "Generate sentence → Listen to TTS → Read aloud → See errors → Listen to your transcription")
     with gr.Row():
         lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")

 import difflib
 import re
 import jiwer
 from faster_whisper import WhisperModel
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
+from transformers import pipeline  # only pipeline is needed for TTS
 # ---------------- CONFIG ---------------- #
 MODEL_NAME = "large-v2"
                  "मम नाम रामः।"]
 }
 VOICE_STYLE = {
     "English": "An English female voice with a neutral Indian accent.",
     "Tamil": "A female speaker with a clear Tamil accent.",
 print("Loading Whisper model...")
 whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
+print("Loading IndicParler-TTS via pipeline...")
 TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
+tts_pipe = pipeline("text-to-speech", model=TTS_MODEL_ID)
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return " ".join(out_html)
 def char_level_highlight(ref, hyp):
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         elif tag in ('replace', 'delete'):
             out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
         elif tag == 'insert':
             out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
     return "".join(out)
     lang_code = LANG_CODES[language_choice]
     primer_weak, primer_strong = LANG_PRIMERS[language_choice]
+    # Pass 1
     actual_text = transcribe_once(audio, lang_code, primer_weak,
                                   pass1_beam, pass1_temp, pass1_condition)
+    # Pass 2 (fixed)
     strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
     corrected_text = transcribe_once(audio, lang_code, strict_prompt,
                                      beam_size=5, temperature=0.0, condition_on_previous_text=False)
     wer_val = jiwer.wer(intended_sentence, actual_text)
     cer_val = jiwer.cer(intended_sentence, actual_text)
+    # Transliteration
     hk_translit = transliterate_to_hk(actual_text, language_choice) \
                   if is_script(actual_text, language_choice) \
                   else f"[Script mismatch: expected {language_choice}]"
+    # Highlights
     diff_html = highlight_differences(intended_sentence, actual_text)
     char_html = char_level_highlight(intended_sentence, actual_text)
 # ---------------- UI ---------------- #
 with gr.Blocks() as demo:
+    gr.Markdown("## 🎙 Pronunciation Comparator + IndicParler‑TTS + Error Highlighting")
     with gr.Row():
         lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")