Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Sleeping

File size: 11,802 Bytes

import gradio as gr
import random
import difflib
import re
import jiwer
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from faster_whisper import WhisperModel
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import soundfile as sf

# ---------------- CONFIG ---------------- #
MODEL_NAME = "large-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

LANG_CODES = {
    "English": "en",
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa"
}

LANG_PRIMERS = {
    "English": ("The transcript should be in English only.",
                "Write only in English without translation. Example: This is an English sentence."),
    "Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
              "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
    "Malayalam": ("ട്രാൻസ്ഖ്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
                  "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
    "Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
              "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
    "Sanskrit": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
                 "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।")
}

SCRIPT_PATTERNS = {
    "Tamil": re.compile(r"[஀-௿]"),
    "Malayalam": re.compile(r"[ഀ-ൿ]"),
    "Hindi": re.compile(r"[ऀ-ॿ]"),
    "Sanskrit": re.compile(r"[ऀ-ॿ]"),
    "English": re.compile(r"[A-Za-z]")
}

SENTENCE_BANK = {
    "English": [
        "The sun sets over the horizon.",
        "Learning languages is fun.",
        "I like to drink coffee in the morning."
    ],
    "Tamil": [
        "இன்று நல்ல வானிலை உள்ளது.",
        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
        "எனக்கு புத்தகம் படிக்க விருப்பம்."
    ],
    "Malayalam": [
        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
        "ഇന്ന് മഴപെയ്യുന്നു.",
        "ഞാൻ പുസ്തകം വായിക്കുന്നു."
    ],
    "Hindi": [
        "आज मौसम अच्छा है।",
        "मुझे हिंदी बोलना पसंद है।",
        "मैं किताब पढ़ रहा हूँ।"
    ],
    "Sanskrit": [
        "अहं ग्रन्थं पठामि।",
        "अद्य सूर्यः तेजस्वी अस्ति।",
        "मम नाम रामः।"
    ]
}

VOICE_STYLE = {
    "English": "An English female voice with a neutral Indian accent.",
    "Tamil": "A female speaker with a clear Tamil accent.",
    "Malayalam": "A female speaker with a clear Malayali accent.",
    "Hindi": "A female speaker with a neutral Hindi accent.",
    "Sanskrit": "A female speaker reading in classical Sanskrit style."
}

# ---------------- LOAD MODELS ---------------- #
print("Loading Whisper model...")
whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)

print("Loading Parler-TTS model...")
parler_model_id = "parler-tts/parler-tts-mini-v1"  # You may switch to larger models if desired
parler_tts_model = ParlerTTSForConditionalGeneration.from_pretrained(parler_model_id).to(DEVICE)
parler_tts_tokenizer = AutoTokenizer.from_pretrained(parler_model_id)

# ---------------- HELPERS ---------------- #
def get_random_sentence(language_choice):
    return random.choice(SENTENCE_BANK[language_choice])

def is_script(text, lang_name):
    pattern = SCRIPT_PATTERNS.get(lang_name)
    return bool(pattern.search(text)) if pattern else True

def transliterate_to_hk(text, lang_choice):
    mapping = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None
    }
    return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text

def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    segments, _ = whisper_model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        initial_prompt=initial_prompt,
        beam_size=beam_size,
        temperature=temperature,
        condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False
    )
    return "".join(s.text for s in segments).strip()

def highlight_differences(ref, hyp):
    ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    out_html = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            out_html.extend([f"<span style='color:green'>{w}</span>" for w in ref_words[i1:i2]])
        elif tag == 'replace':
            out_html.extend([f"<span style='color:red'>{w}</span>" for w in ref_words[i1:i2]])
            out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
        elif tag == 'delete':
            out_html.extend([f"<span style='color:red;text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
        elif tag == 'insert':
            out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
    return " ".join(out_html)

def char_level_highlight(ref, hyp):
    sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
    out = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
        elif tag in ('replace', 'delete'):
            out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
        elif tag == 'insert':
            out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
    return "".join(out)

def synthesize_tts(text, lang_choice):
    if not text.strip():
        return None
    description = VOICE_STYLE.get(lang_choice, "")
    description_input = parler_tts_tokenizer(description, return_tensors='pt').to(DEVICE)
    prompt_input = parler_tts_tokenizer(text, return_tensors='pt').to(DEVICE)
    generation = parler_tts_model.generate(
        input_ids=description_input.input_ids,
        attention_mask=description_input.attention_mask,
        prompt_input_ids=prompt_input.input_ids,
        prompt_attention_mask=prompt_input.attention_mask
    )
    audio_arr = generation.cpu().numpy().squeeze()
    # Parler-TTS default sample rate is 24000
    return 24000, audio_arr

# ---------------- MAIN ---------------- #
def compare_pronunciation(audio, language_choice, intended_sentence,
                          pass1_beam, pass1_temp, pass1_condition):
    if audio is None or not intended_sentence.strip():
        return ("No audio or intended sentence.", "", "", "", "", "",
                None, None, "", "")

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: raw transcription with user-configured decoding parameters
    actual_text = transcribe_once(audio, lang_code, primer_weak,
                                  pass1_beam, pass1_temp, pass1_condition)

    # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
    corrected_text = transcribe_once(audio, lang_code, strict_prompt,
                                     beam_size=5, temperature=0.0, condition_on_previous_text=False)

    # Compute WER and CER
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

    # Transliteration of Pass 1 output
    hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"

    # Highlight word-level and character-level differences
    diff_html = highlight_differences(intended_sentence, actual_text)
    char_html = char_level_highlight(intended_sentence, actual_text)

    # Synthesized TTS audios for intended and Pass 1 text
    tts_intended = synthesize_tts(intended_sentence, language_choice)
    tts_pass1 = synthesize_tts(actual_text, language_choice)

    return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
            diff_html, tts_intended, tts_pass1, char_html, intended_sentence)

# ---------------- UI ---------------- #
with gr.Blocks() as demo:
    gr.Markdown("## 🎙 Pronunciation Comparator + Parler-TTS + Highlights")

    with gr.Row():
        lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
        gen_btn = gr.Button("🎲 Generate Sentence")

    intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
        pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
        pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")

    with gr.Row():
        pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
        pass2_out = gr.Textbox(label="Pass 2: Target-Biased Output")
        hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")

    with gr.Row():
        wer_out = gr.Textbox(label="Word Error Rate")
        cer_out = gr.Textbox(label="Character Error Rate")

    diff_html_box = gr.HTML(label="Word Differences Highlighted")
    char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")

    with gr.Row():
        intended_tts_audio = gr.Audio(label="TTS - Intended Sentence", type="numpy")
        pass1_tts_audio = gr.Audio(label="TTS - Pass1 Output", type="numpy")

    gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])

    submit_btn = gr.Button("Analyze Pronunciation")

    submit_btn.click(
        fn=compare_pronunciation,
        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
        outputs=[
            pass1_out, pass2_out, hk_out, wer_out, cer_out,
            diff_html_box, intended_tts_audio, pass1_tts_audio,
            char_html_box, intended_display
        ]
    )

if __name__ == "__main__":
    demo.launch()