# Hugging Face Space (page status: Running on Zero)
import gradio as gr | |
import random | |
from faster_whisper import WhisperModel | |
from indic_transliteration import sanscript | |
from indic_transliteration.sanscript import transliterate | |
import re | |
import jiwer # pip install jiwer | |
# ---------------- CONFIG ---------------- # | |
# Checkpoint name and device handed to WhisperModel when the model is loaded.
MODEL_NAME = "large-v2"
DEVICE = "cpu"

# UI language name -> language code passed to the transcriber.
LANG_CODES = dict(
    English="en",
    Tamil="ta",
    Malayalam="ml",
    Hindi="hi",
    Sanskrit="sa",
)
# Hindi and Sanskrit are both written in Devanagari and share the weak primer.
_DEVANAGARI_WEAK_PRIMER = "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।"

# Language name -> (weak primer, strong primer).
# The weak primer is used on its own as the initial prompt of pass 1; the
# strong primer is combined with the target sentence for pass 2.
LANG_PRIMERS = {
    "English": (
        "The transcript should be in English only.",
        "Write only in English without translation. Example: This is an English sentence.",
    ),
    "Tamil": (
        "நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
        "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்.",
    ),
    "Malayalam": (
        "ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
        "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം.",
    ),
    "Hindi": (
        _DEVANAGARI_WEAK_PRIMER,
        "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।",
    ),
    "Sanskrit": (
        _DEVANAGARI_WEAK_PRIMER,
        "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।",
    ),
}
# Language name -> character class matching one character of its script.
_SCRIPT_CHAR_CLASSES = (
    ("Tamil", r"[\u0B80-\u0BFF]"),
    ("Malayalam", r"[\u0D00-\u0D7F]"),
    ("Hindi", r"[\u0900-\u097F]"),
    ("Sanskrit", r"[\u0900-\u097F]"),
    ("English", r"[A-Za-z]"),
)

# Compiled once at import; a single match anywhere in the text is enough for
# the script check that uses these patterns.
SCRIPT_PATTERNS = {
    language: re.compile(char_class)
    for language, char_class in _SCRIPT_CHAR_CLASSES
}
# Example sentence bank for random generation | |
# Language name -> practice sentences; one is picked at random for the user.
SENTENCE_BANK = {
    "English": [
        "The sun sets over the horizon.",
        "Learning languages is fun.",
        "I like to drink coffee in the morning.",
    ],
    "Tamil": [
        "இன்று நல்ல வானிலை உள்ளது.",
        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
        "எனக்கு புத்தகம் படிக்க விருப்பம்.",
    ],
    "Malayalam": [
        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
        "ഇന്ന് മഴപെയ്യുന്നു.",
        "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
    ],
    "Hindi": [
        "आज मौसम अच्छा है।",
        "मुझे हिंदी बोलना पसंद है।",
        "मैं किताब पढ़ रहा हूँ।",
    ],
    "Sanskrit": [
        "अहं ग्रन्थं पठामि।",
        "अद्य सूर्यः तेजस्वी अस्ति।",
        "मम नाम रामः।",
    ],
}
# ---------------- MODEL ---------------- # | |
# The model is created once at import time; transcribe_once() reuses this
# module-level instance for every request.
print("Loading Whisper model...")
model = WhisperModel(MODEL_NAME, device=DEVICE)
# ---------------- HELPERS ---------------- # | |
def is_script(text, lang_name):
    """Report whether *text* contains at least one character of the language's script.

    Languages without an entry in SCRIPT_PATTERNS are treated as matching,
    so the check never blocks a language it knows nothing about.
    """
    script_re = SCRIPT_PATTERNS.get(lang_name)
    if script_re:
        return script_re.search(text) is not None
    return True
def transliterate_to_hk(text, lang_choice):
    """Transliterate *text* from the language's native script to Harvard-Kyoto.

    Args:
        text: Text written in the script associated with *lang_choice*.
        lang_choice: UI language name (e.g. "Tamil", "Hindi").

    Returns:
        The Harvard-Kyoto transliteration, or *text* unchanged when the
        language has no source scheme (English, or any language that is not
        listed here).
    """
    source_schemes = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None,
    }
    # .get() rather than indexing: is_script() lets unknown languages through,
    # so an unlisted language must pass through here too instead of raising
    # KeyError.
    source_scheme = source_schemes.get(lang_choice)
    if not source_scheme:
        return text
    return transliterate(text, source_scheme, sanscript.HK)
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run one transcription pass over *audio_path* and return the joined text.

    All decoding options are forwarded to the module-level ``model``; the
    task is always "transcribe" and word timestamps are disabled. The text
    of every returned segment is concatenated without a separator and the
    result is stripped of surrounding whitespace.
    """
    decode_options = {
        "language": lang_code,
        "task": "transcribe",
        "initial_prompt": initial_prompt,
        "beam_size": beam_size,
        "temperature": temperature,
        "condition_on_previous_text": condition_on_previous_text,
        "word_timestamps": False,
    }
    segments, _info = model.transcribe(audio_path, **decode_options)
    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip()
def get_random_sentence(language_choice):
    """Pick a random practice sentence for *language_choice* from SENTENCE_BANK."""
    candidates = SENTENCE_BANK[language_choice]
    return random.choice(candidates)
# ---------------- MAIN PIPELINE ---------------- # | |
def compare_pronunciation(audio, language_choice, intended_sentence, pass2_beam, pass2_temp, pass2_condition):
    """Transcribe a recording twice and score it against the intended sentence.

    Pass 1 transcribes with only the weak language primer, so the result
    reflects what was actually said. Pass 2 adds the strong primer plus the
    intended sentence to the prompt, biasing the output toward the target.
    Error rates and the transliteration are computed from pass 1.

    Args:
        audio: Path of the audio file to transcribe, or None.
        language_choice: UI language name; must be a key of LANG_CODES and
            LANG_PRIMERS.
        intended_sentence: Sentence the speaker was asked to read.
        pass2_beam: Beam size for pass 2.
        pass2_temp: Sampling temperature for pass 2.
        pass2_condition: Whether pass 2 conditions on previous text.

    Returns:
        A 5-tuple of strings: (pass 1 text, pass 2 text, Harvard-Kyoto
        transliteration of pass 1 or a script-mismatch notice, word error
        rate, character error rate). When the audio or the sentence is
        missing, the first element is an explanatory message and the rest
        are empty strings.

    Raises:
        KeyError: If *language_choice* is not a configured language.
    """
    # Normalise once: a None sentence must hit the "nothing provided" path
    # rather than raise AttributeError on .strip(), and the same cleaned
    # text then feeds both the pass 2 prompt and the scoring.
    target = (intended_sentence or "").strip()
    if audio is None or not target:
        return "No audio or intended sentence provided.", "", "", "", ""

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: actual speech (prompt carries no hint of the intended sentence).
    actual_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=primer_weak,
        beam_size=8,
        temperature=0.4,
        condition_on_previous_text=True,
    )

    # Pass 2: target-biased output.
    strict_prompt = f"{primer_strong}\nTarget: {target}"
    corrected_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=strict_prompt,
        beam_size=pass2_beam,
        temperature=pass2_temp,
        condition_on_previous_text=pass2_condition,
    )

    # Error rates: intended sentence is the reference, pass 1 the hypothesis.
    wer_val = jiwer.wer(target, actual_text)
    cer_val = jiwer.cer(target, actual_text)

    # Only transliterate when pass 1 actually came back in the expected script.
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"

    return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}"
# ---------------- UI ---------------- # | |
# NOTE(review): the original indentation was lost, so the grouping of
# components into rows below is a reconstruction — confirm against the
# intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🎙️ Pronunciation Comparator with Random Sentence\n"
        "Click 'Generate Sentence', read it aloud, and compare actual vs intended output."
    )

    # Language selection and sentence generation.
    with gr.Row():
        lang_choice = gr.Dropdown(
            choices=list(LANG_CODES),
            value="Malayalam",
            label="Language",
        )
        gen_btn = gr.Button("🎲 Generate Sentence")

    # Read-only: the sentence comes from the generator button only.
    intended_display = gr.Textbox(
        label="Generated Sentence (Read this aloud)",
        interactive=False,
    )

    # Recording plus the decoding knobs for the target-biased second pass.
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        pass2_beam = gr.Slider(
            minimum=1, maximum=10, value=5, step=1, label="Pass 2 Beam Size"
        )
        pass2_temp = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Pass 2 Temperature"
        )
        pass2_condition = gr.Checkbox(
            value=False, label="Pass 2: Condition on previous text"
        )

    # Transcription results.
    with gr.Row():
        pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
        pass2_out = gr.Textbox(label="Pass 2: Target-Biased Output")
        hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")

    # Scores.
    with gr.Row():
        wer_out = gr.Textbox(label="Word Error Rate vs Intended")
        cer_out = gr.Textbox(label="Character Error Rate vs Intended")

    gen_btn.click(
        fn=get_random_sentence,
        inputs=[lang_choice],
        outputs=[intended_display],
    )

    submit_btn = gr.Button("Analyze Pronunciation")
    submit_btn.click(
        fn=compare_pronunciation,
        inputs=[
            audio_input,
            lang_choice,
            intended_display,
            pass2_beam,
            pass2_temp,
            pass2_condition,
        ],
        outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out],
    )

if __name__ == "__main__":
    demo.launch()