import gradio as gr
import random
from faster_whisper import WhisperModel
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import re
import jiwer # pip install jiwer
# ---------------- CONFIG ---------------- #
MODEL_NAME = "large-v2"
DEVICE = "cpu"
LANG_CODES = {
    "English": "en",
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa"
}
LANG_PRIMERS = {
    "English": (
        "The transcript should be in English only.",
        "Write only in English without translation. Example: This is an English sentence."
    ),
    "Tamil": (
        "நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
        "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."
    ),
    "Malayalam": (
        "ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
        "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."
    ),
    "Hindi": (
        "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
        "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"
    ),
    "Sanskrit": (
        "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
        "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।"
    )
}
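# Each language maps to a (weak primer, strong primer) pair written in the target script.
# Whisper's `initial_prompt` is fed to the decoder as preceding context, so a primer in
# the expected script nudges the model toward that script rather than a romanized or
# translated transcript. Pass 1 uses the weak primer; Pass 2 appends the intended
# sentence to the strong primer (see compare_pronunciation below).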
SCRIPT_PATTERNS = {
    "Tamil": re.compile(r"[\u0B80-\u0BFF]"),
    "Malayalam": re.compile(r"[\u0D00-\u0D7F]"),
    "Hindi": re.compile(r"[\u0900-\u097F]"),
    "Sanskrit": re.compile(r"[\u0900-\u097F]"),
    "English": re.compile(r"[A-Za-z]")
}
# Example sentence bank for random generation
SENTENCE_BANK = {
    "English": [
        "The sun sets over the horizon.",
        "Learning languages is fun.",
        "I like to drink coffee in the morning."
    ],
    "Tamil": [
        "இன்று நல்ல வானிலை உள்ளது.",
        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
        "எனக்கு புத்தகம் படிக்க விருப்பம்."
    ],
    "Malayalam": [
        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
        "ഇന്ന് മഴപെയ്യുന്നു.",
        "ഞാൻ പുസ്തകം വായിക്കുന്നു."
    ],
    "Hindi": [
        "आज मौसम अच्छा है।",
        "मुझे हिंदी बोलना पसंद है।",
        "मैं किताब पढ़ रहा हूँ।"
    ],
    "Sanskrit": [
        "अहं ग्रन्थं पठामि।",
        "अद्य सूर्यः तेजस्वी अस्ति।",
        "मम नाम रामः।"
    ]
}
# ---------------- MODEL ---------------- #
print("Loading Whisper model...")
model = WhisperModel(MODEL_NAME, device=DEVICE)
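# Optional tweak (assumption, not in the original): on CPU, faster-whisper also accepts a
# quantized compute type, which is typically much faster for large-v2, e.g.:
#   model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type="int8")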
# ---------------- HELPERS ---------------- #
def is_script(text, lang_name):
    """Return True if `text` contains at least one character of the expected script."""
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if not pattern:
        return True
    return bool(pattern.search(text))
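# Illustrative example: is_script("இது தமிழ்", "Tamil") -> True, because the string
# contains code points in the Tamil Unicode block (U+0B80-U+0BFF) matched above.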
def transliterate_to_hk(text, lang_choice):
    """Transliterate Indic-script text to the Harvard-Kyoto (HK) romanization scheme."""
    mapping = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None
    }
    if mapping[lang_choice]:
        return transliterate(text, mapping[lang_choice], sanscript.HK)
    else:
        return text
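# Illustrative example: transliterate("नमस्ते", sanscript.DEVANAGARI, sanscript.HK)
# returns "namaste"; for "English" the text is returned unchanged.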
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run a single faster-whisper pass and return the concatenated transcript text."""
    segments, info = model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        initial_prompt=initial_prompt,
        beam_size=beam_size,
        temperature=temperature,
        condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False
    )
    return "".join(s.text for s in segments).strip()
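# Decoding knobs used by both passes: a larger beam_size explores more hypotheses
# (slower, usually more accurate) and temperature > 0 adds sampling randomness.
# Pass 1 fixes these values; Pass 2 exposes them as sliders in the UI.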
def get_random_sentence(language_choice):
    return random.choice(SENTENCE_BANK[language_choice])
# ---------------- MAIN PIPELINE ---------------- #
def compare_pronunciation(audio, language_choice, intended_sentence, pass2_beam, pass2_temp, pass2_condition):
    """Transcribe the recording twice (unbiased and target-biased) and score it against the intended sentence."""
    if audio is None or not intended_sentence.strip():
        return "No audio or intended sentence provided.", "", "", "", ""
    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]
    # Pass 1: transcribe what was actually said, without biasing toward the intended sentence
    actual_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=primer_weak,
        beam_size=8,
        temperature=0.4,
        condition_on_previous_text=True
    )
    # Pass 2: transcribe again with the intended sentence in the prompt (target-biased output)
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
    corrected_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=strict_prompt,
        beam_size=pass2_beam,
        temperature=pass2_temp,
        condition_on_previous_text=pass2_condition
    )
    # Error rates of Pass 1 against the intended sentence
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)
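    # Illustrative example: jiwer.wer("this is a test", "this is test") == 0.25
    # (one deleted word out of four reference words); jiwer.cer scores at the character level.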
    # Transliteration (only if Pass 1 came back in the expected script)
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"
    return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}"
# ---------------- UI ---------------- #
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Pronunciation Comparator with Random Sentence\nClick 'Generate Sentence', read it aloud, and compare actual vs intended output.")
    with gr.Row():
        lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
        gen_btn = gr.Button("🎲 Generate Sentence")
    intended_display = gr.Textbox(label="Generated Sentence (Read this aloud)", interactive=False)
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        pass2_beam = gr.Slider(1, 10, value=5, step=1, label="Pass 2 Beam Size")
        pass2_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Pass 2 Temperature")
        pass2_condition = gr.Checkbox(value=False, label="Pass 2: Condition on previous text")
    with gr.Row():
        pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
        pass2_out = gr.Textbox(label="Pass 2: Target-Biased Output")
        hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
    with gr.Row():
        wer_out = gr.Textbox(label="Word Error Rate vs Intended")
        cer_out = gr.Textbox(label="Character Error Rate vs Intended")
    gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
    submit_btn = gr.Button("Analyze Pronunciation")
    submit_btn.click(
        fn=compare_pronunciation,
        inputs=[audio_input, lang_choice, intended_display, pass2_beam, pass2_temp, pass2_condition],
        outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out]
    )
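# Note (assumption, not in the original): when running locally rather than on Spaces,
# demo.launch(share=True) would also expose a temporary public URL.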
if __name__ == "__main__":
    demo.launch()