# Captured from a Hugging Face Spaces page (Space status at capture time: Sleeping).
# app.py
# HF Space: Whisper large-v2 (CPU) with strict script enforcement + optional English transliteration
# Languages: Tamil, Malayalam, English, Hindi, Sanskrit
import re | |
import gradio as gr | |
from faster_whisper import WhisperModel | |
from indic_transliteration import sanscript | |
from indic_transliteration.sanscript import transliterate | |
# -----------------------------
# Model: load once on CPU
# -----------------------------
# A single shared model instance is created at import time and reused by
# every request. Per the original author's note, large-v2 was picked for
# multilingual accuracy and int8 to keep CPU memory/latency reasonable on
# the HF Spaces free CPU tier.
MODEL_NAME = "large-v2"
model = WhisperModel(
    MODEL_NAME,
    device="cpu",
    compute_type="int8",
)
# -----------------------------
# Language config
# -----------------------------
# UI label -> language code passed to the model's `language` argument.
LANG_CODES = {
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa",
    "English": "en",
}

# Dropdown entries, in the same order as the mapping above.
LANG_CHOICES = list(LANG_CODES)

# Single-character detectors for each script, based on the basic Unicode
# block of that script (extended blocks are not covered).
RE_TAMIL = re.compile(r"[\u0B80-\u0BFF]")  # Tamil block
RE_MALAYALAM = re.compile(r"[\u0D00-\u0D7F]")  # Malayalam block
RE_DEVANAGARI = re.compile(r"[\u0900-\u097F]")  # Devanagari block (Hindi/Sanskrit)
RE_LATIN = re.compile(r"[A-Za-z]")  # ASCII letters only

# Primers: short sentences written in the target script and passed as the
# decoder's initial prompt. The "weak" one is used on the first pass, the
# "strong" one on the retry when the script check fails.
MALAYALAM_PRIMER_WEAK = "ഇത് മലയാളം ലിപിയിലാണ്."
MALAYALAM_PRIMER_STRONG = "ദയവായി എല്ലാ വാചകങ്ങളും മലയാളം ലിപിയിൽ മാത്രം എഴുതുക."

TAMIL_PRIMER_WEAK = "இது தமிழ் எழுத்தாகும்."
TAMIL_PRIMER_STRONG = "தயவுசெய்து அனைத்து வாக்கியங்களையும் தமிழ் எழுத்தில் மட்டுமே எழுதவும்."

HINDI_PRIMER_WEAK = "यह देवनागरी लिपि में लिखा गया है।"
HINDI_PRIMER_STRONG = "कृपया सभी वाक्यों को केवल देवनागरी लिपि में लिखें।"

SANSKRIT_PRIMER_WEAK = "इदं देवनागरी-लिप्याम् अस्ति।"
SANSKRIT_PRIMER_STRONG = "कृपया सर्वाणि वाक्यानि केवलं देवनागरी-लिप्याम् एव लिखत।"

ENGLISH_PRIMER_WEAK = "This is in the Latin script."
ENGLISH_PRIMER_STRONG = "Please write all sentences only in Latin script."

# UI label -> (weak primer, strong primer).
LANG_PRIMERS = {
    "Malayalam": (MALAYALAM_PRIMER_WEAK, MALAYALAM_PRIMER_STRONG),
    "Tamil": (TAMIL_PRIMER_WEAK, TAMIL_PRIMER_STRONG),
    "Hindi": (HINDI_PRIMER_WEAK, HINDI_PRIMER_STRONG),
    "Sanskrit": (SANSKRIT_PRIMER_WEAK, SANSKRIT_PRIMER_STRONG),
    "English": (ENGLISH_PRIMER_WEAK, ENGLISH_PRIMER_STRONG),
}
# -----------------------------
# Script checks & helpers
# -----------------------------
def script_matches(text: str, lang_choice: str) -> bool:
    """Check whether *text* is written in the script expected for *lang_choice*.

    The check is presence-based, not proportional: the text must contain at
    least one character of the expected script and no character from any of
    the other tracked Indic scripts. Latin letters are tolerated in Indic
    output; for English, any Tamil/Malayalam/Devanagari character fails.

    Args:
        text: Transcription to inspect.
        lang_choice: UI language label ("Tamil", "Malayalam", "Hindi",
            "Sanskrit" or "English").

    Returns:
        False for empty text; True for an unrecognized language label
        (nothing to enforce); otherwise the result of the script check.
    """
    if not text:
        return False

    found_tamil = RE_TAMIL.search(text) is not None
    found_malayalam = RE_MALAYALAM.search(text) is not None
    found_devanagari = RE_DEVANAGARI.search(text) is not None
    found_latin = RE_LATIN.search(text) is not None

    # (language labels, script that must appear, scripts that must not appear)
    rules = (
        (("Tamil",), found_tamil, (found_malayalam, found_devanagari)),
        (("Malayalam",), found_malayalam, (found_tamil, found_devanagari)),
        (("Hindi", "Sanskrit"), found_devanagari, (found_tamil, found_malayalam)),
        (("English",), found_latin, (found_tamil, found_malayalam, found_devanagari)),
    )
    for labels, required, forbidden in rules:
        if lang_choice in labels:
            return required and not any(forbidden)

    # Unknown language label: no script constraint applies.
    return True
def make_transliteration(text: str, lang_choice: str, scheme: str = "ITRANS") -> str:
    """Romanize Indic-script text; return text of other languages unchanged.

    Args:
        text: Transcription to romanize.
        lang_choice: UI language label. "Tamil", "Malayalam", "Hindi" and
            "Sanskrit" are transliterated from their native script; any
            other label (e.g. "English") is returned as-is.
        scheme: Target romanization, one of "ITRANS", "IAST" or "HK"
            (case-insensitive). Unknown, ``None`` or non-string values fall
            back to ITRANS.

    Returns:
        The romanized text, the unchanged text for non-Indic languages, or
        "" when *text* is empty.
    """
    if not text:
        return ""

    # Non-Indic languages need no romanization, so return before resolving
    # the scheme: a missing/invalid scheme must not break this path.
    if lang_choice not in ("Tamil", "Malayalam", "Hindi", "Sanskrit"):
        return text

    source_scripts = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
    }
    target_schemes = {
        "ITRANS": sanscript.ITRANS,
        "IAST": sanscript.IAST,
        "HK": sanscript.HK,
    }

    # A cleared dropdown or a direct API call can deliver None (or another
    # non-string); fall back to the default instead of failing on .upper().
    scheme_key = scheme.upper() if isinstance(scheme, str) else "ITRANS"
    target_scheme = target_schemes.get(scheme_key, sanscript.ITRANS)

    return transliterate(text, source_scripts[lang_choice], target_scheme)
def transcribe_once(
    audio_path: str,
    lang_code: str,
    initial_prompt: str,
    deterministic: bool = True,
    beam_size: int = 1,
    condition_on_previous_text: bool = False,
):
    """Run a single transcription pass over *audio_path* with the shared model.

    Args:
        audio_path: Path of the audio file to transcribe.
        lang_code: Language code forwarded as ``language``.
        initial_prompt: Primer text forwarded as ``initial_prompt``.
        deterministic: When True, *beam_size* is used as given; when False,
            the beam is widened to at least 5. Temperature is 0.0 in both
            modes, so the flag only affects the beam width.
        beam_size: Requested beam width.
        condition_on_previous_text: Forwarded unchanged to the model.

    Returns:
        ``(text, info)``: the concatenated segment texts with surrounding
        whitespace stripped, and the second value returned by
        ``model.transcribe``.
    """
    effective_beam = beam_size if deterministic else max(beam_size, 5)

    segments, info = model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        condition_on_previous_text=condition_on_previous_text,
        initial_prompt=initial_prompt,
        word_timestamps=False,
        beam_size=effective_beam,
        temperature=0.0,
    )

    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip(), info
# -----------------------------
# Main inference function
# -----------------------------
def transcribe_handler(
    audio,
    language_choice: str,
    strict_script: bool,
    return_transliteration: bool,
    translit_scheme: str,
):
    """Transcribe one clip, optionally enforcing the script and romanizing.

    A first pass decodes with the weak primer and beam size 1. When
    *strict_script* is set and the result fails the script check, a second
    pass runs with the strong primer and beam size 5.

    Args:
        audio: Path of the audio file, or None when nothing was provided.
        language_choice: UI language label; must be a key of the language
            tables.
        strict_script: Whether to verify the output script and retry.
        return_transliteration: Whether to also produce a romanization.
        translit_scheme: Romanization scheme name passed to
            ``make_transliteration``.

    Returns:
        ``(text, transliteration, warning)`` as three strings.
        *transliteration* is "" unless requested; *warning* is "" when
        there is nothing to report.
    """
    if audio is None:
        return "", "", "No audio provided."

    # A cleared dropdown or a direct API call can deliver None or an unknown
    # label; report it instead of raising KeyError on the lookups below.
    if (
        not isinstance(language_choice, str)
        or language_choice not in LANG_CODES
        or language_choice not in LANG_PRIMERS
    ):
        return "", "", f"Unsupported language: {language_choice!r}."

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: strict, deterministic decoding to reduce "creative" corrections.
    text, _ = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=primer_weak,
        deterministic=True,
        beam_size=1,
        condition_on_previous_text=False,
    )

    warning = ""
    if strict_script and not script_matches(text, language_choice):
        # Pass 2: stronger primer and a slightly larger beam.
        text_retry, _ = transcribe_once(
            audio_path=audio,
            lang_code=lang_code,
            initial_prompt=primer_strong,
            deterministic=True,
            beam_size=5,
            condition_on_previous_text=False,
        )
        if script_matches(text_retry, language_choice):
            text = text_retry
        else:
            # Keep the first pass when it produced anything; otherwise show
            # the retry so the user is not left with an empty result.
            text = text or text_retry
            if text:
                warning = (
                    "⚠️ Script enforcement could not fully correct drift. "
                    "Output may contain mixed or incorrect script."
                )

    if not text:
        # An empty transcription is not a script problem; say so explicitly
        # rather than reporting drift (or nothing at all).
        warning = "No speech was recognized in the audio."

    translit = ""
    if return_transliteration:
        translit = make_transliteration(text, language_choice, scheme=translit_scheme)

    return text, translit, warning
# -----------------------------
# Gradio UI
# -----------------------------
# NOTE(review): the row grouping below is reconstructed from the order in
# which the widgets are created — confirm against the deployed layout.
with gr.Blocks() as demo:
    # Intro banner describing the Space and its options.
    gr.Markdown(
        """
# 🎙 Whisper Large-v2 (CPU) — Raw Transcription + Script Enforcement
Supports **Tamil, Malayalam, Hindi, Sanskrit, English**.
- Minimal normalization (deterministic decoding, no context carryover).
- Optional **Strict script enforcement** (retry with stronger prompt if drift occurs).
- Optional **English transliteration** (ITRANS / IAST / HK) for Indic scripts.
> Note: On CPU free tier, 5–10s clips may take ~15–25s with large-v2.
"""
    )

    # Inputs: the audio clip and its language.
    with gr.Row():
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio (mic or upload)",
        )
        lang_dd = gr.Dropdown(LANG_CHOICES, value="Malayalam", label="Language")

    # Options: script enforcement and transliteration settings.
    with gr.Row():
        strict_chk = gr.Checkbox(value=True, label="Strict script enforcement (recommended)")
        translit_chk = gr.Checkbox(value=True, label="Also return English transliteration")
        translit_scheme_dd = gr.Dropdown(
            choices=["ITRANS", "IAST", "HK"],
            value="ITRANS",
            label="Transliteration scheme (for Indic scripts)",
        )

    transcribe_btn = gr.Button("Transcribe")

    # Outputs: native-script text, romanization, and a warning line.
    with gr.Row():
        out_text = gr.Textbox(label="Transcription", lines=6)
        out_translit = gr.Textbox(label="English Transliteration", lines=6)
    warn_box = gr.Markdown("")

    def wrapped_handler(audio, language_choice, strict_script, return_transliteration, translit_scheme):
        """Adapt ``transcribe_handler`` to the three output widgets.

        The transliteration is blanked when its checkbox is off and a falsy
        warning is normalized to "".
        """
        transcript, romanized, notice = transcribe_handler(
            audio=audio,
            language_choice=language_choice,
            strict_script=strict_script,
            return_transliteration=return_transliteration,
            translit_scheme=translit_scheme,
        )
        shown_romanized = romanized if return_transliteration else ""
        return transcript, shown_romanized, (notice or "")

    transcribe_btn.click(
        fn=wrapped_handler,
        inputs=[audio_in, lang_dd, strict_chk, translit_chk, translit_scheme_dd],
        outputs=[out_text, out_translit, warn_box],
    )

demo.launch()