File size: 9,312 Bytes
48c3c18
 
 
 
 
f4d67a4
 
48c3c18
 
f4d67a4
48c3c18
 
 
 
 
 
f4d67a4
48c3c18
 
 
 
f4d67a4
 
48c3c18
 
 
 
f4d67a4
 
48c3c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4d67a4
48c3c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4d67a4
48c3c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4d67a4
 
 
 
48c3c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4d67a4
48c3c18
 
 
f4d67a4
48c3c18
f4d67a4
48c3c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4d67a4
 
48c3c18
 
f4d67a4
48c3c18
f4d67a4
48c3c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4d67a4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# app.py
# HF Space: Whisper large-v2 (CPU) with strict script enforcement + optional English transliteration
# Languages: Tamil, Malayalam, English, Hindi, Sanskrit

import re
import gradio as gr
from faster_whisper import WhisperModel
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# -----------------------------
# Model: load once on CPU
# -----------------------------
# large-v2 is the best multilingual accuracy; int8 keeps CPU memory/latency reasonable on HF Spaces Free CPU
# The model object is created at import time and kept in the module-level
# `model` name, so transcription calls reuse it instead of constructing it again.
MODEL_NAME = "large-v2"
model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")

# -----------------------------
# Language config
# -----------------------------
# Display name -> language code handed to the transcriber.
# Insertion order matters: it is also the order of the language choices below.
LANG_CODES = dict(
    Tamil="ta",
    Malayalam="ml",
    Hindi="hi",
    Sanskrit="sa",
    English="en",
)

# Display names offered to the user, taken from the mapping above so the two
# can never list different languages.
LANG_CHOICES = list(LANG_CODES)

# Unicode script ranges (basic)
# Each pattern matches ONE character of the script, so `.search()` answers
# "does this text contain at least one character of that script?".
RE_TAMIL = re.compile(r"[\u0B80-\u0BFF]")        # Tamil block
RE_MALAYALAM = re.compile(r"[\u0D00-\u0D7F]")    # Malayalam block
# Devanagari block (Hindi/Sanskrit) minus U+0964 (danda) and U+0965 (double
# danda). Those two punctuation marks live in the Devanagari block but are
# shared by many Indic scripts, Tamil and Malayalam included, so on their own
# they must not make a Tamil/Malayalam sentence look like it contains Devanagari.
RE_DEVANAGARI = re.compile(r"[\u0900-\u0963\u0966-\u097F]")
RE_LATIN = re.compile(r"[A-Za-z]")               # Basic Latin letters

# Primers: weak/strong anchors in each target script to nudge decoding
# Each language has two prompt sentences. They are runtime data: the text is
# passed to the transcriber as its initial prompt, so it must not be edited
# casually.
MALAYALAM_PRIMER_WEAK = "ഇത് മലയാളം ലിപിയിലാണ്."
MALAYALAM_PRIMER_STRONG = "ദയവായി എല്ലാ വാചകങ്ങളും മലയാളം ലിപിയിൽ മാത്രം എഴുതുക."

TAMIL_PRIMER_WEAK = "இது தமிழ் எழுத்தாகும்."
TAMIL_PRIMER_STRONG = "தயவுசெய்து அனைத்து வாக்கியங்களையும் தமிழ் எழுத்தில் மட்டுமே எழுதவும்."

HINDI_PRIMER_WEAK = "यह देवनागरी लिपि में लिखा गया है।"
HINDI_PRIMER_STRONG = "कृपया सभी वाक्यों को केवल देवनागरी लिपि में लिखें।"

SANSKRIT_PRIMER_WEAK = "इदं देवनागरी-लिप्याम् अस्ति।"
SANSKRIT_PRIMER_STRONG = "कृपया सर्वाणि वाक्यानि केवलं देवनागरी-लिप्याम् एव लिखत।"

ENGLISH_PRIMER_WEAK = "This is in the Latin script."
ENGLISH_PRIMER_STRONG = "Please write all sentences only in Latin script."

# Display name -> (weak primer, strong primer).
# The handler uses the weak primer for the first pass and the strong primer
# only for the retry after a script mismatch.
LANG_PRIMERS = {
    "Malayalam": (MALAYALAM_PRIMER_WEAK, MALAYALAM_PRIMER_STRONG),
    "Tamil": (TAMIL_PRIMER_WEAK, TAMIL_PRIMER_STRONG),
    "Hindi": (HINDI_PRIMER_WEAK, HINDI_PRIMER_STRONG),
    "Sanskrit": (SANSKRIT_PRIMER_WEAK, SANSKRIT_PRIMER_STRONG),
    "English": (ENGLISH_PRIMER_WEAK, ENGLISH_PRIMER_STRONG),
}

# -----------------------------
# Script checks & helpers
# -----------------------------
def script_matches(text: str, lang_choice: str) -> bool:
    """Check that *text* uses the script expected for *lang_choice*.

    The check is presence-based, not proportional: the text must contain at
    least one character of the expected script and no character of any
    script listed as forbidden for that language. Latin letters are never
    forbidden for the Indic languages.

    Args:
        text: Transcribed text to inspect.
        lang_choice: Display name of the language ("Tamil", "Malayalam",
            "Hindi", "Sanskrit" or "English").

    Returns:
        False for empty text; True for a language with no rule; otherwise
        whether the required script is present and all forbidden ones absent.
    """
    if not text:
        return False

    # Which scripts occur anywhere in the text.
    present = {
        "tamil": RE_TAMIL.search(text) is not None,
        "malayalam": RE_MALAYALAM.search(text) is not None,
        "devanagari": RE_DEVANAGARI.search(text) is not None,
        "latin": RE_LATIN.search(text) is not None,
    }

    # language -> (script that must appear, scripts that must not appear)
    rules = {
        "Tamil": ("tamil", ("malayalam", "devanagari")),
        "Malayalam": ("malayalam", ("tamil", "devanagari")),
        "Hindi": ("devanagari", ("tamil", "malayalam")),
        "Sanskrit": ("devanagari", ("tamil", "malayalam")),
        "English": ("latin", ("tamil", "malayalam", "devanagari")),
    }

    rule = rules.get(lang_choice)
    if rule is None:
        # No rule for this language: accept the text as-is.
        return True

    required, forbidden = rule
    return present[required] and not any(present[name] for name in forbidden)

def make_transliteration(text: str, lang_choice: str, scheme: str = "ITRANS") -> str:
    """Romanize Indic-script text; pass any other language's text through.

    Args:
        text: Text to transliterate.
        lang_choice: Display name of the language the text is written in.
        scheme: Romanization scheme name, matched case-insensitively against
            "ITRANS", "IAST" and "HK". Any other name falls back to ITRANS.

    Returns:
        An empty string for empty input, the transliterated text for Tamil,
        Malayalam, Hindi and Sanskrit, and the input unchanged otherwise.
    """
    if not text:
        return ""

    # Resolve the output scheme first; unknown names quietly mean ITRANS.
    romanizations = {
        "ITRANS": sanscript.ITRANS,
        "IAST": sanscript.IAST,
        "HK": sanscript.HK,
    }
    target_scheme = romanizations.get(scheme.upper(), sanscript.ITRANS)

    # Source script for each language that needs transliterating.
    source_scripts = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
    }
    if lang_choice not in source_scripts:
        # English: return as-is
        return text

    return transliterate(text, source_scripts[lang_choice], target_scheme)

def transcribe_once(
    audio_path: str,
    lang_code: str,
    initial_prompt: str,
    deterministic: bool = True,
    beam_size: int = 1,
    condition_on_previous_text: bool = False,
):
    """Run a single transcription pass over one audio file.

    Temperature is 0.0 in both modes. The only thing ``deterministic=False``
    changes is the beam width, which is raised to at least 5.

    Args:
        audio_path: Audio input handed to the model's ``transcribe`` call.
        lang_code: Language code forced for decoding.
        initial_prompt: Prompt text supplied to the decoder.
        deterministic: Use ``beam_size`` exactly when True; otherwise use
            ``max(beam_size, 5)``.
        beam_size: Requested beam width.
        condition_on_previous_text: Forwarded unchanged to the model.

    Returns:
        ``(text, info)``: the segment texts concatenated without a separator
        and stripped, plus the info object returned by the model.
    """
    # Slight exploration if needed: a wider beam, never a higher temperature.
    effective_beam = beam_size if deterministic else max(beam_size, 5)

    segments, info = model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        condition_on_previous_text=condition_on_previous_text,
        initial_prompt=initial_prompt,
        word_timestamps=False,
        beam_size=effective_beam,
        temperature=0.0,
    )

    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip(), info

# -----------------------------
# Main inference function
# -----------------------------
def transcribe_handler(
    audio,
    language_choice: str,
    strict_script: bool,
    return_transliteration: bool,
    translit_scheme: str,
):
    """Transcribe one clip, optionally enforcing script and transliterating.

    The clip is first decoded with the weak primer and beam size 1. With
    ``strict_script`` on, a result that fails ``script_matches`` triggers one
    retry with the strong primer and beam size 5. The retry replaces the
    first result only if it passes the script check.

    Args:
        audio: Audio input forwarded to ``transcribe_once`` as ``audio_path``,
            or None when nothing was provided.
        language_choice: Display name of the language; must be a key of
            ``LANG_CODES`` and ``LANG_PRIMERS``.
        strict_script: Whether to verify the output script and retry once.
        return_transliteration: Whether to also produce a romanization.
        translit_scheme: Scheme name passed to ``make_transliteration``.

    Returns:
        ``(text, transliteration, warning)``. ``transliteration`` is "" when
        not requested; ``warning`` is "" when there is nothing to report.

    Raises:
        KeyError: If ``language_choice`` is not a configured language.
    """
    if audio is None:
        return "", "", "No audio provided."

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: strict, deterministic decoding to reduce "creative" corrections
    text, _ = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=primer_weak,
        deterministic=True,
        beam_size=1,
        condition_on_previous_text=False,
    )

    warning = ""
    if strict_script and not script_matches(text, language_choice):
        # Retry with a stronger primer and a slightly larger beam
        text_retry, _ = transcribe_once(
            audio_path=audio,
            lang_code=lang_code,
            initial_prompt=primer_strong,
            deterministic=True,
            beam_size=5,
            condition_on_previous_text=False,
        )
        if script_matches(text_retry, language_choice):
            text = text_retry
        elif not text and not text_retry:
            # Both passes produced nothing. That is a silent or unintelligible
            # clip, not script drift, so say so instead of blaming the script.
            warning = "⚠️ No speech was recognized in the audio."
        else:
            warning = (
                "⚠️ Script enforcement could not fully correct drift. "
                "Output may contain mixed or incorrect script."
            )

    translit = ""
    if return_transliteration:
        translit = make_transliteration(text, language_choice, scheme=translit_scheme)

    return text, translit, warning

# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    # Page header / usage notes.
    gr.Markdown(
        """
# 🎙 Whisper Large-v2 (CPU) — Raw Transcription + Script Enforcement
Supports **Tamil, Malayalam, Hindi, Sanskrit, English**.
- Minimal normalization (deterministic decoding, no context carryover).
- Optional **Strict script enforcement** (retry with stronger prompt if drift occurs).
- Optional **English transliteration** (ITRANS / IAST / HK) for Indic scripts.

> Note: On CPU free tier, 5–10s clips may take ~15–25s with large-v2.
        """
    )

    # Row 1: the clip and the language it is spoken in.
    with gr.Row():
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio (mic or upload)",
        )
        lang_dd = gr.Dropdown(
            LANG_CHOICES,
            value="Malayalam",
            label="Language",
        )

    # Row 2: processing options.
    with gr.Row():
        strict_chk = gr.Checkbox(
            value=True,
            label="Strict script enforcement (recommended)",
        )
        translit_chk = gr.Checkbox(
            value=True,
            label="Also return English transliteration",
        )
        translit_scheme_dd = gr.Dropdown(
            choices=["ITRANS", "IAST", "HK"],
            value="ITRANS",
            label="Transliteration scheme (for Indic scripts)",
        )

    transcribe_btn = gr.Button("Transcribe")

    # Row 3: results side by side, with the warning line underneath.
    with gr.Row():
        out_text = gr.Textbox(label="Transcription", lines=6)
        out_translit = gr.Textbox(label="English Transliteration", lines=6)

    warn_box = gr.Markdown("")

    def wrapped_handler(audio, language_choice, strict_script, return_transliteration, translit_scheme):
        # Thin adapter between the widgets and transcribe_handler: it forwards
        # the five inputs in order and normalizes what goes back to the UI.
        transcript, romanized, notice = transcribe_handler(
            audio,
            language_choice,
            strict_script,
            return_transliteration,
            translit_scheme,
        )
        # Only show transliteration if checkbox is on; otherwise empty
        shown_translit = romanized if return_transliteration else ""
        return transcript, shown_translit, notice or ""

    transcribe_btn.click(
        wrapped_handler,
        inputs=[
            audio_in,
            lang_dd,
            strict_chk,
            translit_chk,
            translit_scheme_dd,
        ],
        outputs=[
            out_text,
            out_translit,
            warn_box,
        ],
    )

demo.launch()