Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,241 +1,137 @@
|
|
1 |
-
# app.py
|
2 |
-
# HF Space: Whisper large-v2 (CPU) with strict script enforcement + optional English transliteration
|
3 |
-
# Languages: Tamil, Malayalam, English, Hindi, Sanskrit
|
4 |
-
|
5 |
-
import re
|
6 |
import gradio as gr
|
7 |
from faster_whisper import WhisperModel
|
8 |
from indic_transliteration import sanscript
|
9 |
from indic_transliteration.sanscript import transliterate
|
|
|
10 |
|
11 |
-
#
|
12 |
-
# Model: load once on CPU
|
13 |
-
# -----------------------------
|
14 |
-
# large-v2 is the best multilingual accuracy; int8 keeps CPU memory/latency reasonable on HF Spaces Free CPU
|
15 |
MODEL_NAME = "large-v2"
|
16 |
-
|
17 |
-
|
18 |
-
# -----------------------------
|
19 |
-
# Language config
|
20 |
-
# -----------------------------
|
21 |
-
LANG_CHOICES = ["Tamil", "Malayalam", "Hindi", "Sanskrit", "English"]
|
22 |
LANG_CODES = {
|
|
|
23 |
"Tamil": "ta",
|
24 |
"Malayalam": "ml",
|
25 |
"Hindi": "hi",
|
26 |
-
"Sanskrit": "sa"
|
27 |
-
"English": "en",
|
28 |
}
|
29 |
|
30 |
-
# Unicode script ranges (basic)
|
31 |
-
RE_TAMIL = re.compile(r"[\u0B80-\u0BFF]") # Tamil
|
32 |
-
RE_MALAYALAM = re.compile(r"[\u0D00-\u0D7F]") # Malayalam
|
33 |
-
RE_DEVANAGARI = re.compile(r"[\u0900-\u097F]") # Devanagari (Hindi/Sanskrit)
|
34 |
-
RE_LATIN = re.compile(r"[A-Za-z]") # Basic Latin letters
|
35 |
-
|
36 |
-
# Primers: weak/strong anchors in each target script to nudge decoding
|
37 |
-
MALAYALAM_PRIMER_WEAK = "ഇത് മലയാളം ലിപിയിലാണ്."
|
38 |
-
MALAYALAM_PRIMER_STRONG = "ദയവായി എല്ലാ വാചകങ്ങളും മലയാളം ലിപിയിൽ മാത്രം എഴുതുക."
|
39 |
-
|
40 |
-
TAMIL_PRIMER_WEAK = "இது தமிழ் எழுத்தாகும்."
|
41 |
-
TAMIL_PRIMER_STRONG = "தயவுசெய்து அனைத்து வாக்கியங்களையும் தமிழ் எழுத்தில் மட்டுமே எழுதவும்."
|
42 |
-
|
43 |
-
HINDI_PRIMER_WEAK = "यह देवनागरी लिपि में लिखा गया है।"
|
44 |
-
HINDI_PRIMER_STRONG = "कृपया सभी वाक्यों को केवल देवनागरी लिपि में लिखें।"
|
45 |
-
|
46 |
-
SANSKRIT_PRIMER_WEAK = "इदं देवनागरी-लिप्याम् अस्ति।"
|
47 |
-
SANSKRIT_PRIMER_STRONG = "कृपया सर्वाणि वाक्यानि केवलं देवनागरी-लिप्याम् एव लिखत।"
|
48 |
-
|
49 |
-
ENGLISH_PRIMER_WEAK = "This is in the Latin script."
|
50 |
-
ENGLISH_PRIMER_STRONG = "Please write all sentences only in Latin script."
|
51 |
-
|
52 |
LANG_PRIMERS = {
|
53 |
-
"
|
54 |
-
"Tamil": (
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
58 |
}
|
59 |
|
60 |
-
#
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
""
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
if
|
89 |
-
return
|
90 |
-
|
91 |
-
target_scheme = {
|
92 |
-
"ITRANS": sanscript.ITRANS,
|
93 |
-
"IAST": sanscript.IAST,
|
94 |
-
"HK": sanscript.HK,
|
95 |
-
}.get(scheme.upper(), sanscript.ITRANS)
|
96 |
-
|
97 |
-
if lang_choice == "Tamil":
|
98 |
-
return transliterate(text, sanscript.TAMIL, target_scheme)
|
99 |
-
elif lang_choice == "Malayalam":
|
100 |
-
return transliterate(text, sanscript.MALAYALAM, target_scheme)
|
101 |
-
elif lang_choice in ("Hindi", "Sanskrit"):
|
102 |
-
return transliterate(text, sanscript.DEVANAGARI, target_scheme)
|
103 |
else:
|
104 |
-
# English: return as-is
|
105 |
return text
|
106 |
|
107 |
-
def transcribe_once(
|
108 |
-
|
109 |
-
|
110 |
-
initial_prompt: str,
|
111 |
-
deterministic: bool = True,
|
112 |
-
beam_size: int = 1,
|
113 |
-
condition_on_previous_text: bool = False,
|
114 |
-
):
|
115 |
-
"""One pass of transcription with given decoding settings."""
|
116 |
-
kwargs = dict(
|
117 |
language=lang_code,
|
118 |
task="transcribe",
|
119 |
-
condition_on_previous_text=condition_on_previous_text,
|
120 |
initial_prompt=initial_prompt,
|
121 |
-
|
|
|
|
|
|
|
122 |
)
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
# Slight exploration if needed
|
128 |
-
kwargs.update(dict(beam_size=max(beam_size, 5), temperature=0.0))
|
129 |
-
|
130 |
-
segments, info = model.transcribe(audio_path, **kwargs)
|
131 |
-
text = "".join(s.text for s in segments).strip()
|
132 |
-
return text, info
|
133 |
-
|
134 |
-
# -----------------------------
|
135 |
-
# Main inference function
|
136 |
-
# -----------------------------
|
137 |
-
def transcribe_handler(
|
138 |
-
audio,
|
139 |
-
language_choice: str,
|
140 |
-
strict_script: bool,
|
141 |
-
return_transliteration: bool,
|
142 |
-
translit_scheme: str,
|
143 |
-
):
|
144 |
if audio is None:
|
145 |
-
return "
|
146 |
|
147 |
lang_code = LANG_CODES[language_choice]
|
148 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
149 |
|
150 |
-
# Pass 1:
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
audio_path=audio,
|
153 |
lang_code=lang_code,
|
154 |
-
initial_prompt=
|
155 |
-
|
156 |
-
|
157 |
-
condition_on_previous_text=False
|
158 |
)
|
159 |
|
160 |
-
|
161 |
-
if
|
162 |
-
|
163 |
-
text_retry, _ = transcribe_once(
|
164 |
audio_path=audio,
|
165 |
lang_code=lang_code,
|
166 |
initial_prompt=primer_strong,
|
167 |
-
deterministic=True,
|
168 |
beam_size=5,
|
169 |
-
|
|
|
170 |
)
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
)
|
178 |
-
|
179 |
-
translit = ""
|
180 |
-
if return_transliteration:
|
181 |
-
translit = make_transliteration(text, language_choice, scheme=translit_scheme)
|
182 |
-
|
183 |
-
return text, translit, warning
|
184 |
-
|
185 |
-
# -----------------------------
|
186 |
-
# Gradio UI
|
187 |
-
# -----------------------------
|
188 |
with gr.Blocks() as demo:
|
189 |
-
gr.Markdown(
|
190 |
-
"""
|
191 |
-
# 🎙 Whisper Large-v2 (CPU) — Raw Transcription + Script Enforcement
|
192 |
-
Supports **Tamil, Malayalam, Hindi, Sanskrit, English**.
|
193 |
-
- Minimal normalization (deterministic decoding, no context carryover).
|
194 |
-
- Optional **Strict script enforcement** (retry with stronger prompt if drift occurs).
|
195 |
-
- Optional **English transliteration** (ITRANS / IAST / HK) for Indic scripts.
|
196 |
-
|
197 |
-
> Note: On CPU free tier, 5–10s clips may take ~15–25s with large-v2.
|
198 |
-
"""
|
199 |
-
)
|
200 |
|
201 |
with gr.Row():
|
202 |
-
|
203 |
-
|
204 |
|
205 |
with gr.Row():
|
206 |
-
|
207 |
-
|
208 |
-
translit_scheme_dd = gr.Dropdown(
|
209 |
-
choices=["ITRANS", "IAST", "HK"],
|
210 |
-
value="ITRANS",
|
211 |
-
label="Transliteration scheme (for Indic scripts)"
|
212 |
-
)
|
213 |
|
214 |
-
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
warn_box = gr.Markdown("")
|
221 |
-
|
222 |
-
def wrapped_handler(audio, language_choice, strict_script, return_transliteration, translit_scheme):
|
223 |
-
text, translit, warning = transcribe_handler(
|
224 |
-
audio=audio,
|
225 |
-
language_choice=language_choice,
|
226 |
-
strict_script=strict_script,
|
227 |
-
return_transliteration=return_transliteration,
|
228 |
-
translit_scheme=translit_scheme,
|
229 |
-
)
|
230 |
-
# Only show transliteration if checkbox is on; otherwise empty
|
231 |
-
if not return_transliteration:
|
232 |
-
translit = ""
|
233 |
-
return text, translit, (warning if warning else "")
|
234 |
-
|
235 |
-
transcribe_btn.click(
|
236 |
-
wrapped_handler,
|
237 |
-
inputs=[audio_in, lang_dd, strict_chk, translit_chk, translit_scheme_dd],
|
238 |
-
outputs=[out_text, out_translit, warn_box],
|
239 |
)
|
240 |
|
241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from faster_whisper import WhisperModel
|
3 |
from indic_transliteration import sanscript
|
4 |
from indic_transliteration.sanscript import transliterate
|
5 |
+
import re
|
6 |
|
7 |
+
# ---------------- CONFIG ---------------- #
|
|
|
|
|
|
|
8 |
# Whisper checkpoint name handed to WhisperModel when the model is loaded.
MODEL_NAME = "large-v2"
DEVICE = "cpu"  # Change to "cuda" if you have GPU

# UI language label -> language code passed to Whisper.
# Insertion order matters: the language dropdown lists these keys in order.
LANG_CODES = {
    "English": "en",
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa",
}

# UI language label -> (weak primer, strong primer) prompt pair used to nudge
# decoding towards the native script. English needs no nudge, hence the
# empty strings.
LANG_PRIMERS = {
    "English": ("", ""),
    "Tamil": (
        "The transcript should be in Tamil script.",
        "Write only in Tamil script without translation.",
    ),
    "Malayalam": (
        "The transcript should be in Malayalam script.",
        "Write only in Malayalam script without translation.",
    ),
    "Hindi": (
        "The transcript should be in Devanagari script.",
        "Write only in Devanagari script without translation.",
    ),
    "Sanskrit": (
        "The transcript should be in Devanagari script.",
        "Write only in Devanagari script without translation.",
    ),
}

# Script detection regexes: each pattern matches a single character from the
# Unicode block of the language's script (Hindi and Sanskrit share Devanagari;
# English is checked against basic ASCII letters only).
SCRIPT_PATTERNS = {
    "Tamil": re.compile(r"[\u0B80-\u0BFF]"),
    "Malayalam": re.compile(r"[\u0D00-\u0D7F]"),
    "Hindi": re.compile(r"[\u0900-\u097F]"),
    "Sanskrit": re.compile(r"[\u0900-\u097F]"),
    "English": re.compile(r"[A-Za-z]"),
}
|
34 |
+
|
35 |
+
# Load model
|
36 |
+
# Choose the compute type explicitly. On CPU, request int8 quantisation so
# the large checkpoint stays within a small host's memory/latency budget;
# on any other device keep the library's own default.
# NOTE(review): int8 can change transcripts slightly compared with the
# unquantised weights - confirm this trade-off is acceptable.
COMPUTE_TYPE = "int8" if DEVICE == "cpu" else "default"

# The model is loaded once at import time and shared by every request.
print("Loading Whisper model...")
model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
|
38 |
+
|
39 |
+
# ---------------- HELPERS ---------------- #
|
40 |
+
def is_script(text: str, lang_name: str) -> bool:
    """Report whether *text* contains any character of *lang_name*'s script.

    This is a presence check, not a purity check: a single matching
    character anywhere in *text* is enough, so mixed-script output still
    passes. Empty text never matches and therefore yields False.

    Args:
        text: Transcript to inspect.
        lang_name: UI language label used as the key into SCRIPT_PATTERNS.

    Returns:
        True when a matching character is found, or when no pattern is
        registered for *lang_name* (nothing to check against); False
        otherwise.
    """
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if pattern is None:
        # Unknown language label: do not flag the text as wrong-script.
        return True
    return pattern.search(text) is not None
|
45 |
+
|
46 |
+
def transliterate_to_hk(text, lang_choice):
    """Transliterate native-script *text* into Harvard-Kyoto (HK) romanisation.

    Args:
        text: Transcript in the script of *lang_choice*.
        lang_choice: UI language label ("Tamil", "Malayalam", "Hindi",
            "Sanskrit" or "English").

    Returns:
        The HK transliteration for the Indic languages. *text* is returned
        unchanged when it is empty, when the language is English, or when
        the label has no known source script.
    """
    if not text:
        # Nothing to convert; skip the library call entirely.
        return text

    source_schemes = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
    }
    # .get() rather than indexing: an unrecognised label falls back to the
    # untouched text instead of raising KeyError, matching how is_script()
    # treats unknown languages.
    source_scheme = source_schemes.get(lang_choice)
    if source_scheme is None:
        return text
    return transliterate(text, source_scheme, sanscript.HK)
|
58 |
|
59 |
+
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run a single Whisper transcription pass and return the joined text.

    Args:
        audio_path: Path of the audio file to transcribe.
        lang_code: Whisper language code (e.g. "ta", "en").
        initial_prompt: Prompt text supplied to the decoder for this pass.
        beam_size: Beam width forwarded to the model.
        temperature: Decoding temperature forwarded to the model.
        condition_on_previous_text: Forwarded to the model unchanged.

    Returns:
        The text of all segments concatenated in order, with leading and
        trailing whitespace stripped.
    """
    segments, _info = model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        initial_prompt=initial_prompt,
        beam_size=beam_size,
        temperature=temperature,
        condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False,
    )
    # Segment texts are joined without a separator, exactly as emitted.
    return "".join(segment.text for segment in segments).strip()
|
71 |
+
|
72 |
+
# ---------------- MAIN PIPELINE ---------------- #
|
73 |
+
def transcribe(audio, language_choice):
    """Transcribe *audio* in the chosen language and add an HK transliteration.

    The pipeline runs up to three passes over the same audio:

    1. a loose pass with no prompt, whose text is used as context;
    2. a strict pass prompted with the strong primer plus that context;
    3. only if the strict result contains no character of the expected
       script, a retry prompted with the strong primer alone.

    Args:
        audio: Path to the audio file, or None when nothing was provided.
        language_choice: UI language label; must be a key of LANG_CODES and
            LANG_PRIMERS.

    Returns:
        A ``(transcript, hk_transliteration)`` tuple. When *audio* is None
        the tuple is ``("No audio provided.", "")``.
    """
    if audio is None:
        return "No audio provided.", ""

    lang_code = LANG_CODES[language_choice]
    # Only the strong primer is used by this pipeline.
    _, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: loose mode to get context
    # NOTE(review): faster-whisper appears to switch from beam search to
    # sampling when temperature > 0, in which case beam_size=8 would have no
    # effect here and this pass would be non-deterministic - confirm against
    # the library documentation.
    loose_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt="",
        beam_size=8,
        temperature=0.4,
        condition_on_previous_text=True,
    )

    # Pass 2: strict mode with context
    strict_prompt = f"{primer_strong}\nContext: {loose_text}"
    strict_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=strict_prompt,
        beam_size=5,
        temperature=0.0,
        condition_on_previous_text=False,
    )

    # If still wrong script, retry with stronger primer only
    if not is_script(strict_text, language_choice):
        strict_text = transcribe_once(
            audio_path=audio,
            lang_code=lang_code,
            initial_prompt=primer_strong,
            beam_size=5,
            temperature=0.0,
            condition_on_previous_text=False,
        )

    hk_translit = transliterate_to_hk(strict_text, language_choice)

    return strict_text, hk_translit
|
115 |
+
|
116 |
+
# ---------------- UI ---------------- #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
# Build the Gradio interface. Components and the click handler are declared
# inside the Blocks context so they are registered on `demo`.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Multilingual Pronunciation Assistant\nUpload your speech and get native script + Harvard-Kyoto transliteration.")

    # Inputs: the audio arrives as a file path, which is what transcribe()
    # forwards to the model.
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="English", label="Language")

    # Outputs: native-script transcript next to its romanisation.
    with gr.Row():
        output_text = gr.Textbox(label="Transcription (Native Script)")
        output_translit = gr.Textbox(label="Harvard-Kyoto Transliteration")

    submit_btn = gr.Button("Transcribe")

    # transcribe() returns (transcript, transliteration), matching the two
    # output components in order.
    submit_btn.click(
        fn=transcribe,
        inputs=[audio_input, lang_choice],
        outputs=[output_text, output_translit],
    )

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|