sudhanm committed on
Commit
48c3c18
·
verified ·
1 Parent(s): b73dcd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +218 -21
app.py CHANGED
@@ -1,44 +1,241 @@
 
 
 
 
 
1
  import gradio as gr
2
  from faster_whisper import WhisperModel
3
- import os
 
4
 
5
- # Load model on CPU (large-v2 multilingual)
6
- model = WhisperModel("large-v2", device="cpu", compute_type="int8")
 
 
 
 
7
 
 
 
 
 
8
  LANG_CODES = {
9
  "Tamil": "ta",
10
- "Malayalam": "ml"
 
 
 
11
  }
12
 
13
- def transcribe(audio, language_choice):
14
- if audio is None:
15
- return "No audio provided."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- # Whisper transcription with minimal correction
18
- segments, info = model.transcribe(
19
- audio,
20
- language=LANG_CODES[language_choice],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  task="transcribe",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  beam_size=1,
23
  condition_on_previous_text=False,
24
- initial_prompt=""
25
  )
26
 
27
- # Combine raw text from segments
28
- full_text = "".join([seg.text for seg in segments])
29
- return full_text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
 
 
 
31
  # Gradio UI
 
32
  with gr.Blocks() as demo:
33
- gr.Markdown("# 🎙 Whisper Large-v2 Raw Transcription\nMinimal correction, Tamil & Malayalam")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  with gr.Row():
36
- audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
37
- lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Tamil", label="Language")
38
 
39
- output_text = gr.Textbox(label="Transcription")
40
 
41
- submit_btn = gr.Button("Transcribe")
42
- submit_btn.click(transcribe, inputs=[audio_input, lang_choice], outputs=output_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  demo.launch()
 
1
+ # app.py
2
+ # HF Space: Whisper large-v2 (CPU) with strict script enforcement + optional English transliteration
3
+ # Languages: Tamil, Malayalam, English, Hindi, Sanskrit
4
+
5
+ import re
6
  import gradio as gr
7
  from faster_whisper import WhisperModel
8
+ from indic_transliteration import sanscript
9
+ from indic_transliteration.sanscript import transliterate
10
 
11
# -----------------------------
# Model: load once on CPU
# -----------------------------
# The model is created at import time and stored in a module-level global so
# that every transcription call reuses the same instance.
# large-v2 was chosen by the author for multilingual accuracy; int8 compute
# keeps CPU memory/latency reasonable on the HF Spaces free CPU tier.
MODEL_NAME = "large-v2"
model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")
17
 
18
+ # -----------------------------
19
+ # Language config
20
+ # -----------------------------
21
# UI language name -> language code passed to Whisper.
# Insertion order doubles as the display order of the language dropdown.
LANG_CODES = {
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa",
    "English": "en",
}

# Dropdown entries, derived from the mapping so the two can never drift apart.
LANG_CHOICES = list(LANG_CODES)
29
 
30
# Unicode script ranges (basic blocks only).
# Each pattern matches a single character from the named block; callers use
# .search() to test whether a text contains at least one such character.
RE_TAMIL = re.compile(r"[\u0B80-\u0BFF]")       # Tamil block
RE_MALAYALAM = re.compile(r"[\u0D00-\u0D7F]")   # Malayalam block
RE_DEVANAGARI = re.compile(r"[\u0900-\u097F]")  # Devanagari block (Hindi/Sanskrit)
RE_LATIN = re.compile(r"[A-Za-z]")              # Unaccented ASCII letters only

# Primers: short sentences written in each target script. They are passed to
# the model as `initial_prompt` to nudge decoding toward that script: the
# "weak" primer on the first pass, the "strong" primer on the retry pass.
# NOTE(review): the non-English primers presumably carry the same meaning as
# the English pair below -- confirm with a native reader before editing them.
MALAYALAM_PRIMER_WEAK = "ഇത് മലയാളം ലിപിയിലാണ്."
MALAYALAM_PRIMER_STRONG = "ദയവായി എല്ലാ വാചകങ്ങളും മലയാളം ലിപിയിൽ മാത്രം എഴുതുക."

TAMIL_PRIMER_WEAK = "இது தமிழ் எழுத்தாகும்."
TAMIL_PRIMER_STRONG = "தயவுசெய்து அனைத்து வாக்கியங்களையும் தமிழ் எழுத்தில் மட்டுமே எழுதவும்."

HINDI_PRIMER_WEAK = "यह देवनागरी लिपि में लिखा गया है।"
HINDI_PRIMER_STRONG = "कृपया सभी वाक्यों को केवल देवनागरी लिपि में लिखें।"

SANSKRIT_PRIMER_WEAK = "इदं देवनागरी-लिप्याम् अस्ति।"
SANSKRIT_PRIMER_STRONG = "कृपया सर्वाणि वाक्यानि केवलं देवनागरी-लिप्याम् एव लिखत।"

ENGLISH_PRIMER_WEAK = "This is in the Latin script."
ENGLISH_PRIMER_STRONG = "Please write all sentences only in Latin script."

# UI language name -> (weak primer, strong primer).
LANG_PRIMERS = {
    "Malayalam": (MALAYALAM_PRIMER_WEAK, MALAYALAM_PRIMER_STRONG),
    "Tamil": (TAMIL_PRIMER_WEAK, TAMIL_PRIMER_STRONG),
    "Hindi": (HINDI_PRIMER_WEAK, HINDI_PRIMER_STRONG),
    "Sanskrit": (SANSKRIT_PRIMER_WEAK, SANSKRIT_PRIMER_STRONG),
    "English": (ENGLISH_PRIMER_WEAK, ENGLISH_PRIMER_STRONG),
}
59
 
60
+ # -----------------------------
61
+ # Script checks & helpers
62
+ # -----------------------------
63
def script_matches(text: str, lang_choice: str) -> bool:
    """Tell whether ``text`` is written in the script expected for ``lang_choice``.

    The check is presence-based rather than proportional: the text must
    contain at least one character of the expected script and no character
    from any of the competing scripts listed for that language.

    * Tamil / Malayalam / Hindi / Sanskrit: Latin letters are not inspected,
      so digits, punctuation and stray Latin words do not cause a mismatch.
    * English: at least one ASCII letter and no Tamil, Malayalam or
      Devanagari characters.

    Args:
        text: Transcribed text to inspect.
        lang_choice: UI language name (e.g. ``"Tamil"``).

    Returns:
        ``False`` for empty text; ``True`` for a language name with no rule
        (nothing to enforce); otherwise the result of the script check.
    """
    if not text:
        return False

    # Which scripts occur at least once in the text.
    present = {
        "tamil": RE_TAMIL.search(text) is not None,
        "malayalam": RE_MALAYALAM.search(text) is not None,
        "devanagari": RE_DEVANAGARI.search(text) is not None,
        "latin": RE_LATIN.search(text) is not None,
    }

    # language -> (script that must appear, scripts that must not appear)
    rules = {
        "Tamil": ("tamil", ("malayalam", "devanagari")),
        "Malayalam": ("malayalam", ("tamil", "devanagari")),
        "Hindi": ("devanagari", ("tamil", "malayalam")),
        "Sanskrit": ("devanagari", ("tamil", "malayalam")),
        "English": ("latin", ("tamil", "malayalam", "devanagari")),
    }

    rule = rules.get(lang_choice)
    if rule is None:
        # No rule for this language: treat as matching.
        return True

    required, forbidden = rule
    return present[required] and not any(present[name] for name in forbidden)
85
+
86
def make_transliteration(text: str, lang_choice: str, scheme: str = "ITRANS") -> str:
    """Romanize Indic-script text with ``indic_transliteration``.

    Args:
        text: Text to transliterate. Empty/``None`` yields ``""``.
        lang_choice: UI language name. Tamil, Malayalam, Hindi and Sanskrit
            are transliterated from their native script; any other value
            (e.g. English) is returned unchanged.
        scheme: Target romanization, one of ``"ITRANS"``, ``"IAST"`` or
            ``"HK"`` (case-insensitive). A missing, non-string or unknown
            value falls back to ITRANS instead of raising.

    Returns:
        The transliterated string, or ``text`` itself for non-Indic languages.
    """
    if not text:
        return ""

    # Non-Indic languages need no transliteration; return before touching the
    # scheme so a bad/missing scheme can never break the pass-through path.
    if lang_choice not in ("Tamil", "Malayalam", "Hindi", "Sanskrit"):
        return text

    source_script = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
    }[lang_choice]

    # The scheme comes from a UI dropdown and may be None if it was cleared;
    # normalise defensively rather than calling .upper() on it blindly.
    scheme_key = scheme.strip().upper() if isinstance(scheme, str) else "ITRANS"
    target_scheme = {
        "ITRANS": sanscript.ITRANS,
        "IAST": sanscript.IAST,
        "HK": sanscript.HK,
    }.get(scheme_key, sanscript.ITRANS)

    return transliterate(text, source_script, target_scheme)
106
+
107
def transcribe_once(
    audio_path: str,
    lang_code: str,
    initial_prompt: str,
    deterministic: bool = True,
    beam_size: int = 1,
    condition_on_previous_text: bool = False,
):
    """Run a single transcription pass with the module-level ``model``.

    Args:
        audio_path: Path of the audio file to transcribe.
        lang_code: Language code forwarded as ``language``.
        initial_prompt: Primer text forwarded as ``initial_prompt``.
        deterministic: When true, ``beam_size`` is used as given; when false,
            the beam is widened to at least 5. Temperature is fixed at 0.0
            in both modes.
        beam_size: Requested beam width.
        condition_on_previous_text: Forwarded unchanged to the model.

    Returns:
        ``(text, info)``: the stripped concatenation of all segment texts,
        and the info object returned by ``model.transcribe``.
    """
    # The two modes differ only in beam width.
    effective_beam = beam_size if deterministic else max(beam_size, 5)

    segments, info = model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        condition_on_previous_text=condition_on_previous_text,
        initial_prompt=initial_prompt,
        word_timestamps=False,
        beam_size=effective_beam,
        temperature=0.0,
    )

    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip(), info
133
+
134
+ # -----------------------------
135
+ # Main inference function
136
+ # -----------------------------
137
def transcribe_handler(
    audio,
    language_choice: str,
    strict_script: bool,
    return_transliteration: bool,
    translit_scheme: str,
):
    """Transcribe ``audio`` and optionally enforce script and transliterate.

    Flow:
        1. One greedy pass (beam 1) primed with the weak primer.
        2. If ``strict_script`` is on and the result is not in the expected
           script, one retry with the strong primer and beam 5. The retry is
           adopted only if it passes the script check.
        3. Optional transliteration of the final text.

    Args:
        audio: Path to the audio file, or ``None`` when nothing was provided.
        language_choice: UI language name; must be a key of ``LANG_CODES``
            and ``LANG_PRIMERS``.
        strict_script: Enable the script check and retry.
        return_transliteration: Also produce a romanized version.
        translit_scheme: Romanization scheme name; ``None``/empty falls back
            to ``"ITRANS"``.

    Returns:
        ``(text, transliteration, warning)``. ``transliteration`` is ``""``
        when not requested; ``warning`` is ``""`` when there is nothing to
        report.
    """
    if audio is None:
        return "", "", "No audio provided."

    # A cleared dropdown or a stale client can send a language we have no
    # configuration for; report it instead of raising KeyError.
    if language_choice not in LANG_CODES or language_choice not in LANG_PRIMERS:
        return "", "", f"Unsupported language: {language_choice!r}."

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: strict, deterministic decoding to reduce "creative" corrections
    text, _ = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=primer_weak,
        deterministic=True,
        beam_size=1,
        condition_on_previous_text=False,
    )

    warning = ""
    if strict_script and not script_matches(text, language_choice):
        # Retry with a stronger primer and a slightly larger beam
        text_retry, _ = transcribe_once(
            audio_path=audio,
            lang_code=lang_code,
            initial_prompt=primer_strong,
            deterministic=True,
            beam_size=5,
            condition_on_previous_text=False,
        )
        if script_matches(text_retry, language_choice):
            text = text_retry
        elif text or text_retry:
            # Something was transcribed, but never in the expected script.
            warning = (
                "⚠️ Script enforcement could not fully correct drift. "
                "Output may contain mixed or incorrect script."
            )

    # Empty output with no other warning means silence or unintelligible
    # audio, not script drift; say so rather than showing a blank result.
    if not text and not warning:
        warning = "No speech detected in the audio."

    translit = ""
    if return_transliteration:
        # Default the scheme here so a missing dropdown value cannot discard
        # a transcription that has already been computed.
        translit = make_transliteration(
            text, language_choice, scheme=translit_scheme or "ITRANS"
        )

    return text, translit, warning
184
+
185
+ # -----------------------------
186
  # Gradio UI
187
+ # -----------------------------
188
with gr.Blocks() as demo:
    # Header / usage notes.
    gr.Markdown(
        """
        # 🎙 Whisper Large-v2 (CPU) — Raw Transcription + Script Enforcement
        Supports **Tamil, Malayalam, Hindi, Sanskrit, English**.
        - Minimal normalization (deterministic decoding, no context carryover).
        - Optional **Strict script enforcement** (retry with stronger prompt if drift occurs).
        - Optional **English transliteration** (ITRANS / IAST / HK) for Indic scripts.

        > Note: On CPU free tier, 5–10s clips may take ~15–25s with large-v2.
        """
    )

    # Inputs: audio source and language.
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio (mic or upload)")
        lang_dd = gr.Dropdown(LANG_CHOICES, value="Malayalam", label="Language")

    # Options: script enforcement and transliteration.
    with gr.Row():
        strict_chk = gr.Checkbox(value=True, label="Strict script enforcement (recommended)")
        translit_chk = gr.Checkbox(value=True, label="Also return English transliteration")
        translit_scheme_dd = gr.Dropdown(
            choices=["ITRANS", "IAST", "HK"],
            value="ITRANS",
            label="Transliteration scheme (for Indic scripts)",
        )

    transcribe_btn = gr.Button("Transcribe")

    # Outputs: transcription, transliteration, and a warning area.
    with gr.Row():
        out_text = gr.Textbox(label="Transcription", lines=6)
        out_translit = gr.Textbox(label="English Transliteration", lines=6)

    warn_box = gr.Markdown("")

    def wrapped_handler(audio, language_choice, strict_script, return_transliteration, translit_scheme):
        """Adapt ``transcribe_handler`` to the three output widgets.

        The transliteration is blanked when the checkbox is off, and a falsy
        warning is normalised to an empty string.
        """
        text, translit, warning = transcribe_handler(
            audio,
            language_choice,
            strict_script,
            return_transliteration,
            translit_scheme,
        )
        shown_translit = translit if return_transliteration else ""
        return text, shown_translit, (warning or "")

    transcribe_btn.click(
        wrapped_handler,
        inputs=[audio_in, lang_dd, strict_chk, translit_chk, translit_scheme_dd],
        outputs=[out_text, out_translit, warn_box],
    )

demo.launch()