sudhanm committed on
Commit
2911bf0
·
verified ·
1 Parent(s): 48c3c18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -196
app.py CHANGED
@@ -1,241 +1,137 @@
1
- # app.py
2
- # HF Space: Whisper large-v2 (CPU) with strict script enforcement + optional English transliteration
3
- # Languages: Tamil, Malayalam, English, Hindi, Sanskrit
4
-
5
- import re
6
  import gradio as gr
7
  from faster_whisper import WhisperModel
8
  from indic_transliteration import sanscript
9
  from indic_transliteration.sanscript import transliterate
 
10
 
11
- # -----------------------------
12
- # Model: load once on CPU
13
- # -----------------------------
14
- # large-v2 is the best multilingual accuracy; int8 keeps CPU memory/latency reasonable on HF Spaces Free CPU
15
  MODEL_NAME = "large-v2"
16
- model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")
17
-
18
- # -----------------------------
19
- # Language config
20
- # -----------------------------
21
- LANG_CHOICES = ["Tamil", "Malayalam", "Hindi", "Sanskrit", "English"]
22
  LANG_CODES = {
 
23
  "Tamil": "ta",
24
  "Malayalam": "ml",
25
  "Hindi": "hi",
26
- "Sanskrit": "sa",
27
- "English": "en",
28
  }
29
 
30
- # Unicode script ranges (basic)
31
- RE_TAMIL = re.compile(r"[\u0B80-\u0BFF]") # Tamil
32
- RE_MALAYALAM = re.compile(r"[\u0D00-\u0D7F]") # Malayalam
33
- RE_DEVANAGARI = re.compile(r"[\u0900-\u097F]") # Devanagari (Hindi/Sanskrit)
34
- RE_LATIN = re.compile(r"[A-Za-z]") # Basic Latin letters
35
-
36
- # Primers: weak/strong anchors in each target script to nudge decoding
37
- MALAYALAM_PRIMER_WEAK = "ഇത് മലയാളം ലിപിയിലാണ്."
38
- MALAYALAM_PRIMER_STRONG = "ദയവായി എല്ലാ വാചകങ്ങളും മലയാളം ലിപിയിൽ മാത്രം എഴുതുക."
39
-
40
- TAMIL_PRIMER_WEAK = "இது தமிழ் எழுத்தாகும்."
41
- TAMIL_PRIMER_STRONG = "தயவுசெய்து அனைத்து வாக்கியங்களையும் தமிழ் எழுத்தில் மட்டுமே எழுதவும்."
42
-
43
- HINDI_PRIMER_WEAK = "यह देवनागरी लिपि में लिखा गया है।"
44
- HINDI_PRIMER_STRONG = "कृपया सभी वाक्यों को केवल देवनागरी लिपि में लिखें।"
45
-
46
- SANSKRIT_PRIMER_WEAK = "इदं देवनागरी-लिप्याम् अस्ति।"
47
- SANSKRIT_PRIMER_STRONG = "कृपया सर्वाणि वाक्यानि केवलं देवनागरी-लिप्याम् एव लिखत।"
48
-
49
- ENGLISH_PRIMER_WEAK = "This is in the Latin script."
50
- ENGLISH_PRIMER_STRONG = "Please write all sentences only in Latin script."
51
-
52
  LANG_PRIMERS = {
53
- "Malayalam": (MALAYALAM_PRIMER_WEAK, MALAYALAM_PRIMER_STRONG),
54
- "Tamil": (TAMIL_PRIMER_WEAK, TAMIL_PRIMER_STRONG),
55
- "Hindi": (HINDI_PRIMER_WEAK, HINDI_PRIMER_STRONG),
56
- "Sanskrit": (SANSKRIT_PRIMER_WEAK, SANSKRIT_PRIMER_STRONG),
57
- "English": (ENGLISH_PRIMER_WEAK, ENGLISH_PRIMER_STRONG),
58
  }
59
 
60
- # -----------------------------
61
- # Script checks & helpers
62
- # -----------------------------
63
- def script_matches(text: str, lang_choice: str) -> bool:
64
- """Return True if text appears to be predominantly in the target script."""
65
- if not text:
66
- return False
67
-
68
- has_ta = bool(RE_TAMIL.search(text))
69
- has_ml = bool(RE_MALAYALAM.search(text))
70
- has_deva = bool(RE_DEVANAGARI.search(text))
71
- has_lat = bool(RE_LATIN.search(text))
72
-
73
- if lang_choice == "Tamil":
74
- return has_ta and not (has_ml or has_deva)
75
- if lang_choice == "Malayalam":
76
- return has_ml and not (has_ta or has_deva)
77
- if lang_choice in ("Hindi", "Sanskrit"):
78
- # Expect Devanagari; tolerate Latin (numbers/punctuation) but no Tamil/Malayalam
79
- return has_deva and not (has_ta or has_ml)
80
- if lang_choice == "English":
81
- # Expect Latin letters; ensure we don't have Tamil/Malayalam/Devanagari
82
- return has_lat and not (has_ta or has_ml or has_deva)
83
-
84
- return True # Fallback
85
-
86
- def make_transliteration(text: str, lang_choice: str, scheme: str = "ITRANS") -> str:
87
- """Transliterate Indic scripts to an English-friendly romanization (default ITRANS)."""
88
- if not text:
89
- return ""
90
-
91
- target_scheme = {
92
- "ITRANS": sanscript.ITRANS,
93
- "IAST": sanscript.IAST,
94
- "HK": sanscript.HK,
95
- }.get(scheme.upper(), sanscript.ITRANS)
96
-
97
- if lang_choice == "Tamil":
98
- return transliterate(text, sanscript.TAMIL, target_scheme)
99
- elif lang_choice == "Malayalam":
100
- return transliterate(text, sanscript.MALAYALAM, target_scheme)
101
- elif lang_choice in ("Hindi", "Sanskrit"):
102
- return transliterate(text, sanscript.DEVANAGARI, target_scheme)
103
  else:
104
- # English: return as-is
105
  return text
106
 
107
- def transcribe_once(
108
- audio_path: str,
109
- lang_code: str,
110
- initial_prompt: str,
111
- deterministic: bool = True,
112
- beam_size: int = 1,
113
- condition_on_previous_text: bool = False,
114
- ):
115
- """One pass of transcription with given decoding settings."""
116
- kwargs = dict(
117
  language=lang_code,
118
  task="transcribe",
119
- condition_on_previous_text=condition_on_previous_text,
120
  initial_prompt=initial_prompt,
121
- word_timestamps=False,
 
 
 
122
  )
123
- if deterministic:
124
- # temperature 0 and beam_size control creativity; 0 + beam=1 is very strict
125
- kwargs.update(dict(beam_size=beam_size, temperature=0.0))
126
- else:
127
- # Slight exploration if needed
128
- kwargs.update(dict(beam_size=max(beam_size, 5), temperature=0.0))
129
-
130
- segments, info = model.transcribe(audio_path, **kwargs)
131
- text = "".join(s.text for s in segments).strip()
132
- return text, info
133
-
134
- # -----------------------------
135
- # Main inference function
136
- # -----------------------------
137
- def transcribe_handler(
138
- audio,
139
- language_choice: str,
140
- strict_script: bool,
141
- return_transliteration: bool,
142
- translit_scheme: str,
143
- ):
144
  if audio is None:
145
- return "", "", "No audio provided."
146
 
147
  lang_code = LANG_CODES[language_choice]
148
  primer_weak, primer_strong = LANG_PRIMERS[language_choice]
149
 
150
- # Pass 1: strict, deterministic decoding to reduce "creative" corrections
151
- text, _ = transcribe_once(
 
 
 
 
 
 
 
 
 
 
 
152
  audio_path=audio,
153
  lang_code=lang_code,
154
- initial_prompt=primer_weak,
155
- deterministic=True,
156
- beam_size=1,
157
- condition_on_previous_text=False,
158
  )
159
 
160
- warning = ""
161
- if strict_script and not script_matches(text, language_choice):
162
- # Retry with a stronger primer and a slightly larger beam
163
- text_retry, _ = transcribe_once(
164
  audio_path=audio,
165
  lang_code=lang_code,
166
  initial_prompt=primer_strong,
167
- deterministic=True,
168
  beam_size=5,
169
- condition_on_previous_text=False,
 
170
  )
171
- if script_matches(text_retry, language_choice):
172
- text = text_retry
173
- else:
174
- warning = (
175
- "⚠️ Script enforcement could not fully correct drift. "
176
- "Output may contain mixed or incorrect script."
177
- )
178
-
179
- translit = ""
180
- if return_transliteration:
181
- translit = make_transliteration(text, language_choice, scheme=translit_scheme)
182
-
183
- return text, translit, warning
184
-
185
- # -----------------------------
186
- # Gradio UI
187
- # -----------------------------
188
  with gr.Blocks() as demo:
189
- gr.Markdown(
190
- """
191
- # 🎙 Whisper Large-v2 (CPU) — Raw Transcription + Script Enforcement
192
- Supports **Tamil, Malayalam, Hindi, Sanskrit, English**.
193
- - Minimal normalization (deterministic decoding, no context carryover).
194
- - Optional **Strict script enforcement** (retry with stronger prompt if drift occurs).
195
- - Optional **English transliteration** (ITRANS / IAST / HK) for Indic scripts.
196
-
197
- > Note: On CPU free tier, 5–10s clips may take ~15–25s with large-v2.
198
- """
199
- )
200
 
201
  with gr.Row():
202
- audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio (mic or upload)")
203
- lang_dd = gr.Dropdown(LANG_CHOICES, value="Malayalam", label="Language")
204
 
205
  with gr.Row():
206
- strict_chk = gr.Checkbox(value=True, label="Strict script enforcement (recommended)")
207
- translit_chk = gr.Checkbox(value=True, label="Also return English transliteration")
208
- translit_scheme_dd = gr.Dropdown(
209
- choices=["ITRANS", "IAST", "HK"],
210
- value="ITRANS",
211
- label="Transliteration scheme (for Indic scripts)"
212
- )
213
 
214
- transcribe_btn = gr.Button("Transcribe")
215
 
216
- with gr.Row():
217
- out_text = gr.Textbox(label="Transcription", lines=6)
218
- out_translit = gr.Textbox(label="English Transliteration", lines=6)
219
-
220
- warn_box = gr.Markdown("")
221
-
222
- def wrapped_handler(audio, language_choice, strict_script, return_transliteration, translit_scheme):
223
- text, translit, warning = transcribe_handler(
224
- audio=audio,
225
- language_choice=language_choice,
226
- strict_script=strict_script,
227
- return_transliteration=return_transliteration,
228
- translit_scheme=translit_scheme,
229
- )
230
- # Only show transliteration if checkbox is on; otherwise empty
231
- if not return_transliteration:
232
- translit = ""
233
- return text, translit, (warning if warning else "")
234
-
235
- transcribe_btn.click(
236
- wrapped_handler,
237
- inputs=[audio_in, lang_dd, strict_chk, translit_chk, translit_scheme_dd],
238
- outputs=[out_text, out_translit, warn_box],
239
  )
240
 
241
- demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from faster_whisper import WhisperModel
3
  from indic_transliteration import sanscript
4
  from indic_transliteration.sanscript import transliterate
5
+ import re
6
 
7
# ---------------- CONFIG ---------------- #
MODEL_NAME = "large-v2"
DEVICE = "cpu"  # Change to "cuda" if you have GPU

# int8 quantization keeps large-v2 memory use and latency manageable on CPU.
# On any other device, "default" leaves the precision choice to faster-whisper.
COMPUTE_TYPE = "int8" if DEVICE == "cpu" else "default"

# UI language name -> Whisper language code.
LANG_CODES = {
    "English": "en",
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa",
}

# UI language name -> (weak primer, strong primer) used as Whisper initial prompts.
# English needs no script hint, so both of its primers are empty.
LANG_PRIMERS = {
    "English": ("", ""),
    "Tamil": ("The transcript should be in Tamil script.", "Write only in Tamil script without translation."),
    "Malayalam": ("The transcript should be in Malayalam script.", "Write only in Malayalam script without translation."),
    "Hindi": ("The transcript should be in Devanagari script.", "Write only in Devanagari script without translation."),
    "Sanskrit": ("The transcript should be in Devanagari script.", "Write only in Devanagari script without translation."),
}

# Script detection regexes: each matches a single character of the language's script.
# Hindi and Sanskrit deliberately share the same Devanagari range.
SCRIPT_PATTERNS = {
    "Tamil": re.compile(r"[\u0B80-\u0BFF]"),
    "Malayalam": re.compile(r"[\u0D00-\u0D7F]"),
    "Hindi": re.compile(r"[\u0900-\u097F]"),
    "Sanskrit": re.compile(r"[\u0900-\u097F]"),
    "English": re.compile(r"[A-Za-z]"),
}

# Load model once at import time so every request reuses the same instance.
print("Loading Whisper model...")
model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
38
+
39
+ # ---------------- HELPERS ---------------- #
40
def is_script(text, lang_name):
    """Return True when `text` appears to be written in `lang_name`'s script.

    The text must contain at least one character of the target script and no
    character from a different Indic script listed in SCRIPT_PATTERNS. Latin
    letters are tolerated inside Indic text (loan words, acronyms), but Indic
    characters are not tolerated when the target is English.

    Args:
        text: Transcribed text to inspect.
        lang_name: UI language name, a key of SCRIPT_PATTERNS.

    Returns:
        True if the script matches, or if `lang_name` has no registered
        pattern (nothing to enforce); False otherwise, including for empty
        text.
    """
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if not pattern:
        return True
    if not text or not pattern.search(text):
        return False

    for other_name, other_pattern in SCRIPT_PATTERNS.items():
        # Skip Latin (allowed alongside Indic text) and any entry sharing the
        # target's own range (e.g. Hindi/Sanskrit are both Devanagari).
        if other_name == "English" or other_pattern.pattern == pattern.pattern:
            continue
        if other_pattern.search(text):
            return False
    return True
45
+
46
def transliterate_to_hk(text, lang_choice):
    """Transliterate native-script `text` to the Harvard-Kyoto romanization.

    Args:
        text: Text in the native script of `lang_choice`.
        lang_choice: UI language name ("Tamil", "Malayalam", "Hindi",
            "Sanskrit" or "English").

    Returns:
        The Harvard-Kyoto transliteration for the Indic languages. English
        and any language without a known source scheme are returned
        unchanged; empty input yields an empty string.
    """
    if not text:
        return ""
    if lang_choice == "English":
        # Already Latin script; nothing to transliterate.
        return text

    source_scheme = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
    }.get(lang_choice)
    if source_scheme is None:
        # Unknown language: pass the text through rather than raising KeyError.
        return text
    return transliterate(text, source_scheme, sanscript.HK)
58
 
59
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run a single Whisper transcription pass and return the joined text.

    Args:
        audio_path: Path to the audio file to transcribe.
        lang_code: Whisper language code (e.g. "ta", "hi").
        initial_prompt: Text used to prime decoding; empty means no prompt.
        beam_size: Beam width forwarded to the model.
        temperature: Sampling temperature forwarded to the model.
        condition_on_previous_text: Whether each window is conditioned on
            the text decoded for the previous one.

    Returns:
        The text of all segments concatenated, with surrounding whitespace
        stripped.
    """
    segments, _ = model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        # Pass None rather than "" so an empty prompt means "no prompt at all".
        initial_prompt=initial_prompt or None,
        beam_size=beam_size,
        temperature=temperature,
        condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False,
    )
    return "".join(segment.text for segment in segments).strip()
71
+
72
+ # ---------------- MAIN PIPELINE ---------------- #
73
def transcribe(audio, language_choice):
    """Transcribe `audio` and return (native-script text, HK transliteration).

    Pass 1 decodes loosely to obtain context; pass 2 decodes strictly with
    the language's strong primer plus that context. If the strict result is
    still not in the expected script, one more strict pass is run with the
    primer alone.

    Args:
        audio: Path to the audio file, or None when nothing was provided.
        language_choice: UI language name, a key of LANG_CODES/LANG_PRIMERS.

    Returns:
        A `(transcription, transliteration)` tuple. When there is no audio
        or the language is unknown, the first element carries a message and
        the second is empty.
    """
    if audio is None:
        return "No audio provided.", ""

    # The dropdown can be cleared, which would otherwise surface as a KeyError.
    if language_choice not in LANG_CODES or language_choice not in LANG_PRIMERS:
        return f"Unsupported language: {language_choice!r}.", ""

    lang_code = LANG_CODES[language_choice]
    _, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: loose mode to get context
    loose_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt="",
        beam_size=8,
        temperature=0.4,
        condition_on_previous_text=True,
    )

    # Pass 2: strict mode with context. Without pass-1 text there is no
    # context to add, so the primer is used on its own.
    # NOTE(review): a long pass-1 transcript may push the primer out of
    # Whisper's prompt window - confirm how initial_prompt is truncated.
    if loose_text:
        strict_prompt = f"{primer_strong}\nContext: {loose_text}"
    else:
        strict_prompt = primer_strong
    strict_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=strict_prompt,
        beam_size=5,
        temperature=0.0,
        condition_on_previous_text=False,
    )

    # If still wrong script, retry with stronger primer only. Skip the retry
    # when pass 2 already used exactly that prompt: same settings at
    # temperature 0 would only repeat the same call.
    if strict_prompt != primer_strong and not is_script(strict_text, language_choice):
        strict_text = transcribe_once(
            audio_path=audio,
            lang_code=lang_code,
            initial_prompt=primer_strong,
            beam_size=5,
            temperature=0.0,
            condition_on_previous_text=False,
        )

    hk_translit = transliterate_to_hk(strict_text, language_choice)

    return strict_text, hk_translit
115
+
116
# ---------------- UI ---------------- #
# Two inputs (audio + language) feed `transcribe`; its two return values fill
# the native-script and transliteration text boxes.
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🎙️ Multilingual Pronunciation Assistant\n"
        "Upload your speech and get native script + Harvard-Kyoto transliteration."
    )

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        lang_choice = gr.Dropdown(
            choices=list(LANG_CODES),
            value="English",
            label="Language",
        )

    with gr.Row():
        output_text = gr.Textbox(label="Transcription (Native Script)")
        output_translit = gr.Textbox(label="Harvard-Kyoto Transliteration")

    submit_btn = gr.Button("Transcribe")

    submit_btn.click(
        fn=transcribe,
        inputs=[audio_input, lang_choice],
        outputs=[output_text, output_translit],
    )

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()