sudhanm committed
Commit 25dc731 · verified · 1 Parent(s): 33e6674

Update app.py

Files changed (1)
app.py +556 -2
app.py CHANGED
@@ -1,4 +1,522 @@
-         # Event handlers for buttons
+ import gradio as gr
+ import random
+ import difflib
+ import re
+ import jiwer
+ import torch
+ import torchaudio
+ import numpy as np
+ from transformers import (
+     AutoProcessor,
+     AutoModelForSpeechSeq2Seq,
+     WhisperProcessor,
+     WhisperForConditionalGeneration
+ )
+ import librosa
+ import soundfile as sf
+ from indic_transliteration import sanscript
+ from indic_transliteration.sanscript import transliterate
+ import warnings
+ import spaces
+ warnings.filterwarnings("ignore")
+
+ # ---------------- CONFIG ---------------- #
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"🔧 Using device: {DEVICE}")
+
+ LANG_CODES = {
+     "English": "en",
+     "Tamil": "ta",
+     "Malayalam": "ml"
+ }
+
+ # Updated model configurations with LARGE models for maximum accuracy
+ ASR_MODELS = {
+     "English": "openai/whisper-base.en",
+     "Tamil": "ai4bharat/whisper-large-ta",        # LARGE AI4Bharat Tamil model (~1.5GB)
+     "Malayalam": "ai4bharat/whisper-large-ml"     # LARGE AI4Bharat Malayalam model (~1.5GB)
+ }
+
+ LANG_PRIMERS = {
+     "English": ("Transcribe in English.",
+                 "Write only in English. Example: This is an English sentence."),
+     "Tamil": ("தமிழில் எழுதுக.",
+               "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும். உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
+     "Malayalam": ("മലയാളത്തിൽ എഴുതുക.",
+                   "മലയാള ലിപിയിൽ മാത്രം എഴുതുക. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്.")
+ }
+
+ SCRIPT_PATTERNS = {
+     "Tamil": re.compile(r"[\u0B80-\u0BFF]"),
+     "Malayalam": re.compile(r"[\u0D00-\u0D7F]"),
+     "English": re.compile(r"[A-Za-z]")
+ }
+
+ SENTENCE_BANK = {
+     "English": [
+         "The sun sets over the beautiful horizon.",
+         "Learning new languages opens many doors.",
+         "I enjoy reading books in the evening.",
+         "Technology has changed our daily lives.",
+         "Music brings people together across cultures.",
+         "Education is the key to a bright future.",
+         "The flowers bloom beautifully in spring.",
+         "Hard work always pays off in the end."
+     ],
+     "Tamil": [
+         "இன்று நல்ல வானிலை உள்ளது.",
+         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
+         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
+         "தமிழ் மொழி மிகவும் அழகானது.",
+         "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்.",
+         "கல்வி நமது எதிர்காலத்தின் திறவுகோல்.",
+         "பறவைகள் காலையில் இனிமையாக பாடுகின்றன.",
+         "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
+     ],
+     "Malayalam": [
+         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
+         "ഇന്ന് മഴ പെയ്യുന്നു.",
+         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
+         "കേരളത്തിന്റെ പ്രകൃതി സുന്ദരമാണ്.",
+         "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്.",
+         "സംഗീതം മനസ്സിന് സന്തോഷം നൽകുന്നു.",
+         "കുടുംബസമയം വളരെ വിലപ്പെട്ടതാണ്.",
+         "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
+     ]
+ }
+
+ # ---------------- MODEL CACHE ---------------- #
+ asr_models = {}
+
+ @spaces.GPU
+ def load_asr_model(language):
+     """Load and cache the ASR model for a specific language (primary models only)."""
+     if language not in asr_models:
+         model_name = ASR_MODELS[language]
+         print(f"🔄 Loading LARGE model for {language}: {model_name}")
+
+         try:
+             processor = AutoProcessor.from_pretrained(model_name)
+             model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+                 low_cpu_mem_usage=True,
+                 use_safetensors=True
+             ).to(DEVICE)
+
+             asr_models[language] = {"processor": processor, "model": model, "model_name": model_name}
+             print(f"✅ LARGE model loaded successfully for {language}")
+
+         except Exception as e:
+             print(f"❌ Failed to load {model_name}: {e}")
+             raise Exception(f"Could not load {language} model. Please check model availability.") from e
+
+     return asr_models[language]
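
For reference, a minimal sketch of the caching behavior above: the first call downloads and stores the components, later calls are plain dictionary hits (assumes the English checkpoint is reachable).

    components = load_asr_model("English")
    again = load_asr_model("English")       # served from asr_models, no reload
    print(again is components)              # True
    print(components["model_name"])         # openai/whisper-base.en
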
+
+ # ---------------- HELPERS ---------------- #
+ def get_random_sentence(language_choice):
+     """Get random sentence for practice"""
+     return random.choice(SENTENCE_BANK[language_choice])
+
+ def is_script(text, lang_name):
+     """Check if text is in expected script"""
+     pattern = SCRIPT_PATTERNS.get(lang_name)
+     if not pattern:
+         return True
+     return bool(pattern.search(text))
+
+ def transliterate_to_hk(text, lang_choice):
+     """Transliterate Indic text to Harvard-Kyoto"""
+     mapping = {
+         "Tamil": sanscript.TAMIL,
+         "Malayalam": sanscript.MALAYALAM,
+         "English": None
+     }
+
+     script = mapping.get(lang_choice)
+     if script and is_script(text, lang_choice):
+         try:
+             return transliterate(text, script, sanscript.HK)
+         except Exception as e:
+             print(f"Transliteration error: {e}")
+             return text
+     return text
+
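
A minimal usage sketch of the transliteration helper above, assuming the same indic_transliteration package; the printed romanizations are indicative only.

    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate

    print(transliterate("தமிழ்", sanscript.TAMIL, sanscript.HK))        # Tamil -> Harvard-Kyoto
    print(transliterate("മലയാളം", sanscript.MALAYALAM, sanscript.HK))   # Malayalam -> Harvard-Kyoto
    print(transliterate_to_hk("hello", "English"))                      # English maps to None, returned unchanged
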
+ def preprocess_audio(audio_path, target_sr=16000):
+     """Preprocess audio for ASR"""
+     try:
+         # Load audio
+         audio, sr = librosa.load(audio_path, sr=target_sr)
+
+         # Normalize audio
+         if np.max(np.abs(audio)) > 0:
+             audio = audio / np.max(np.abs(audio))
+
+         # Remove silence from beginning and end
+         audio, _ = librosa.effects.trim(audio, top_db=20)
+
+         # Ensure minimum length
+         if len(audio) < target_sr * 0.1:  # Less than 0.1 seconds
+             return None, None
+
+         return audio, target_sr
+     except Exception as e:
+         print(f"Audio preprocessing error: {e}")
+         return None, None
+
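
A quick self-contained check of preprocess_audio on a synthetic tone, written to a hypothetical tone_test.wav (numpy and soundfile are already imported above):

    sr = 16000
    tone = 0.2 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)              # 1 s of 440 Hz
    padded = np.concatenate([np.zeros(sr // 2), tone, np.zeros(sr // 2)])  # 0.5 s silence each side
    sf.write("tone_test.wav", padded, sr)

    audio, rate = preprocess_audio("tone_test.wav")
    print(len(padded) / sr, "s in,", len(audio) / rate, "s after trimming")  # leading/trailing silence removed
    print(float(np.max(np.abs(audio))))                                      # ~1.0 after peak normalization
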
+ @spaces.GPU
+ def transcribe_audio(audio_path, language, initial_prompt="", force_language=True):
+     """Transcribe audio using the cached model for `language`.
+     Note: `initial_prompt` is accepted for the two-pass callers but is not
+     currently passed into generation."""
+     try:
+         # Load model components
+         asr_components = load_asr_model(language)
+         processor = asr_components["processor"]
+         model = asr_components["model"]
+         model_name = asr_components["model_name"]
+
+         # Preprocess audio
+         audio, sr = preprocess_audio(audio_path)
+         if audio is None:
+             return "Error: Audio too short or could not be processed"
+
+         # Prepare inputs
+         inputs = processor(
+             audio,
+             sampling_rate=sr,
+             return_tensors="pt",
+             padding=True
+         )
+
+         # Move to device
+         input_features = inputs.input_features.to(DEVICE)
+
+         # Generate transcription
+         with torch.no_grad():
+             # Basic generation parameters
+             generate_kwargs = {
+                 "input_features": input_features,
+                 "max_length": 200,
+                 "num_beams": 3,  # Reduced for better compatibility
+                 "do_sample": False
+             }
+
+             # Try different approaches for language forcing
+             if force_language and language != "English":
+                 lang_code = LANG_CODES.get(language, "en")
+
+                 # Method 1: Try forced_decoder_ids (OpenAI Whisper style)
+                 try:
+                     if hasattr(processor, 'get_decoder_prompt_ids'):
+                         forced_decoder_ids = processor.get_decoder_prompt_ids(
+                             language=lang_code,
+                             task="transcribe"
+                         )
+                         # Test if model accepts this parameter
+                         test_kwargs = generate_kwargs.copy()
+                         test_kwargs["max_length"] = 10
+                         test_kwargs["forced_decoder_ids"] = forced_decoder_ids
+                         _ = model.generate(**test_kwargs)  # Test run
+                         generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
+                         print(f"✅ Using forced_decoder_ids for {language}")
+                 except Exception as e:
+                     print(f"⚠️ forced_decoder_ids not supported: {e}")
+
+                 # Method 2: Try the language parameter, but only if Method 1 did not stick
+                 if "forced_decoder_ids" not in generate_kwargs:
+                     try:
+                         test_kwargs = generate_kwargs.copy()
+                         test_kwargs["max_length"] = 10
+                         test_kwargs["language"] = lang_code
+                         _ = model.generate(**test_kwargs)  # Test run
+                         generate_kwargs["language"] = lang_code
+                         print(f"✅ Using language parameter for {language}")
+                     except Exception as e:
+                         print(f"⚠️ language parameter not supported: {e}")
+
+             # Generate with whatever parameters work
+             predicted_ids = model.generate(**generate_kwargs)
+
+             # Decode
+             transcription = processor.batch_decode(
+                 predicted_ids,
+                 skip_special_tokens=True,
+                 clean_up_tokenization_spaces=True
+             )[0]
+
+             # Post-process transcription
+             transcription = transcription.strip()
+
+             # If we get an empty transcription, try again with simpler parameters
+             if not transcription and generate_kwargs.get("num_beams", 1) > 1:
+                 print("🔄 Retrying with greedy decoding...")
+                 simple_kwargs = {
+                     "input_features": input_features,
+                     "max_length": 200,
+                     "do_sample": False
+                 }
+                 predicted_ids = model.generate(**simple_kwargs)
+                 transcription = processor.batch_decode(
+                     predicted_ids,
+                     skip_special_tokens=True,
+                     clean_up_tokenization_spaces=True
+                 )[0].strip()
+
+         return transcription or "(No transcription generated)"
+
+     except Exception as e:
+         print(f"Transcription error for {language}: {e}")
+         return f"Error: {str(e)[:150]}..."
+
+ def highlight_differences(ref, hyp):
+     """Highlight word-level differences with better styling"""
+     if not ref.strip() or not hyp.strip():
+         return "No text to compare"
+
+     ref_words = ref.strip().split()
+     hyp_words = hyp.strip().split()
+
+     sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
+     out_html = []
+
+     for tag, i1, i2, j1, j2 in sm.get_opcodes():
+         if tag == 'equal':
+             out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
+         elif tag == 'replace':
+             out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
+             out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>→{w}</span>" for w in hyp_words[j1:j2]])
+         elif tag == 'delete':
+             out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
+         elif tag == 'insert':
+             out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
+
+     return " ".join(out_html)
+
+ def char_level_highlight(ref, hyp):
+     """Highlight character-level differences"""
+     if not ref.strip() or not hyp.strip():
+         return "No text to compare"
+
+     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
+     out = []
+
+     for tag, i1, i2, j1, j2 in sm.get_opcodes():
+         if tag == 'equal':
+             out.extend([f"<span style='color:green; background-color:#e8f5e8;'>{c}</span>" for c in ref[i1:i2]])
+         elif tag in ('replace', 'delete'):
+             out.extend([f"<span style='color:red; text-decoration:underline; background-color:#ffe8e8; font-weight:bold;'>{c}</span>" for c in ref[i1:i2]])
+         elif tag == 'insert':
+             out.extend([f"<span style='color:orange; background-color:#fff3cd; font-weight:bold;'>{c}</span>" for c in hyp[j1:j2]])
+
+     return "".join(out)
+
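
Both highlighters are driven by difflib.SequenceMatcher opcodes; a standard-library sketch of what they iterate over (illustrative sentences):

    ref = "the cat sat on the mat".split()
    hyp = "the cat sit on mat".split()
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, ref, hyp).get_opcodes():
        print(tag, ref[i1:i2], hyp[j1:j2])
    # 'equal' spans render green, 'replace' and 'delete' red, 'insert' orange
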
+ def get_pronunciation_score(wer_val, cer_val):
+     """Calculate pronunciation score and feedback"""
+     # Weight WER more heavily than CER
+     combined_score = (wer_val * 0.7) + (cer_val * 0.3)
+
+     if combined_score <= 0.1:
+         return "🏆 Excellent! (90%+)", "Your pronunciation is outstanding!"
+     elif combined_score <= 0.2:
+         return "🎉 Very Good! (80-90%)", "Great pronunciation with minor areas for improvement."
+     elif combined_score <= 0.4:
+         return "👍 Good! (60-80%)", "Good effort! Keep practicing for better accuracy."
+     elif combined_score <= 0.6:
+         return "📚 Needs Practice (40-60%)", "Focus on clearer pronunciation of highlighted words."
+     else:
+         return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
+
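
A worked sketch of how the combined score falls out of jiwer's metrics (hypothetical sentence pair; exact values depend on jiwer's tokenization):

    intended = "the sun sets over the horizon"
    heard = "the sun set over horizon"

    wer_val = jiwer.wer(intended, heard)        # word errors / reference word count
    cer_val = jiwer.cer(intended, heard)        # character errors / reference character count
    combined = wer_val * 0.7 + cer_val * 0.3    # same weighting as get_pronunciation_score
    print(f"WER={wer_val:.3f} CER={cer_val:.3f} combined={combined:.3f}")
    print(get_pronunciation_score(wer_val, cer_val))
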
+ # ---------------- MAIN FUNCTION ---------------- #
+ @spaces.GPU
+ def compare_pronunciation(audio, language_choice, intended_sentence):
+     """Main function to compare pronunciation"""
+     print(f"🔍 Starting analysis with language: {language_choice}")
+     print(f"📁 Audio file: {audio}")
+     print(f"🎯 Intended sentence: {intended_sentence}")
+
+     if audio is None:
+         print("❌ No audio provided")
+         return ("❌ Please record audio first.", "", "", "", "", "", "", "", "", "", "", "", "")
+
+     if not intended_sentence.strip():
+         print("❌ No intended sentence")
+         return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "", "", "", "", "", "")
+
+     try:
+         print(f"🔍 Analyzing pronunciation for {language_choice}...")
+
+         # Pass 1: Raw transcription
+         print("🔄 Starting Pass 1 transcription...")
+         primer_weak, _ = LANG_PRIMERS[language_choice]
+         actual_text = transcribe_audio(audio, language_choice, primer_weak, force_language=True)
+         print(f"✅ Pass 1 result: {actual_text}")
+
+         # Pass 2: Target-biased transcription with stronger prompt
+         print("🔄 Starting Pass 2 transcription...")
+         _, primer_strong = LANG_PRIMERS[language_choice]
+         strict_prompt = f"{primer_strong}\nExpected: {intended_sentence}"
+         corrected_text = transcribe_audio(audio, language_choice, strict_prompt, force_language=True)
+         print(f"✅ Pass 2 result: {corrected_text}")
+
+         # Handle transcription errors
+         if actual_text.startswith("Error:"):
+             print(f"❌ Transcription error: {actual_text}")
+             return (f"❌ {actual_text}", "", "", "", "", "", "", "", "", "", "", "", "")
+
+         # Calculate error metrics
+         try:
+             print("🔄 Calculating error metrics...")
+             wer_val = jiwer.wer(intended_sentence, actual_text)
+             cer_val = jiwer.cer(intended_sentence, actual_text)
+             print(f"✅ WER: {wer_val:.3f}, CER: {cer_val:.3f}")
+         except Exception as e:
+             print(f"❌ Error calculating metrics: {e}")
+             wer_val, cer_val = 1.0, 1.0
+
+         # Get pronunciation score and feedback
+         score_text, feedback = get_pronunciation_score(wer_val, cer_val)
+         print(f"✅ Score: {score_text}")
+
+         # Transliterations for both actual and intended
+         print("🔄 Generating transliterations...")
+         actual_hk = transliterate_to_hk(actual_text, language_choice)
+         target_hk = transliterate_to_hk(intended_sentence, language_choice)
+
+         # Handle script mismatches
+         if not is_script(actual_text, language_choice) and language_choice != "English":
+             actual_hk = f"⚠️ Expected {language_choice} script, got mixed/other script"
+
+         # Visual feedback
+         print("🔄 Generating visual feedback...")
+         diff_html = highlight_differences(intended_sentence, actual_text)
+         char_html = char_level_highlight(intended_sentence, actual_text)
+
+         # Status message with detailed feedback
+         status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}"
+         print("✅ Analysis completed successfully")
+
+         return (
+             status,
+             actual_text or "(No transcription)",
+             corrected_text or "(No corrected transcription)",
+             f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% word accuracy)",
+             f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% character accuracy)",
+             # Visual feedback outputs
+             actual_text or "(No transcription)",  # actual_text_display
+             actual_hk,                            # actual_transliteration
+             intended_sentence,                    # target_text_display
+             target_hk,                            # target_transliteration
+             diff_html,                            # diff_html_box
+             char_html,                            # char_html_box
+             intended_sentence,                    # intended_display (unchanged)
+             f"🎯 Target: {intended_sentence}"     # target_display
+         )
+
+     except Exception as e:
+         error_msg = f"❌ Analysis Error: {str(e)[:200]}"
+         print(f"❌ FATAL ERROR: {e}")
+         import traceback
+         traceback.print_exc()
+         return (error_msg, str(e), "", "", "", "", "", "", "", "", "", "", "")
+
+ # ---------------- UI ---------------- #
+ def create_interface():
+     with gr.Blocks(title="🎙️ Multilingual Pronunciation Trainer") as demo:
+
+         gr.Markdown("""
+         # 🎙️ Multilingual Pronunciation Trainer
+
+         **Practice pronunciation in Tamil, Malayalam & English** using advanced speech recognition!
+
+         ### 📋 How to Use:
+         1. **Select** your target language 🌍
+         2. **Generate** a practice sentence 🎲
+         3. **Record** yourself reading it aloud 🎤
+         4. **Get** detailed feedback with accuracy metrics 📊
+
+         ### 🎯 Features:
+         - **Dual-pass analysis** for accurate assessment
+         - **Visual highlighting** of pronunciation errors
+         - **Romanization** for Indic scripts
+         - **Detailed metrics** (Word & Character accuracy)
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=3):
+                 lang_choice = gr.Dropdown(
+                     choices=list(LANG_CODES.keys()),
+                     value="Tamil",
+                     label="🌍 Select Language"
+                 )
+             with gr.Column(scale=1):
+                 gen_btn = gr.Button("🎲 Generate Sentence", variant="primary")
+
+         intended_display = gr.Textbox(
+             label="📝 Practice Sentence (Read this aloud)",
+             placeholder="Click 'Generate Sentence' to get started...",
+             interactive=False,
+             lines=3
+         )
+
+         audio_input = gr.Audio(
+             sources=["microphone", "upload"],
+             type="filepath",
+             label="🎤 Record Your Pronunciation"
+         )
+
+         analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary")
+
+         status_output = gr.Textbox(
+             label="📊 Analysis Results",
+             interactive=False,
+             lines=3
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 pass1_out = gr.Textbox(
+                     label="🎯 What You Actually Said (Raw Output)",
+                     interactive=False,
+                     lines=2
+                 )
+                 wer_out = gr.Textbox(
+                     label="📈 Word Accuracy",
+                     interactive=False
+                 )
+
+             with gr.Column():
+                 pass2_out = gr.Textbox(
+                     label="🔧 Target-Biased Analysis",
+                     interactive=False,
+                     lines=2
+                 )
+                 cer_out = gr.Textbox(
+                     label="📊 Character Accuracy",
+                     interactive=False
+                 )
+
+         with gr.Accordion("📝 Detailed Visual Feedback", open=True):
+             gr.Markdown("""
+             ### 🎨 Color Guide:
+             - 🟢 **Green**: Correctly pronounced words/characters
+             - 🔴 **Red**: Missing or mispronounced (strikethrough)
+             - 🟠 **Orange**: Extra words or substitutions
+             """)
+
+             diff_html_box = gr.HTML(
+                 label="🔍 Word-Level Analysis",
+                 show_label=True
+             )
+             char_html_box = gr.HTML(
+                 label="🔤 Character-Level Analysis",
+                 show_label=True
+             )
+
+         target_display = gr.Textbox(
+             label="🎯 Reference Text",
+             interactive=False,
+             visible=False
+         )
+
+         # Event handlers for buttons
          gen_btn.click(
              fn=get_random_sentence,
              inputs=[lang_choice],
@@ -30,4 +548,40 @@
              fn=get_random_sentence,
              inputs=[lang_choice],
              outputs=[intended_display]
-         )
+         )
+
+         # Footer
+         gr.Markdown("""
+         ---
+         ### 🔧 Technical Details:
+         - **ASR Models**:
+           - **Tamil**: AI4Bharat Whisper-LARGE-TA (~1.5GB, maximum accuracy)
+           - **Malayalam**: AI4Bharat Whisper-LARGE-ML (~1.5GB, maximum accuracy)
+           - **English**: OpenAI Whisper-Base-EN (optimized for English)
+         - **Performance**: The largest available models are used for the best pronunciation assessment
+         - **Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
+         - **Transliteration**: Harvard-Kyoto system for Indic scripts
+         - **Analysis**: Dual-pass approach for comprehensive feedback
+
+         **Note**: Large models provide maximum accuracy but require longer initial loading time.
+         **Languages**: English, Tamil, and Malayalam with specialized large models.
+         """)
+
+     return demo
+
+ # ---------------- LAUNCH ---------------- #
+ if __name__ == "__main__":
+     print("🚀 Starting Multilingual Pronunciation Trainer with LARGE models...")
+     print(f"🔧 Device: {DEVICE}")
+     print(f"🔧 PyTorch version: {torch.__version__}")
+     print("📦 Models will be loaded on demand with GPU acceleration...")
+     print("⚡ Using AI4Bharat LARGE models for maximum accuracy!")
+     print("🎮 GPU functions decorated with @spaces.GPU for Hugging Face Spaces")
+
+     demo = create_interface()
+     demo.launch(
+         share=True,
+         show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )