sudhanm committed
Commit 05566a8 · verified · 1 Parent(s): ca298ac

Update app.py

Files changed (1):
    app.py (+494, -355)
app.py CHANGED
@@ -1,9 +1,14 @@
 import gradio as gr
-import random, difflib, re, warnings, contextlib
 import torch
 import numpy as np
-import librosa, soundfile as sf
-import jiwer

 # Optional transliteration
 try:
@@ -12,6 +17,7 @@ try:
     INDIC_OK = True
 except:
     INDIC_OK = False

 # Optional HF Spaces GPU decorator
 try:
@@ -31,512 +37,645 @@ DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 amp_ctx = torch.cuda.amp.autocast if DEVICE == "cuda" else contextlib.nullcontext
 print(f"🔧 Using device: {DEVICE}")

-LANG_CODES = {"English": "en", "Tamil": "ta", "Malayalam": "ml"}

-# Primary: IndicWhisper - Fixed model configuration
 INDICWHISPER_MODEL = "parthiv11/indic_whisper_nodcil"

-# Specialised fallbacks
 SPECIALIZED_MODELS = {
     "English": "openai/whisper-base.en",
-    "Tamil": "vasista22/whisper-tamil-large-v2",
     "Malayalam": "thennal/whisper-medium-ml",
 }

 SCRIPT_PATTERNS = {
     "Tamil": re.compile(r"[஀-௿]"),
     "Malayalam": re.compile(r"[ഀ-ൿ]"),
-    "English": re.compile(r"[A-Za-z]"),
 }

 SENTENCE_BANK = {
     "English": [
-        "The sun sets over the beautiful horizon.",
-        "Hard work always pays off in the end."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
-        "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
-        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
     ]
 }

 # Model cache
-indicwhisper_pipeline = None
-fallback_models = {}
-WHISPER_JAX_AVAILABLE = False

 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return random.choice(SENTENCE_BANK[language_choice])

-def is_script(text, lang_name):
-    p = SCRIPT_PATTERNS.get(lang_name)
-    return not p or bool(p.search(text or ""))
-
-def transliterate_to_hk(text, lang_choice):
-    if not INDIC_OK:
         return text
-    mapping = {"Tamil": sanscript.TAMIL, "Malayalam": sanscript.MALAYALAM, "English": None}
-    script = mapping.get(lang_choice)
-    if script and is_script(text, lang_choice):
-        try: return transliterate(text, script, sanscript.HK)
-        except: return text
-    return text

 def preprocess_audio(audio_path, target_sr=16000):
     try:
         audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
-        if audio is None or len(audio) == 0: return None, None
         audio = audio.astype(np.float32)
-        m = np.max(np.abs(audio))
-        if m > 0: audio /= m
         audio, _ = librosa.effects.trim(audio, top_db=20)
-        if len(audio) < int(target_sr*0.1): return None, None
         return audio, target_sr
-    except: return None, None
-
-JIWER_TRANSFORM = jiwer.Compose([
-    jiwer.ToLowerCase(), jiwer.RemovePunctuation(),
-    jiwer.RemoveMultipleSpaces(), jiwer.Strip(),
-    jiwer.ReduceToListOfListOfWords()
-])
-def compute_wer(ref, hyp):
-    try: return jiwer.wer(ref, hyp, truth_transform=JIWER_TRANSFORM, hypothesis_transform=JIWER_TRANSFORM)
-    except: return 1.0
-def compute_cer(ref, hyp):
-    try: return jiwer.cer(ref, hyp)
-    except: return 1.0

 # ---------------- MODEL LOADERS ---------------- #
 @GPU_DECORATOR
-def load_indicwhisper():
-    """
-    Load IndicWhisper with explicit architecture matching.
-    Fixed to handle the Large model architecture properly.
-    """
-    global indicwhisper_pipeline, WHISPER_JAX_AVAILABLE
-    if indicwhisper_pipeline is not None:
-        return indicwhisper_pipeline
-
-    # Try JAX first with explicit config
     try:
-        from whisper_jax import FlaxWhisperPipeline
-        import jax.numpy as jnp
-        print(f"🔄 Loading JAX IndicWhisper: {INDICWHISPER_MODEL}")

-        # Try with explicit model size specification
-        try:
-            indicwhisper_pipeline = FlaxWhisperPipeline(
-                INDICWHISPER_MODEL,
-                dtype=jnp.bfloat16,
-                batch_size=1
-            )
-        except Exception as model_error:
-            print(f"⚠️ Direct JAX loading failed: {model_error}")
-            # Fallback to specifying base model architecture
-            indicwhisper_pipeline = FlaxWhisperPipeline(
-                "openai/whisper-large-v2",  # Base architecture
-                checkpoint=INDICWHISPER_MODEL,  # Fine-tuned weights
-                dtype=jnp.bfloat16,
-                batch_size=1
-            )
-
-        WHISPER_JAX_AVAILABLE = True
-        print("✅ JAX IndicWhisper loaded!")
-        return indicwhisper_pipeline
-    except Exception as e:
-        print(f"⚠️ JAX unavailable: {e}")
-        WHISPER_JAX_AVAILABLE = False
-
-    # Transformers fallback with explicit model loading
-    try:
-        from transformers import (
-            AutoProcessor,
-            AutoModelForSpeechSeq2Seq,
-            WhisperProcessor,
-            WhisperForConditionalGeneration,
-            pipeline
         )

-        print(f"🔄 Loading Transformers IndicWhisper: {INDICWHISPER_MODEL}")
-
-        # Method 1: Try direct loading with trust_remote_code
-        try:
-            indicwhisper_pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=INDICWHISPER_MODEL,
-                tokenizer=INDICWHISPER_MODEL,
-                feature_extractor=INDICWHISPER_MODEL,
-                device=DEVICE_INDEX,
-                trust_remote_code=True,
-                torch_dtype=DTYPE
-            )
-            print("✅ Direct Transformers loading successful!")
-            return indicwhisper_pipeline
-        except Exception as direct_error:
-            print(f"⚠️ Direct loading failed: {direct_error}")
-
-        # Method 2: Load with explicit base model architecture
-        try:
-            # Load processor from the fine-tuned model
-            processor = AutoProcessor.from_pretrained(INDICWHISPER_MODEL)
-
-            # Load model with explicit architecture handling
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                INDICWHISPER_MODEL,
-                torch_dtype=DTYPE,
-                device_map="auto" if DEVICE == "cuda" else None,
-                trust_remote_code=True
-            ).to(DEVICE)
-
-            # Create pipeline manually
-            indicwhisper_pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=model,
-                tokenizer=processor.tokenizer,
-                feature_extractor=processor.feature_extractor,
-                device=DEVICE_INDEX
-            )
-            print("✅ Manual Transformers loading successful!")
-            return indicwhisper_pipeline
-        except Exception as manual_error:
-            print(f"⚠️ Manual loading failed: {manual_error}")
-
-        # Method 3: Fallback to base Whisper Large model
-        print("🔄 Falling back to base Whisper Large model...")
-        indicwhisper_pipeline = pipeline(
-            "automatic-speech-recognition",
             model="openai/whisper-large-v2",
             device=DEVICE_INDEX,
             torch_dtype=DTYPE
         )
-        print("✅ Base Whisper Large loaded as fallback!")
-        return indicwhisper_pipeline
-
-    except Exception as e:
-        print(f"❌ All loading methods failed: {e}")
-        raise e

-@GPU_DECORATOR
 def load_specialized_model(language):
-    if language in fallback_models:
-        return fallback_models[language]

     try:
-        from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-        name = SPECIALIZED_MODELS[language]
-        print(f"🔄 Loading specialized model for {language}: {name}")
-
-        proc = AutoProcessor.from_pretrained(name)
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            name,
             torch_dtype=DTYPE,
-            device_map="auto" if DEVICE == "cuda" else None,
-            trust_remote_code=True
         ).to(DEVICE)

-        fallback_models[language] = {"processor": proc, "model": model}
         print(f"✅ Specialized {language} model loaded!")
-        return fallback_models[language]
     except Exception as e:
-        print(f"❌ Failed to load specialized model for {language}: {e}")
-        raise e

-# ---------------- TRANSCRIBE ---------------- #
 @GPU_DECORATOR
-def transcribe_with_primary_model(audio_path, language):
     try:
-        pipe = load_indicwhisper()
-        lang_code = LANG_CODES.get(language, "en")
-
-        # JAX path
-        if WHISPER_JAX_AVAILABLE:
-            result = pipe(audio_path, task="transcribe", language=lang_code)
-            if isinstance(result, dict) and "text" in result:
-                return result["text"].strip()
-            return str(result).strip()
-
-        # Transformers path
         try:
-            # Enhanced language forcing
-            if hasattr(pipe, "model") and hasattr(pipe, "tokenizer"):
-                # Get forced decoder IDs for language
-                forced_ids = pipe.tokenizer.get_decoder_prompt_ids(
                     language=lang_code,
                     task="transcribe"
                 )
-                if forced_ids is not None:
-                    pipe.model.config.forced_decoder_ids = forced_ids
-                    print(f"🔧 Language forced to: {lang_code}")
         except Exception as e:
             print(f"⚠️ Language forcing failed: {e}")
-
-        # Transcribe with enhanced parameters
         with amp_ctx():
-            out = pipe(
-                audio_path,
-                generate_kwargs={
-                    "language": lang_code,
-                    "task": "transcribe",
-                    "max_length": 200,
-                    "num_beams": 3
-                }
-            )

-        if isinstance(out, dict):
-            return (out.get("text") or "").strip()
-        return str(out).strip()

     except Exception as e:
-        error_msg = f"Primary model error: {str(e)}"
-        print(f"❌ {error_msg}")
-        return error_msg

 @GPU_DECORATOR
-def transcribe_with_specialized_model(audio_path, language):
     try:
-        comp = load_specialized_model(language)
         audio, sr = preprocess_audio(audio_path)
-        if audio is None:
-            return "Error: Audio preprocessing failed or audio too short"

-        # Process audio
-        inputs = comp["processor"](
-            audio,
-            sampling_rate=sr,
             return_tensors="pt"
         )
-        feats = inputs.input_features.to(DEVICE)

         # Generation parameters
         gen_kwargs = {
-            "inputs": feats,
-            "max_length": 200,
             "num_beams": 3,
             "do_sample": False
         }

-        # Language forcing for non-English models
         if language != "English":
             try:
-                forced_ids = comp["processor"].tokenizer.get_decoder_prompt_ids(
-                    language=LANG_CODES[language],
                     task="transcribe"
                 )
-                if forced_ids is not None:
                     gen_kwargs["forced_decoder_ids"] = forced_ids
-                    print(f"🔧 Specialized model language forced to: {LANG_CODES[language]}")
             except Exception as e:
                 print(f"⚠️ Specialized language forcing failed: {e}")

         # Generate transcription
         with torch.no_grad(), amp_ctx():
-            ids = comp["model"].generate(**gen_kwargs)

-        # Decode
-        text = comp["processor"].batch_decode(
-            ids,
             skip_special_tokens=True
         )[0]
-        return text.strip()

     except Exception as e:
-        error_msg = f"Specialized model error: {str(e)}"
-        print(f"❌ {error_msg}")
-        return error_msg

-@GPU_DECORATOR
-def transcribe_audio(audio_path, language, use_specialized=False):
-    """
-    Enhanced transcription with better error handling and fallback logic.
-    """
-    if not audio_path or not language:
-        return "Error: Invalid audio path or language"
-
     try:
-        if use_specialized:
-            result = transcribe_with_specialized_model(audio_path, language)
-            # If specialized model fails, try primary as fallback
-            if result.startswith("Error:") or result.startswith("Specialized model error:"):
-                print(f"⚠️ Specialized model failed, trying primary: {result}")
-                return transcribe_with_primary_model(audio_path, language)
-            return result
-        else:
-            result = transcribe_with_primary_model(audio_path, language)
-            # If primary model fails, try specialized as fallback
-            if result.startswith("Error:") or result.startswith("Primary model error:"):
-                print(f"⚠️ Primary model failed, trying specialized: {result}")
-                return transcribe_with_specialized_model(audio_path, language)
-            return result
     except Exception as e:
-        return f"Transcription error: {str(e)}"

-# ---------------- MAIN ---------------- #
-def get_score(wer, cer):
-    c = (wer*0.7)+(cer*0.3)
-    if c <= 0.1: return "🏆 Excellent!", "Outstanding pronunciation!"
-    elif c <= 0.2: return "🎉 Very Good!", "Minor improvements needed."
-    elif c <= 0.4: return "👍 Good!", "Keep practicing."
-    elif c <= 0.6: return "📚 Needs Practice", "Focus on clearer pronunciation."
-    else: return "💪 Keep Trying!", "Don't give up! Try speaking more slowly."

-def diff_html(ref, hyp):
-    ref_w, hyp_w = ref.split(), hyp.split()
-    sm = difflib.SequenceMatcher(None, ref_w, hyp_w)
-    out = []
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == 'equal':
-            out += [f"<span style='color:green; background-color:#e8f5e8'>{w}</span>" for w in ref_w[i1:i2]]
-        elif tag == 'replace':
-            out += [f"<span style='color:red; background-color:#ffe8e8'>{w}</span>" for w in ref_w[i1:i2]]
-            out += [f"<span style='color:orange; background-color:#fff3e8'>→{w}</span>" for w in hyp_w[j1:j2]]
-        elif tag == 'delete':
-            out += [f"<span style='color:red; background-color:#ffe8e8'>{w}</span>" for w in ref_w[i1:i2]]
-        elif tag == 'insert':
-            out += [f"<span style='color:orange; background-color:#fff3e8'>+{w}</span>" for w in hyp_w[j1:j2]]
-    return " ".join(out)

-def char_html(ref, hyp):
-    sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
-    out = []
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == 'equal':
-            out += [f"<span style='color:green; background-color:#e8f5e8'>{c}</span>" for c in ref[i1:i2]]
-        elif tag in ('replace', 'delete'):
-            out += [f"<span style='color:red; background-color:#ffe8e8'>{c}</span>" for c in ref[i1:i2]]
-        elif tag == 'insert':
-            out += [f"<span style='color:orange; background-color:#fff3e8'>{c}</span>" for c in hyp[j1:j2]]
-    return "".join(out)

 @GPU_DECORATOR
-def compare_pronunciation(audio, lang_choice, intended):
     if audio is None:
-        return ("❌ Please record audio first", "", "", "", "", "", "", "")
-    if not intended.strip():
-        return ("❌ Please generate a sentence first", "", "", "", "", "", "", "")
-
-    print(f"🔍 Analyzing pronunciation for {lang_choice}...")

     # Get transcriptions from both models
-    ptext = transcribe_audio(audio, lang_choice, False)  # Primary (IndicWhisper)
-    stext = transcribe_audio(audio, lang_choice, True)   # Specialized
-
-    # Choose the best transcription (prefer primary if successful)
-    if not ptext.startswith("Error:") and ptext.strip():
-        actual = ptext
-        source = "Primary (IndicWhisper)"
-    elif not stext.startswith("Error:") and stext.strip():
-        actual = stext
-        source = "Specialized"
     else:
-        return (f"❌ Both models failed:\nPrimary: {ptext}\nSpecialized: {stext}",
-                ptext, stext, "", "", "", "", f"🎯 Target: {intended}")
-
     # Compute metrics
-    wer_val = compute_wer(intended, actual)
-    cer_val = compute_cer(intended, actual)
-    score, feedback = get_score(wer_val, cer_val)
-
-    print(f"✅ Analysis complete - WER: {wer_val:.3f}, CER: {cer_val:.3f}")
-
-    return (f"✅ Analysis Complete - {score}\n💬 {feedback}\n🤖 Best result from: {source}",
-            ptext, stext,
-            f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% accuracy)",
-            f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% accuracy)",
-            diff_html(intended, actual),
-            char_html(intended, actual),
-            f"🎯 Target: {intended}")

 # ---------------- UI ---------------- #
 def create_interface():
-    with gr.Blocks(title="IndicWhisper Pronunciation Trainer") as demo:
         gr.Markdown("""
-        # 🎙️ IndicWhisper Pronunciation Trainer

-        Practice pronunciation in English, Tamil, and Malayalam with AI-powered feedback!

-        **Instructions:**
-        1. Select your language
-        2. Generate a practice sentence
-        3. Record yourself saying the sentence
-        4. Get detailed pronunciation analysis
         """)

         with gr.Row():
             with gr.Column(scale=2):
-                lang = gr.Dropdown(
-                    choices=list(LANG_CODES.keys()),
-                    value="Tamil",
                     label="🌍 Select Language"
                 )
             with gr.Column(scale=1):
-                btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

-        intended = gr.Textbox(
-            label="📝 Practice Sentence",
-            interactive=False,
-            lines=3,
             placeholder="Click 'Generate Practice Sentence' to get started..."
         )

-        audio = gr.Audio(
-            sources=["microphone", "upload"],
             type="filepath",
             label="🎤 Record Your Pronunciation"
         )

-        analyze = gr.Button("🔍 Analyze Pronunciation", variant="secondary", size="lg")

         with gr.Row():
-            status = gr.Textbox(
-                label="📊 Analysis Results",
-                interactive=False,
-                lines=4
             )

         with gr.Row():
             with gr.Column():
-                pass1 = gr.Textbox(label="🤖 Primary Model (IndicWhisper)", interactive=False)
-                wer = gr.Textbox(label="📈 Word Accuracy", interactive=False)
             with gr.Column():
-                pass2 = gr.Textbox(label="🎯 Specialized Model", interactive=False)
-                cer = gr.Textbox(label="📊 Character Accuracy", interactive=False)

-        with gr.Accordion("📋 Detailed Analysis", open=False):
-            diff = gr.HTML(label="🔤 Word-by-Word Comparison")
-            chars = gr.HTML(label="🔍 Character-by-Character Analysis")
-            target = gr.Textbox(label="🎯 Reference Text", visible=False)

         # Event handlers
-        btn.click(get_random_sentence, [lang], [intended])
-        analyze.click(
-            compare_pronunciation,
-            [audio, lang, intended],
-            [status, pass1, pass2, wer, cer, diff, chars, target]
         )
-        lang.change(get_random_sentence, [lang], [intended])

         gr.Markdown("""
-        ### 📝 Tips for Better Results:
-        - Speak clearly and at a moderate pace
-        - Ensure good audio quality (minimal background noise)
-        - Try to pronounce each word distinctly
-        - Practice multiple times with the same sentence to see improvement
         """)

     return demo

 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
-    print("🚀 Starting IndicWhisper Pronunciation Trainer...")
     demo = create_interface()
     demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
         share=True,
-        show_error=True,
-        debug=True
     )
 
 import gradio as gr
+import random
+import difflib
+import re
+import jiwer
 import torch
+import warnings
+import contextlib
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
+import librosa
 import numpy as np

 # Optional transliteration
 try:
     INDIC_OK = True
 except:
     INDIC_OK = False
+    print("⚠️ indic_transliteration not available. Transliteration features disabled.")

 # Optional HF Spaces GPU decorator
 try:
 amp_ctx = torch.cuda.amp.autocast if DEVICE == "cuda" else contextlib.nullcontext
 print(f"🔧 Using device: {DEVICE}")

+LANG_CODES = {
+    "English": "en",
+    "Tamil": "ta",
+    "Malayalam": "ml",
+    "Hindi": "hi"
+}

+# Primary: IndicWhisper
 INDICWHISPER_MODEL = "parthiv11/indic_whisper_nodcil"

+# Specialized models for better accuracy
 SPECIALIZED_MODELS = {
     "English": "openai/whisper-base.en",
+    "Tamil": "vasista22/whisper-tamil-large-v2",
     "Malayalam": "thennal/whisper-medium-ml",
+    "Hindi": "openai/whisper-large-v2"  # Using general model for Hindi
 }

 SCRIPT_PATTERNS = {
     "Tamil": re.compile(r"[஀-௿]"),
     "Malayalam": re.compile(r"[ഀ-ൿ]"),
+    "Hindi": re.compile(r"[ऀ-ॿ]"),
+    "English": re.compile(r"[A-Za-z]")
+}
+
+# Transliteration mappings
+TRANSLITERATION_SCRIPTS = {
+    "Tamil": sanscript.TAMIL,
+    "Malayalam": sanscript.MALAYALAM,
+    "Hindi": sanscript.DEVANAGARI,
+    "English": None
+} if INDIC_OK else {}  # guard added: sanscript is undefined when the optional import fails
+
 SENTENCE_BANK = {
     "English": [
+        "The sun sets over the horizon.",
+        "Learning languages is fun and rewarding.",
+        "I like to drink coffee in the morning.",
+        "Technology helps us connect with others.",
+        "Reading books expands our knowledge."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
+        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
+        "எனக்கு புத்தகம் படிக்க விருப்பம்.",
+        "காலையில் காபி குடிக்க பிடிக்கும்.",
+        "நண்பர்களுடன் பேசுவது மகிழ்ச்சி."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
+        "ഇന്ന് മഴപെയ്യുന്നു.",
+        "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
+        "കാലയിൽ ചായ കുടിക്കാൻ ഇഷ്ടമാണ്.",
+        "സുഹൃത്തുക്കളോട് സംസാരിക്കുന്നത് സന്തോഷമാണ്."
+    ],
+    "Hindi": [
+        "आज मौसम अच्छा है।",
+        "मुझे हिंदी बोलना पसंद है।",
+        "मैं किताब पढ़ रहा हूँ।",
+        "सुबह चाय पीना अच्छा लगता है।",
+        "दोस्तों के साथ बात करना खुशी देता है।"
     ]
 }

 # Model cache
+primary_pipeline = None
+specialized_models = {}

 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return random.choice(SENTENCE_BANK[language_choice])

+def is_correct_script(text, lang_name):
+    """Check if text contains the expected script for the language"""
+    if not text.strip():
+        return False
+    pattern = SCRIPT_PATTERNS.get(lang_name)
+    if not pattern:
+        return True
+    return bool(pattern.search(text))
+
+def transliterate_text(text, lang_choice, to_romanized=True):
+    """Transliterate text to/from romanized form"""
+    if not INDIC_OK or not text.strip():
+        return text
+
+    source_script = TRANSLITERATION_SCRIPTS.get(lang_choice)
+    if not source_script:
+        return text
+
+    try:
+        if to_romanized:
+            # Convert to Harvard-Kyoto (romanized)
+            return transliterate(text, source_script, sanscript.HK)
+        else:
+            # Convert from romanized to native script (if needed)
+            return transliterate(text, sanscript.HK, source_script)
+    except Exception as e:
+        print(f"⚠️ Transliteration failed: {e}")
         return text
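A minimal usage sketch for the helper above (assuming indic_transliteration is installed; the exact spelling depends on the library's Tamil-to-HK mapping):

sample = transliterate_text("இன்று நல்ல வானிலை உள்ளது.", "Tamil", to_romanized=True)
print(sample)  # a Harvard-Kyoto romanization, roughly "inRu nalla vAnilai uLLatu."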
 
 
 
 
 
 

 def preprocess_audio(audio_path, target_sr=16000):
+    """Enhanced audio preprocessing"""
     try:
         audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
+        if audio is None or len(audio) == 0:
+            return None, None
+
+        # Normalize audio
         audio = audio.astype(np.float32)
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            audio = audio / max_val
+
+        # Trim silence
         audio, _ = librosa.effects.trim(audio, top_db=20)
+
+        # Check minimum length (0.1 seconds)
+        if len(audio) < int(target_sr * 0.1):
+            return None, None
+
         return audio, target_sr
+    except Exception as e:
+        print(f"⚠️ Audio preprocessing failed: {e}")
+        return None, None
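A self-contained smoke test for preprocess_audio, using a synthetic tone written to a temporary WAV file (a sketch; soundfile was imported in the old version and is assumed to still be installed):

import tempfile
import soundfile as sf

tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)  # 1 s at 440 Hz
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
    sf.write(tmp.name, tone, 16000)
processed, rate = preprocess_audio(tmp.name)
print(processed.shape, rate)  # roughly (16000,) and 16000; trimming may shave a few samples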
 
 
 
 
 
 
 
 
 
 

 # ---------------- MODEL LOADERS ---------------- #
 @GPU_DECORATOR
+def load_primary_model():
+    """Load the primary IndicWhisper model"""
+    global primary_pipeline
+    if primary_pipeline is not None:
+        return primary_pipeline
+
     try:
+        print(f"🔄 Loading primary model: {INDICWHISPER_MODEL}")

+        # Try direct loading first
+        primary_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model=INDICWHISPER_MODEL,
+            device=DEVICE_INDEX,
+            torch_dtype=DTYPE,
+            trust_remote_code=True
         )
+        print("✅ Primary model loaded successfully!")
+        return primary_pipeline

+    except Exception as e:
+        print(f"⚠️ Primary model failed, using fallback: {e}")
+        # Fallback to base Whisper
+        primary_pipeline = pipeline(
+            "automatic-speech-recognition",
             model="openai/whisper-large-v2",
             device=DEVICE_INDEX,
             torch_dtype=DTYPE
         )
+        print("✅ Fallback model loaded!")
+        return primary_pipeline
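Once cached, the returned pipeline can be called directly on a file path; a minimal sketch (the WAV path is hypothetical):

asr = load_primary_model()
result = asr("sample_16khz.wav")  # hypothetical recording
print(result["text"])  # transformers ASR pipelines return a dict with a "text" key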
 
 
 
 

+@GPU_DECORATOR
 def load_specialized_model(language):
+    """Load specialized model for specific language"""
+    if language in specialized_models:
+        return specialized_models[language]
+
+    model_name = SPECIALIZED_MODELS[language]
+    print(f"🔄 Loading specialized {language} model: {model_name}")

     try:
+        processor = AutoProcessor.from_pretrained(model_name)
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_name,
             torch_dtype=DTYPE,
+            device_map="auto" if DEVICE == "cuda" else None
         ).to(DEVICE)

+        specialized_models[language] = {
+            "processor": processor,
+            "model": model
+        }
         print(f"✅ Specialized {language} model loaded!")
+        return specialized_models[language]
+
     except Exception as e:
+        print(f"❌ Failed to load specialized {language} model: {e}")
+        return None

+# ---------------- TRANSCRIPTION ---------------- #
 @GPU_DECORATOR
+def transcribe_with_primary(audio_path, language):
+    """Transcribe using primary IndicWhisper model"""
     try:
+        pipeline_model = load_primary_model()
+        lang_code = LANG_CODES[language]
+
+        # Set language forcing if possible
         try:
+            if hasattr(pipeline_model, "model") and hasattr(pipeline_model, "tokenizer"):
+                forced_ids = pipeline_model.tokenizer.get_decoder_prompt_ids(
                     language=lang_code,
                     task="transcribe"
                 )
+                if forced_ids:
+                    pipeline_model.model.config.forced_decoder_ids = forced_ids
         except Exception as e:
             print(f"⚠️ Language forcing failed: {e}")
+
         with amp_ctx():
+            result = pipeline_model(audio_path)

+        if isinstance(result, dict):
+            return result.get("text", "").strip()
+        return str(result).strip()

     except Exception as e:
+        return f"Primary transcription error: {str(e)}"
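For reference, get_decoder_prompt_ids returns (position, token_id) pairs that pin Whisper's language and task tokens at the start of generation; a small sketch against a base checkpoint (the printed ids vary by tokenizer):

from transformers import WhisperTokenizer
tok = WhisperTokenizer.from_pretrained("openai/whisper-base")
print(tok.get_decoder_prompt_ids(language="ta", task="transcribe"))
# e.g. [(1, <Tamil token id>), (2, <transcribe token id>), (3, <no-timestamps token id>)]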

 @GPU_DECORATOR
+def transcribe_with_specialized(audio_path, language):
+    """Transcribe using specialized model"""
     try:
+        model_components = load_specialized_model(language)
+        if not model_components:
+            return "Specialized model not available"
+
+        # Preprocess audio
         audio, sr = preprocess_audio(audio_path)
+        if audio is None:
+            return "Audio preprocessing failed"

+        # Process with specialized model
+        inputs = model_components["processor"](
+            audio,
+            sampling_rate=sr,
             return_tensors="pt"
         )
+
+        input_features = inputs.input_features.to(DEVICE)

         # Generation parameters
         gen_kwargs = {
+            "inputs": input_features,
+            "max_length": 200,
             "num_beams": 3,
             "do_sample": False
         }

+        # Language forcing for non-English
         if language != "English":
             try:
+                forced_ids = model_components["processor"].tokenizer.get_decoder_prompt_ids(
+                    language=LANG_CODES[language],
                     task="transcribe"
                 )
+                if forced_ids:
                     gen_kwargs["forced_decoder_ids"] = forced_ids
             except Exception as e:
                 print(f"⚠️ Specialized language forcing failed: {e}")

         # Generate transcription
         with torch.no_grad(), amp_ctx():
+            generated_ids = model_components["model"].generate(**gen_kwargs)

+        # Decode result
+        transcription = model_components["processor"].batch_decode(
+            generated_ids,
             skip_special_tokens=True
         )[0]
+
+        return transcription.strip()

     except Exception as e:
+        return f"Specialized transcription error: {str(e)}"

+# ---------------- ANALYSIS ---------------- #
+def compute_metrics(reference, hypothesis):
+    """Compute WER and CER with error handling"""
     try:
+        # Clean up texts
+        ref_clean = reference.strip()
+        hyp_clean = hypothesis.strip()
+
+        if not ref_clean or not hyp_clean:
+            return 1.0, 1.0
+
+        # Compute WER and CER
+        wer = jiwer.wer(ref_clean, hyp_clean)
+        cer = jiwer.cer(ref_clean, hyp_clean)
+
+        return wer, cer
     except Exception as e:
+        print(f"⚠️ Metric computation failed: {e}")
+        return 1.0, 1.0
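A worked example of the two metrics with jiwer's defaults (counts checked by hand):

ref, hyp = "the cat sat", "the cat sit"
print(jiwer.wer(ref, hyp))  # 1 substituted word out of 3  -> ~0.333
print(jiwer.cer(ref, hyp))  # 1 substituted char out of 11 -> ~0.091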
+
+def get_pronunciation_score(wer, cer):
+    """Convert error rates to intuitive scores and feedback"""
+    # Weighted combination (WER is more important)
+    combined_error = (wer * 0.7) + (cer * 0.3)
+    accuracy = 1 - combined_error
+
+    if accuracy >= 0.95:
+        return "🏆 Perfect!", "Outstanding pronunciation! Native-like accuracy.", "#d4edda"
+    elif accuracy >= 0.85:
+        return "🎉 Excellent!", "Very good pronunciation with minor variations.", "#d1ecf1"
+    elif accuracy >= 0.70:
+        return "👍 Good!", "Good pronunciation, practice specific sounds.", "#fff3cd"
+    elif accuracy >= 0.50:
+        return "📚 Needs Practice", "Focus on clearer pronunciation and rhythm.", "#f8d7da"
+    else:
+        return "💪 Keep Trying!", "Break down into smaller parts and practice slowly.", "#f5c6cb"

+def create_detailed_comparison(intended, actual, lang_choice):
+    """Create detailed side-by-side comparison with transliteration"""
+
+    # Original scripts
+    intended_orig = intended.strip()
+    actual_orig = actual.strip()
+
+    # Transliterations
+    intended_translit = transliterate_text(intended_orig, lang_choice, to_romanized=True)
+    actual_translit = transliterate_text(actual_orig, lang_choice, to_romanized=True)
+
+    # Word-level highlighting
+    word_diff_orig = highlight_word_differences(intended_orig, actual_orig)
+    word_diff_translit = highlight_word_differences(intended_translit, actual_translit)
+
+    # Character-level highlighting
+    char_diff_orig = highlight_char_differences(intended_orig, actual_orig)
+    char_diff_translit = highlight_char_differences(intended_translit, actual_translit)
+
+    return {
+        "intended_orig": intended_orig,
+        "actual_orig": actual_orig,
+        "intended_translit": intended_translit,
+        "actual_translit": actual_translit,
+        "word_diff_orig": word_diff_orig,
+        "word_diff_translit": word_diff_translit,
+        "char_diff_orig": char_diff_orig,
+        "char_diff_translit": char_diff_translit
+    }
+
+def highlight_word_differences(reference, hypothesis):
+    """Highlight word-level differences with colors"""
+    ref_words = reference.split()
+    hyp_words = hypothesis.split()
+
+    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
+    html_output = []
+
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == 'equal':
+            # Correct words - green background
+            html_output.extend([
+                f"<span style='background-color:#d4edda; color:#155724; padding:2px 4px; margin:1px; border-radius:3px'>{word}</span>"
+                for word in ref_words[i1:i2]
+            ])
+        elif tag == 'replace':
+            # Wrong words - red background for reference, orange for hypothesis
+            html_output.extend([
+                f"<span style='background-color:#f8d7da; color:#721c24; padding:2px 4px; margin:1px; border-radius:3px; text-decoration:line-through'>{word}</span>"
+                for word in ref_words[i1:i2]
+            ])
+            html_output.extend([
+                f"<span style='background-color:#fff3cd; color:#856404; padding:2px 4px; margin:1px; border-radius:3px'>→{word}</span>"
+                for word in hyp_words[j1:j2]
+            ])
+        elif tag == 'delete':
+            # Missing words - red background
+            html_output.extend([
+                f"<span style='background-color:#f8d7da; color:#721c24; padding:2px 4px; margin:1px; border-radius:3px; text-decoration:line-through'>{word}</span>"
+                for word in ref_words[i1:i2]
+            ])
+        elif tag == 'insert':
+            # Extra words - orange background
+            html_output.extend([
+                f"<span style='background-color:#fff3cd; color:#856404; padding:2px 4px; margin:1px; border-radius:3px'>+{word}</span>"
+                for word in hyp_words[j1:j2]
+            ])
+
+    return " ".join(html_output)

+def highlight_char_differences(reference, hypothesis):
+    """Highlight character-level differences"""
+    sm = difflib.SequenceMatcher(None, list(reference), list(hypothesis))
+    html_output = []
+
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == 'equal':
+            # Correct characters - green
+            html_output.extend([
+                f"<span style='color:#28a745'>{char}</span>"
+                for char in reference[i1:i2]
+            ])
+        elif tag in ('replace', 'delete'):
+            # Wrong/missing characters - red with underline
+            html_output.extend([
+                f"<span style='color:#dc3545; text-decoration:underline; font-weight:bold'>{char}</span>"
+                for char in reference[i1:i2]
+            ])
+        elif tag == 'insert':
+            # Extra characters - orange
+            html_output.extend([
+                f"<span style='color:#fd7e14; font-weight:bold'>{char}</span>"
+                for char in hypothesis[j1:j2]
+            ])
+
+    return "".join(html_output)

+def analyze_pronunciation_errors(intended, actual, lang_choice):
+    """Provide specific feedback about pronunciation errors"""
+    comparison = create_detailed_comparison(intended, actual, lang_choice)
+
+    # Analyze error patterns
+    intended_words = intended.split()
+    actual_words = actual.split()
+
+    error_analysis = []
+
+    # Length difference analysis
+    if len(actual_words) < len(intended_words):
+        missing_count = len(intended_words) - len(actual_words)
+        error_analysis.append(f"🔍 You missed {missing_count} word(s). Try speaking more slowly.")
+    elif len(actual_words) > len(intended_words):
+        extra_count = len(actual_words) - len(intended_words)
+        error_analysis.append(f"🔍 You added {extra_count} extra word(s). Focus on the exact sentence.")
+
+    # Script verification
+    if not is_correct_script(actual, lang_choice):
+        error_analysis.append(f"⚠️ The transcription doesn't contain {lang_choice} script. Check your pronunciation.")
+
+    # WER/CER based feedback
+    wer, cer = compute_metrics(intended, actual)
+
+    if wer > 0.5:
+        error_analysis.append("🎯 Focus on pronouncing each word clearly and separately.")
+    elif wer > 0.3:
+        error_analysis.append("🎯 Good overall, but some words need clearer pronunciation.")
+
+    if cer > 0.3:
+        error_analysis.append("🔤 Pay attention to individual sounds and syllables.")
+
+    return error_analysis, comparison

+# ---------------- MAIN FUNCTION ---------------- #
 @GPU_DECORATOR
+def compare_pronunciation(audio, language_choice, intended_sentence):
+    """Main function to analyze pronunciation"""
+
     if audio is None:
+        return ("❌ Please record audio first", "", "", "", "", "", "", "", "", "", "")
+
+    if not intended_sentence.strip():
+        return ("❌ Please generate a sentence first", "", "", "", "", "", "", "", "", "", "")
+
+    print(f"🔍 Analyzing pronunciation for {language_choice}...")

     # Get transcriptions from both models
+    primary_result = transcribe_with_primary(audio, language_choice)
+    specialized_result = transcribe_with_specialized(audio, language_choice)
+
+    # Choose best result (prefer specialized if successful)
+    if not specialized_result.startswith("Specialized") and specialized_result.strip():
+        best_transcription = specialized_result
+        best_source = "Specialized Model"
+    elif not primary_result.startswith("Primary") and primary_result.strip():
+        best_transcription = primary_result
+        best_source = "Primary Model"
     else:
+        return (
+            f"❌ Both models failed:\nPrimary: {primary_result}\nSpecialized: {specialized_result}",
+            "", "", "", "", "", "", "", "", "", ""
+        )
+
+    # Analyze pronunciation
+    error_analysis, comparison = analyze_pronunciation_errors(
+        intended_sentence, best_transcription, language_choice
+    )
+
     # Compute metrics
+    wer, cer = compute_metrics(intended_sentence, best_transcription)
+    score, feedback, color = get_pronunciation_score(wer, cer)
+
+    # Create status message
+    status_msg = f"""✅ Analysis Complete!
+
+{score}
+{feedback}
+
+🤖 Best result from: {best_source}
+📊 Word Accuracy: {(1-wer)*100:.1f}%
+📈 Character Accuracy: {(1-cer)*100:.1f}%
+
+🔍 Analysis:
+""" + "\n".join(error_analysis)
+
+    return (
+        status_msg,
+        primary_result,
+        specialized_result,
+        f"{wer:.3f} ({(1-wer)*100:.1f}%)",
+        f"{cer:.3f} ({(1-cer)*100:.1f}%)",
+        comparison["intended_orig"],
+        comparison["actual_orig"],
+        comparison["intended_translit"],
+        comparison["actual_translit"],
+        comparison["word_diff_orig"],
+        comparison["char_diff_orig"]
+    )

 # ---------------- UI ---------------- #
 def create_interface():
+    with gr.Blocks(title="Enhanced Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
+        # 🎙️ Enhanced Pronunciation Comparator

+        **Perfect your pronunciation in English, Tamil, Malayalam, and Hindi!**

+        This tool uses specialized AI models to give you detailed feedback on your pronunciation,
+        including transliteration to help you understand exactly where you need improvement.
+
+        ### How to use:
+        1. 🌍 Select your target language
+        2. 🎲 Generate a practice sentence
+        3. 🎤 Record yourself saying the sentence clearly
+        4. 🔍 Get detailed pronunciation analysis with transliteration
         """)

         with gr.Row():
             with gr.Column(scale=2):
+                language_dropdown = gr.Dropdown(
+                    choices=list(LANG_CODES.keys()),
+                    value="Tamil",
                     label="🌍 Select Language"
                 )
             with gr.Column(scale=1):
+                generate_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

+        intended_textbox = gr.Textbox(
+            label="📝 Practice Sentence",
+            interactive=False,
+            lines=2,
             placeholder="Click 'Generate Practice Sentence' to get started..."
         )

+        audio_input = gr.Audio(
+            sources=["microphone", "upload"],
             type="filepath",
             label="🎤 Record Your Pronunciation"
         )

+        analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="secondary", size="lg")

         with gr.Row():
+            status_output = gr.Textbox(
+                label="📊 Analysis Results",
+                interactive=False,
+                lines=8
             )

+        with gr.Accordion("🤖 Model Outputs", open=False):
+            with gr.Row():
+                primary_output = gr.Textbox(label="Primary Model (IndicWhisper)", interactive=False)
+                specialized_output = gr.Textbox(label="Specialized Model", interactive=False)
+
+        with gr.Accordion("📈 Detailed Metrics", open=False):
+            with gr.Row():
+                wer_output = gr.Textbox(label="Word Error Rate", interactive=False)
+                cer_output = gr.Textbox(label="Character Error Rate", interactive=False)
+
+        gr.Markdown("### 🔍 Detailed Comparison")
+
         with gr.Row():
             with gr.Column():
+                gr.Markdown("#### 📝 Original Script")
+                intended_orig = gr.Textbox(label="🎯 Target Text", interactive=False)
+                actual_orig = gr.Textbox(label="🗣️ What You Said", interactive=False)
             with gr.Column():
+                gr.Markdown("#### 🔤 Romanized (Transliterated)")
+                intended_translit = gr.Textbox(label="🎯 Target (Romanized)", interactive=False)
+                actual_translit = gr.Textbox(label="🗣️ What You Said (Romanized)", interactive=False)
+
+        gr.Markdown("### 🎨 Visual Comparison")
+        gr.Markdown("**Green** = Correct, **Red** = Wrong/Missing, **Orange** = Added/Substituted")

+        word_diff_html = gr.HTML(label="🔤 Word-by-Word Comparison")
+        char_diff_html = gr.HTML(label="🔍 Character-by-Character Analysis")

         # Event handlers
+        generate_btn.click(
+            fn=get_random_sentence,
+            inputs=[language_dropdown],
+            outputs=[intended_textbox]
+        )
+
+        analyze_btn.click(
+            fn=compare_pronunciation,
+            inputs=[audio_input, language_dropdown, intended_textbox],
+            outputs=[
+                status_output, primary_output, specialized_output,
+                wer_output, cer_output, intended_orig, actual_orig,
+                intended_translit, actual_translit, word_diff_html, char_diff_html
+            ]
+        )
+
+        language_dropdown.change(
+            fn=get_random_sentence,
+            inputs=[language_dropdown],
+            outputs=[intended_textbox]
         )

         gr.Markdown("""
+        ### 📚 Pro Tips for Better Pronunciation:
+
+        - **Speak slowly and clearly** - Don't rush through the sentence
+        - **Pronounce each syllable** - Break down complex words
+        - **Check the romanized version** - Use it to understand correct pronunciation
+        - **Practice repeatedly** - Use the same sentence multiple times to track improvement
+        - **Focus on problem areas** - Pay attention to red-highlighted parts
+        - **Record in a quiet environment** - Minimize background noise
+
+        ### 🎯 Understanding the Feedback:
+
+        - **Green highlights** = Perfect pronunciation ✅
+        - **Red highlights** = Missing or mispronounced ❌
+        - **Orange highlights** = Added or substituted 🔄
+        - **Transliteration** = Helps you see pronunciation patterns
+        - **Error rates** = Lower is better (0% = perfect)
         """)

     return demo

 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
+    print("🚀 Starting Enhanced Pronunciation Comparator...")
     demo = create_interface()
     demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
         share=True,
+        show_error=True
     )