sudhanm committed commit 2175de9 (verified) · 1 parent: 494a527

Update app.py

Files changed (1): app.py (+310 −181)
app.py CHANGED
@@ -9,10 +9,9 @@ import numpy as np
 from transformers import (
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
-    AutoTokenizer,
-    AutoModel
+    WhisperProcessor,
+    WhisperForConditionalGeneration
 )
-from TTS.api import TTS
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
@@ -22,6 +21,7 @@ warnings.filterwarnings("ignore")
 
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"🔧 Using device: {DEVICE}")
 
 LANG_CODES = {
     "English": "en",
@@ -31,21 +31,22 @@ LANG_CODES = {
     "Sanskrit": "sa"
 }
 
-# AI4Bharat model configurations
+# Updated model configurations for better HF Spaces compatibility
 ASR_MODELS = {
     "English": "openai/whisper-base.en",
-    "Tamil": "ai4bharat/whisper-medium-ta",
-    "Malayalam": "ai4bharat/whisper-medium-ml",
-    "Hindi": "ai4bharat/whisper-medium-hi",
-    "Sanskrit": "ai4bharat/whisper-medium-hi"  # Fallback to Hindi for Sanskrit
+    "Tamil": "vasista22/whisper-tamil-base",      # Community model for Tamil
+    "Malayalam": "parambharat/whisper-small-ml",  # Community model for Malayalam
+    "Hindi": "vasista22/whisper-hindi-base",      # Community model for Hindi
+    "Sanskrit": "vasista22/whisper-hindi-base"    # Fallback to Hindi for Sanskrit
 }
 
-TTS_MODELS = {
-    "English": "tts_models/en/ljspeech/tacotron2-DDC",
-    "Tamil": "tts_models/ta/mai/tacotron2-DDC",
-    "Malayalam": "tts_models/ml/mai/tacotron2-DDC",
-    "Hindi": "tts_models/hi/mai/tacotron2-DDC",
-    "Sanskrit": "tts_models/hi/mai/tacotron2-DDC"  # Fallback to Hindi
+# Backup models in case primary ones fail
+FALLBACK_MODELS = {
+    "English": "openai/whisper-base.en",
+    "Tamil": "openai/whisper-small",
+    "Malayalam": "openai/whisper-small",
+    "Hindi": "openai/whisper-small",
+    "Sanskrit": "openai/whisper-small"
 }
 
 LANG_PRIMERS = {
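Review note: this hunk swaps the `ai4bharat/whisper-*` checkpoints for community Whisper fine-tunes and adds an explicit `FALLBACK_MODELS` table, in line with the comment about HF Spaces compatibility. If checkpoint availability is the concern, a pre-flight existence check at startup would surface bad repo ids immediately; the sketch below is illustrative only (`check_models` is a hypothetical helper, not part of this commit):

```python
# Hypothetical pre-flight check (not in this commit): verify that every
# configured repo id resolves on the Hugging Face Hub before launch.
from huggingface_hub import model_info
from huggingface_hub.utils import RepositoryNotFoundError

def check_models(model_map):
    """Return the (language, repo_id) pairs that cannot be resolved."""
    missing = []
    for language, repo_id in model_map.items():
        try:
            model_info(repo_id)  # raises RepositoryNotFoundError if absent
        except RepositoryNotFoundError:
            missing.append((language, repo_id))
    return missing

# e.g. check_models(ASR_MODELS) + check_models(FALLBACK_MODELS)
```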
@@ -75,84 +76,103 @@ SENTENCE_BANK = {
         "Learning new languages opens many doors.",
         "I enjoy reading books in the evening.",
         "Technology has changed our daily lives.",
-        "Music brings people together across cultures."
+        "Music brings people together across cultures.",
+        "Education is the key to a bright future.",
+        "The flowers bloom beautifully in spring.",
+        "Hard work always pays off in the end."
     ],
     "Tamil": [
         "இன்று நல்ல வானிலை உள்ளது.",
         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
         "எனக்கு புத்தகம் படிக்க விருப்பம்.",
         "தமிழ் மொழி மிகவும் அழகானது.",
-        "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்."
+        "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்.",
+        "கல்வி நமது எதிர்காலத்தின் திறவுகோல்.",
+        "பறவைகள் காலையில் இனிமையாக பாடுகின்றன.",
+        "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
     ],
     "Malayalam": [
         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
         "ഇന്ന് മഴപെയ്യുന്നു.",
         "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
         "കേരളത്തിന്റെ പ്രകൃതി സുന്ദരമാണ്.",
-        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്."
+        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്.",
+        "സംഗീതം മനസ്സിന് സന്തോഷം നൽകുന്നു.",
+        "കുടുംബസമയം വളരെ വിലപ്പെട്ടതാണ്.",
+        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
     ],
     "Hindi": [
         "आज मौसम बहुत अच्छा है।",
         "मुझे हिंदी बोलना पसंद है।",
         "मैं रोज किताब पढ़ता हूँ।",
         "भारत की संस्कृति विविधतापूर्ण है।",
-        "शिक्षा हमारे भविष्य की कुंजी है।"
+        "शिक्षा हमारे भविष्य की कुंजी है।",
+        "संगीत हमारे दिल को छूता है।",
+        "परिवार के साथ समय बिताना अनमोल है।",
+        "मेहनत का फल हमेशा मीठा होता है।"
     ],
     "Sanskrit": [
         "अहं ग्रन्थं पठामि।",
         "अद्य सूर्यः तेजस्वी अस्ति।",
         "मम नाम रामः।",
         "विद्या सर्वत्र पूज्यते।",
-        "सत्यमेव जयते।"
+        "सत्यमेव जयते।",
+        "गुरुर्ब्रह्मा गुरुर्विष्णुः।",
+        "वसुधैव कुटुम्बकम्।",
+        "श्रम एव विजयते।"
     ]
 }
 
 # ---------------- MODEL CACHE ---------------- #
 asr_models = {}
-tts_models = {}
 
 def load_asr_model(language):
-    """Load ASR model for specific language"""
+    """Load ASR model for specific language with fallback"""
     if language not in asr_models:
         try:
             model_name = ASR_MODELS[language]
-            print(f"Loading ASR model for {language}: {model_name}")
-
-            processor = AutoProcessor.from_pretrained(model_name)
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(DEVICE)
+            print(f"🔄 Loading ASR model for {language}: {model_name}")
 
-            asr_models[language] = {"processor": processor, "model": model}
-            print(f"✅ ASR model loaded for {language}")
+            # Try loading the primary model
+            try:
+                processor = AutoProcessor.from_pretrained(model_name)
+                model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                    model_name,
+                    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+                    low_cpu_mem_usage=True,
+                    use_safetensors=True
+                ).to(DEVICE)
+
+                asr_models[language] = {"processor": processor, "model": model, "model_name": model_name}
+                print(f"✅ Primary ASR model loaded for {language}")
+                return asr_models[language]
+
+            except Exception as e:
+                print(f"⚠️ Primary model failed for {language}: {e}")
+                print(f"🔄 Trying fallback model...")
+
+                # Try fallback model
+                fallback_name = FALLBACK_MODELS[language]
+                processor = WhisperProcessor.from_pretrained(fallback_name)
+                model = WhisperForConditionalGeneration.from_pretrained(
+                    fallback_name,
+                    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+                    low_cpu_mem_usage=True
+                ).to(DEVICE)
+
+                asr_models[language] = {"processor": processor, "model": model, "model_name": fallback_name}
+                print(f"✅ Fallback ASR model loaded for {language}")
+
         except Exception as e:
-            print(f"❌ Failed to load ASR for {language}: {e}")
-            # Fallback to English model
+            print(f"❌ Failed to load any ASR model for {language}: {e}")
+            # Use English as ultimate fallback
             if language != "English":
-                print(f"🔄 Falling back to English ASR for {language}")
+                print(f"🔄 Using English ASR as final fallback for {language}")
                 load_asr_model("English")
                 asr_models[language] = asr_models["English"]
 
     return asr_models[language]
 
-def load_tts_model(language):
-    """Load TTS model for specific language"""
-    if language not in tts_models:
-        try:
-            model_name = TTS_MODELS[language]
-            print(f"Loading TTS model for {language}: {model_name}")
-
-            tts = TTS(model_name=model_name).to(DEVICE)
-            tts_models[language] = tts
-            print(f"✅ TTS model loaded for {language}")
-        except Exception as e:
-            print(f"❌ Failed to load TTS for {language}: {e}")
-            # Fallback to English
-            if language != "English":
-                print(f"🔄 Falling back to English TTS for {language}")
-                load_tts_model("English")
-                tts_models[language] = tts_models["English"]
-
-    return tts_models[language]
-
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     """Get random sentence for practice"""
@@ -161,7 +181,9 @@ def get_random_sentence(language_choice):
 def is_script(text, lang_name):
     """Check if text is in expected script"""
     pattern = SCRIPT_PATTERNS.get(lang_name)
-    return bool(pattern.search(text)) if pattern else True
+    if not pattern:
+        return True
+    return bool(pattern.search(text))
 
 def transliterate_to_hk(text, lang_choice):
     """Transliterate Indic text to Harvard-Kyoto"""
@@ -177,7 +199,8 @@ def transliterate_to_hk(text, lang_choice):
     if script and is_script(text, lang_choice):
         try:
             return transliterate(text, script, sanscript.HK)
-        except:
+        except Exception as e:
+            print(f"Transliteration error: {e}")
             return text
     return text
 
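For reference, the conversion `transliterate_to_hk` performs boils down to a single `indic_transliteration` call; a minimal standalone example using one of the Sanskrit practice sentences:

```python
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Devanagari -> Harvard-Kyoto, the same scheme pair used by transliterate_to_hk()
print(transliterate("विद्या सर्वत्र पूज्यते", sanscript.DEVANAGARI, sanscript.HK))
# vidyA sarvatra pUjyate
```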
@@ -188,70 +211,89 @@ def preprocess_audio(audio_path, target_sr=16000):
         audio, sr = librosa.load(audio_path, sr=target_sr)
 
         # Normalize audio
-        audio = audio / np.max(np.abs(audio))
+        if np.max(np.abs(audio)) > 0:
+            audio = audio / np.max(np.abs(audio))
 
-        # Remove silence
+        # Remove silence from beginning and end
        audio, _ = librosa.effects.trim(audio, top_db=20)
 
+        # Ensure minimum length
+        if len(audio) < target_sr * 0.1:  # Less than 0.1 seconds
+            return None, None
+
         return audio, target_sr
     except Exception as e:
         print(f"Audio preprocessing error: {e}")
         return None, None
 
-def transcribe_with_ai4bharat(audio_path, language, initial_prompt=""):
-    """Transcribe audio using AI4Bharat models"""
+def transcribe_audio(audio_path, language, initial_prompt="", force_language=True):
+    """Transcribe audio using loaded models"""
     try:
-        # Load model
+        # Load model components
         asr_components = load_asr_model(language)
         processor = asr_components["processor"]
         model = asr_components["model"]
+        model_name = asr_components["model_name"]
 
         # Preprocess audio
         audio, sr = preprocess_audio(audio_path)
         if audio is None:
-            return "Error: Could not process audio"
+            return "Error: Audio too short or could not be processed"
 
         # Prepare inputs
-        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
-        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        inputs = processor(
+            audio,
+            sampling_rate=sr,
+            return_tensors="pt",
+            padding=True
+        )
+
+        # Move to device
+        input_features = inputs.input_features.to(DEVICE)
 
         # Generate transcription
         with torch.no_grad():
-            predicted_ids = model.generate(**inputs, max_length=200)
+            # Set generation parameters
+            generate_kwargs = {
+                "input_features": input_features,
+                "max_length": 200,
+                "num_beams": 5,
+                "temperature": 0.0,
+                "do_sample": False
+            }
+
+            # Add language forcing if supported
+            if hasattr(model.config, 'forced_decoder_ids') and force_language:
+                lang_code = LANG_CODES.get(language, "en")
+                try:
+                    forced_decoder_ids = processor.get_decoder_prompt_ids(
+                        language=lang_code,
+                        task="transcribe"
+                    )
+                    generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
+                except:
+                    pass  # Skip if not supported
+
+            predicted_ids = model.generate(**generate_kwargs)
 
         # Decode
-        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        transcription = processor.batch_decode(
+            predicted_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True
+        )[0]
 
         return transcription.strip()
 
     except Exception as e:
         print(f"Transcription error for {language}: {e}")
-        return f"Error: Transcription failed - {str(e)}"
-
-def synthesize_with_ai4bharat(text, language):
-    """Synthesize speech using AI4Bharat TTS"""
-    if not text.strip():
-        return None
-
-    try:
-        # Load TTS model
-        tts = load_tts_model(language)
-
-        # Generate audio
-        audio_path = f"/tmp/tts_output_{hash(text)}.wav"
-        tts.tts_to_file(text=text, file_path=audio_path)
-
-        # Load generated audio
-        audio, sr = librosa.load(audio_path, sr=22050)
-
-        return sr, audio
-
-    except Exception as e:
-        print(f"TTS error for {language}: {e}")
-        return None
+        return f"Error: Transcription failed - {str(e)[:100]}"
 
 def highlight_differences(ref, hyp):
-    """Highlight word-level differences"""
+    """Highlight word-level differences with better styling"""
+    if not ref.strip() or not hyp.strip():
+        return "No text to compare"
+
     ref_words = ref.strip().split()
     hyp_words = hyp.strip().split()
 
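Two review notes on `transcribe_audio`. First, the `initial_prompt` parameter is accepted but never used in the body, so the target-biased second pass in `compare_pronunciation` currently decodes with exactly the same settings as the raw pass. Second, when the model was loaded in `torch.float16` on CUDA, `inputs.input_features` is still `float32` and is moved to the device without a cast, which typically trips a dtype mismatch inside `generate()`. Both could be addressed along these lines (hedged sketch, not part of the commit; the `prompt_ids` path assumes a transformers release that includes `WhisperProcessor.get_prompt_ids`):

```python
# Inside transcribe_audio (assumed fixes, not in this commit):

# 1) Match the feature dtype to the precision the model was loaded with.
input_features = inputs.input_features.to(DEVICE, dtype=model.dtype)

# 2) Actually wire initial_prompt into generation.
if initial_prompt:
    prompt_ids = processor.get_prompt_ids(initial_prompt, return_tensors="pt")
    generate_kwargs["prompt_ids"] = prompt_ids.to(DEVICE)
```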
@@ -260,176 +302,249 @@ def highlight_differences(ref, hyp)
 
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out_html.extend([f"<span style='color:green; font-weight:bold'>{w}</span>" for w in ref_words[i1:i2]])
+            out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'replace':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
-            out_html.extend([f"<span style='color:orange; font-weight:bold'> {w}</span>" for w in hyp_words[j1:j2]])
+            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
+            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>→{w}</span>" for w in hyp_words[j1:j2]])
         elif tag == 'delete':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
+            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
         elif tag == 'insert':
-            out_html.extend([f"<span style='color:orange; font-weight:bold'>+{w}</span>" for w in hyp_words[j1:j2]])
+            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
 
     return " ".join(out_html)
 
 def char_level_highlight(ref, hyp):
     """Highlight character-level differences"""
+    if not ref.strip() or not hyp.strip():
+        return "No text to compare"
+
     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
     out = []
 
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
-            out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
+            out.extend([f"<span style='color:green; background-color:#e8f5e8;'>{c}</span>" for c in ref[i1:i2]])
         elif tag in ('replace', 'delete'):
-            out.extend([f"<span style='color:red; text-decoration:underline; font-weight:bold'>{c}</span>" for c in ref[i1:i2]])
+            out.extend([f"<span style='color:red; text-decoration:underline; background-color:#ffe8e8; font-weight:bold;'>{c}</span>" for c in ref[i1:i2]])
         elif tag == 'insert':
-            out.extend([f"<span style='color:orange; background-color:yellow'>{c}</span>" for c in hyp[j1:j2]])
+            out.extend([f"<span style='color:orange; background-color:#fff3cd; font-weight:bold;'>{c}</span>" for c in hyp[j1:j2]])
 
     return "".join(out)
 
+def get_pronunciation_score(wer_val, cer_val):
+    """Calculate pronunciation score and feedback"""
+    # Weight WER more heavily than CER
+    combined_score = (wer_val * 0.7) + (cer_val * 0.3)
+
+    if combined_score <= 0.1:
+        return "🏆 Excellent! (90%+)", "Your pronunciation is outstanding!"
+    elif combined_score <= 0.2:
+        return "🎉 Very Good! (80-90%)", "Great pronunciation with minor areas for improvement."
+    elif combined_score <= 0.4:
+        return "👍 Good! (60-80%)", "Good effort! Keep practicing for better accuracy."
+    elif combined_score <= 0.6:
+        return "📚 Needs Practice (40-60%)", "Focus on clearer pronunciation of highlighted words."
+    else:
+        return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
+
 # ---------------- MAIN FUNCTION ---------------- #
 def compare_pronunciation(audio, language_choice, intended_sentence):
     """Main function to compare pronunciation"""
-    if audio is None or not intended_sentence.strip():
-        return ("❌ No audio or intended sentence provided.", "", "", "", "", "",
-                None, None, "", "")
+    if audio is None:
+        return ("❌ Please record audio first.", "", "", "", "", "", "", "", "", "")
+
+    if not intended_sentence.strip():
+        return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "", "", "")
 
     try:
-        print(f"Processing audio for {language_choice}")
+        print(f"🔍 Analyzing pronunciation for {language_choice}...")
 
         # Pass 1: Raw transcription
         primer_weak, _ = LANG_PRIMERS[language_choice]
-        actual_text = transcribe_with_ai4bharat(audio, language_choice, primer_weak)
+        actual_text = transcribe_audio(audio, language_choice, primer_weak, force_language=True)
 
-        # Pass 2: Target-biased transcription
+        # Pass 2: Target-biased transcription with stronger prompt
         _, primer_strong = LANG_PRIMERS[language_choice]
-        strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
-        corrected_text = transcribe_with_ai4bharat(audio, language_choice, strict_prompt)
+        strict_prompt = f"{primer_strong}\nExpected: {intended_sentence}"
+        corrected_text = transcribe_audio(audio, language_choice, strict_prompt, force_language=True)
+
+        # Handle transcription errors
+        if actual_text.startswith("Error:"):
+            return (f"❌ {actual_text}", "", "", "", "", "", "", "", "", "")
 
-        # Error metrics
+        # Calculate error metrics
         try:
             wer_val = jiwer.wer(intended_sentence, actual_text)
             cer_val = jiwer.cer(intended_sentence, actual_text)
-        except:
+        except Exception as e:
+            print(f"Error calculating metrics: {e}")
             wer_val, cer_val = 1.0, 1.0
 
-        # Transliteration
+        # Get pronunciation score and feedback
+        score_text, feedback = get_pronunciation_score(wer_val, cer_val)
+
+        # Transliteration for Indic scripts
         hk_translit = transliterate_to_hk(actual_text, language_choice)
-        if not is_script(actual_text, language_choice):
-            hk_translit = f"⚠️ Script mismatch: expected {language_choice} script"
+        if not is_script(actual_text, language_choice) and language_choice != "English":
+            hk_translit = f"⚠️ Expected {language_choice} script, got mixed/other script"
 
         # Visual feedback
         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)
 
-        # TTS synthesis
-        tts_intended = synthesize_with_ai4bharat(intended_sentence, language_choice)
-        tts_actual = synthesize_with_ai4bharat(actual_text, language_choice)
-
-        # Status message
-        status = f"✅ Analysis complete for {language_choice}"
-        if wer_val < 0.1:
-            status += " - Excellent pronunciation! 🎉"
-        elif wer_val < 0.3:
-            status += " - Good pronunciation! 👍"
-        elif wer_val < 0.5:
-            status += " - Needs improvement 📚"
-        else:
-            status += " - Keep practicing! 💪"
+        # Status message with detailed feedback
+        status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}"
 
         return (
             status,
-            actual_text,
-            corrected_text,
+            actual_text or "(No transcription)",
+            corrected_text or "(No corrected transcription)",
             hk_translit,
-            f"{wer_val:.3f}",
-            f"{cer_val:.3f}",
+            f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% word accuracy)",
+            f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% character accuracy)",
             diff_html,
-            tts_intended,
-            tts_actual,
             char_html,
-            intended_sentence
+            intended_sentence,
+            f"🎯 Target: {intended_sentence}"
         )
 
     except Exception as e:
-        error_msg = f"❌ Error during analysis: {str(e)}"
-        print(error_msg)
-        return (error_msg, "", "", "", "", "", None, None, "", "")
+        error_msg = f"❌ Analysis Error: {str(e)[:200]}"
+        print(f"Analysis error: {e}")
+        return (error_msg, "", "", "", "", "", "", "", "", "")
 
 # ---------------- UI ---------------- #
 def create_interface():
-    with gr.Blocks(title="🎙️ AI4Bharat Pronunciation Trainer", theme=gr.themes.Soft()) as demo:
+    with gr.Blocks(
+        title="🎙️ Multilingual Pronunciation Trainer",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {max-width: 1200px !important}
+        .feedback-box {font-size: 18px !important; font-weight: bold !important}
+        """
+    ) as demo:
+
         gr.Markdown("""
-        # 🎙️ AI4Bharat Pronunciation Trainer
+        # 🎙️ Multilingual Pronunciation Trainer
+
+        **Practice pronunciation in Tamil, Malayalam, Hindi, Sanskrit & English** using advanced speech recognition!
 
-        Practice pronunciation in **Tamil, Malayalam, Hindi, Sanskrit & English** using state-of-the-art AI4Bharat models!
+        ### 📋 How to Use:
+        1. **Select** your target language 🌍
+        2. **Generate** a practice sentence 🎲
+        3. **Record** yourself reading it aloud 🎤
+        4. **Get** detailed feedback with accuracy metrics 📊
 
-        📋 **How to use:**
-        1. Select your target language
-        2. Generate a practice sentence
-        3. Record yourself reading it aloud
-        4. Get detailed feedback with error analysis
+        ### 🎯 Features:
+        - **Dual-pass analysis** for accurate assessment
+        - **Visual highlighting** of pronunciation errors
+        - **Romanization** for Indic scripts
+        - **Detailed metrics** (Word & Character accuracy)
         """)
 
         with gr.Row():
-            with gr.Column(scale=2):
+            with gr.Column(scale=3):
                 lang_choice = gr.Dropdown(
                     choices=list(LANG_CODES.keys()),
                     value="Tamil",
-                    label="🌍 Select Language"
+                    label="🌍 Select Language",
+                    info="Choose the language you want to practice"
                 )
             with gr.Column(scale=1):
-                gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")
+                gen_btn = gr.Button("🎲 Generate Sentence", variant="primary", size="lg")
 
         intended_display = gr.Textbox(
             label="📝 Practice Sentence (Read this aloud)",
-            placeholder="Click 'Generate Practice Sentence' to get started...",
+            placeholder="Click 'Generate Sentence' to get started...",
             interactive=False,
-            lines=2
+            lines=3,
+            show_copy_button=True
         )
 
-        with gr.Row():
-            audio_input = gr.Audio(
-                sources=["microphone", "upload"],
-                type="filepath",
-                label="🎤 Record Your Pronunciation"
-            )
+        audio_input = gr.Audio(
+            sources=["microphone", "upload"],
+            type="filepath",
+            label="🎤 Record Your Pronunciation",
+            info="Record yourself reading the sentence above"
+        )
 
         analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary", size="lg")
 
-        status_output = gr.Textbox(label="📊 Analysis Status", interactive=False)
+        status_output = gr.Textbox(
+            label="📊 Analysis Results",
+            interactive=False,
+            lines=3,
+            elem_classes=["feedback-box"]
+        )
 
         with gr.Row():
             with gr.Column():
-                pass1_out = gr.Textbox(label="🎯 What You Actually Said", interactive=False)
-                wer_out = gr.Textbox(label="📈 Word Error Rate (lower = better)", interactive=False)
+                pass1_out = gr.Textbox(
+                    label="🎯 What You Actually Said (Raw Output)",
+                    interactive=False,
+                    lines=2
+                )
+                wer_out = gr.Textbox(
+                    label="📈 Word Accuracy",
+                    interactive=False,
+                    info="Higher percentage = better pronunciation"
+                )
 
             with gr.Column():
-                pass2_out = gr.Textbox(label="🔧 Target-Biased Output", interactive=False)
-                cer_out = gr.Textbox(label="📊 Character Error Rate (lower = better)", interactive=False)
-
-        hk_out = gr.Textbox(label="🔤 Romanization (Harvard-Kyoto)", interactive=False)
+                pass2_out = gr.Textbox(
+                    label="🔧 Target-Biased Analysis",
+                    interactive=False,
+                    lines=2,
+                    info="What the model thinks you meant to say"
+                )
+                cer_out = gr.Textbox(
+                    label="📊 Character Accuracy",
+                    interactive=False,
+                    info="Character-level pronunciation accuracy"
+                )
 
-        with gr.Accordion("📝 Detailed Feedback", open=True):
-            diff_html_box = gr.HTML(label="🔍 Word-Level Differences")
-            char_html_box = gr.HTML(label="🔤 Character-Level Analysis")
+        hk_out = gr.Textbox(
+            label="🔤 Romanization (Harvard-Kyoto)",
+            interactive=False,
+            info="Romanized version for easier analysis",
+            show_copy_button=True
+        )
 
-        with gr.Row():
-            intended_tts_audio = gr.Audio(label="🔊 Reference Pronunciation", type="numpy")
-            actual_tts_audio = gr.Audio(label="🔊 Your Pronunciation (TTS)", type="numpy")
+        with gr.Accordion("📝 Detailed Visual Feedback", open=True):
+            gr.Markdown("""
+            ### 🎨 Color Guide:
+            - 🟢 **Green**: Correctly pronounced words/characters
+            - 🔴 **Red**: Missing or mispronounced (strikethrough)
+            - 🟠 **Orange**: Extra words or substitutions
+            """)
 
-        gr.Markdown("""
-        ### 🎨 Color Guide:
-        - 🟢 **Green**: Correctly pronounced
-        - 🔴 **Red**: Missing or incorrect words
-        - 🟠 **Orange**: Extra or substituted words
-        - 🟡 **Yellow background**: Inserted characters
-        """)
+            diff_html_box = gr.HTML(
+                label="🔍 Word-Level Analysis",
+                show_label=True
+            )
+            char_html_box = gr.HTML(
+                label="🔤 Character-Level Analysis",
+                show_label=True
+            )
+
+        target_display = gr.Textbox(
+            label="🎯 Reference Text",
+            interactive=False,
+            visible=False
+        )
 
         # Event handlers
+        def generate_and_clear(language):
+            sentence = get_random_sentence(language)
+            return sentence, "", "", "", "", "", "", "", "", ""
+
         gen_btn.click(
-            fn=get_random_sentence,
+            fn=generate_and_clear,
             inputs=[lang_choice],
-            outputs=[intended_display]
+            outputs=[
+                intended_display, status_output, pass1_out, pass2_out,
+                hk_out, wer_out, cer_out, diff_html_box, char_html_box, target_display
+            ]
         )
 
         analyze_btn.click(
@@ -438,8 +553,7 @@ def create_interface():
             outputs=[
                 status_output, pass1_out, pass2_out, hk_out,
                 wer_out, cer_out, diff_html_box,
-                intended_tts_audio, actual_tts_audio,
-                char_html_box, intended_display
+                char_html_box, intended_display, target_display
             ]
         )
 
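The scoring path above rests on `jiwer`'s WER/CER plus the 0.7/0.3 weighting in `get_pronunciation_score`. A quick worked example of the two metrics (illustrative strings):

```python
import jiwer

ref = "आज मौसम बहुत अच्छा है।"  # 5 reference words
hyp = "आज मौसम अच्छा है।"       # one word dropped

print(jiwer.wer(ref, hyp))  # 1 deletion / 5 words = 0.2
print(jiwer.cer(ref, hyp))  # character-level rate on the same pair
```

Feeding those values through `get_pronunciation_score` then yields the banded feedback string shown in the status box.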
@@ -449,26 +563,41 @@ def create_interface():
             inputs=[lang_choice],
             outputs=[intended_display]
         )
+
+        # Footer
+        gr.Markdown("""
+        ---
+        ### 🔧 Technical Details:
+        - **ASR Models**: Community-trained Whisper models optimized for Indic languages
+        - **Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
+        - **Transliteration**: Harvard-Kyoto system for Indic scripts
+        - **Analysis**: Dual-pass approach for comprehensive feedback
+
+        **Note**: TTS (Text-to-Speech) reference audio will be added in future updates.
+        """)
 
     return demo
 
 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
-    print("🚀 Starting AI4Bharat Pronunciation Trainer...")
+    print("🚀 Starting Multilingual Pronunciation Trainer...")
+    print(f"🔧 Device: {DEVICE}")
+    print(f"🔧 PyTorch version: {torch.__version__}")
 
-    # Pre-load English models for faster startup
-    print("📦 Pre-loading English models...")
+    # Pre-load English model for faster startup
+    print("📦 Pre-loading English model...")
     try:
         load_asr_model("English")
-        load_tts_model("English")
-        print("✅ English models loaded successfully")
+        print("✅ English model loaded successfully")
     except Exception as e:
-        print(f"⚠️ Warning: Could not pre-load English models: {e}")
+        print(f"⚠️ Warning: Could not pre-load English model: {e}")
 
     demo = create_interface()
     demo.launch(
         share=True,
         show_error=True,
         server_name="0.0.0.0",
-        server_port=7860
+        server_port=7860,
+        show_tips=True,
+        enable_queue=True
     )
 
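One compatibility caveat with the new `launch()` call: `enable_queue` and `show_tips` are legacy Gradio 3.x parameters that were dropped in Gradio 4.x, where queuing is configured on the `Blocks` object instead, so on a Space pinned to a recent Gradio this block can fail with a `TypeError`. A hedged sketch of the 4.x equivalent:

```python
# Gradio 4.x style (assumed): queue on the Blocks object, then launch.
demo.queue().launch(
    share=True,
    show_error=True,
    server_name="0.0.0.0",
    server_port=7860,
)
```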