sudhanm committed on
Commit
eecaaa5
·
verified ·
1 Parent(s): e8f391d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +427 -99
app.py CHANGED
@@ -7,6 +7,8 @@ import torch
7
  from transformers import WhisperForConditionalGeneration, WhisperProcessor
8
  from indic_transliteration import sanscript
9
  from indic_transliteration.sanscript import transliterate
 
 
10
 
11
  # ---------------- CONFIG ---------------- #
12
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -45,35 +47,90 @@ SENTENCE_BANK = {
45
  "Learning languages is fun.",
46
  "I like to drink coffee in the morning.",
47
  "Technology helps us communicate better.",
48
- "Reading books expands our knowledge."
 
 
 
49
  ],
50
  "Tamil": [
51
  "இன்று நல்ல வானிலை உள்ளது.",
52
  "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
53
  "எனக்கு புத்தகம் படிக்க விருப்பம்.",
54
  "தமிழ் மொழி மிகவும் அழகானது.",
55
- "நான் தினமும் பள்ளிக்கு செல்கிறேன்."
 
 
 
56
  ],
57
  "Malayalam": [
58
  "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
59
  "ഇന്ന് മഴപെയ്യുന്നു.",
60
  "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
61
  "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
62
- "ഞാൻ മലയാളം പഠിക്കുന്നു."
 
 
 
63
  ]
64
  }
65
 
66
- # ---------------- LOAD MODELS ---------------- #
67
- print("Loading Whisper models...")
68
- whisper_models = {}
69
- whisper_processors = {}
70
 
71
- for lang, model_id in MODEL_CONFIGS.items():
72
- print(f"Loading {lang} model: {model_id}")
73
- whisper_models[lang] = WhisperForConditionalGeneration.from_pretrained(model_id).to(DEVICE)
74
- whisper_processors[lang] = WhisperProcessor.from_pretrained(model_id)
75
-
76
- print("All models loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  # ---------------- HELPERS ---------------- #
79
  def get_random_sentence(language_choice):
@@ -84,17 +141,55 @@ def is_script(text, lang_name):
84
  return bool(pattern.search(text)) if pattern else True
85
 
86
  def transliterate_to_hk(text, lang_choice):
 
 
 
 
87
  mapping = {
88
  "Tamil": sanscript.TAMIL,
89
  "Malayalam": sanscript.MALAYALAM,
90
  "English": None
91
  }
92
- return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
93
-
94
- def transcribe_once(audio_path, language_choice, initial_prompt, beam_size, temperature, condition_on_previous_text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  # Get the appropriate model and processor for the language
96
- model = whisper_models[language_choice]
97
- processor = whisper_processors[language_choice]
98
  lang_code = LANG_CODES[language_choice]
99
 
100
  # Load and process audio
@@ -126,110 +221,343 @@ def transcribe_once(audio_path, language_choice, initial_prompt, beam_size, temp
126
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
127
  return transcription.strip()
128
 
129
- def highlight_differences(ref, hyp):
130
- ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
131
- sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
132
- out_html = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  for tag, i1, i2, j1, j2 in sm.get_opcodes():
134
  if tag == 'equal':
135
- out_html.extend([f"<span style='color:green'>{w}</span>" for w in ref_words[i1:i2]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  elif tag == 'replace':
137
- out_html.extend([f"<span style='color:red'>{w}</span>" for w in ref_words[i1:i2]])
138
- out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  elif tag == 'delete':
140
- out_html.extend([f"<span style='color:red;text-decoration:line-through'>{w}</span>" for w in ref_words[i1:i2]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  elif tag == 'insert':
142
- out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
143
- return " ".join(out_html)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
- def char_level_highlight(ref, hyp):
146
- sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
147
- out = []
148
- for tag, i1, i2, j1, j2 in sm.get_opcodes():
149
- if tag == 'equal':
150
- out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
151
- elif tag in ('replace', 'delete'):
152
- out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
153
- elif tag == 'insert':
154
- out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
155
- return "".join(out)
156
 
157
  # ---------------- MAIN ---------------- #
158
- def compare_pronunciation(audio, language_choice, intended_sentence,
159
- pass1_beam, pass1_temp, pass1_condition):
160
  if audio is None or not intended_sentence.strip():
161
- return ("No audio or intended sentence.", "", "", "", "", "", "", "")
162
-
163
- primer_weak, primer_strong = LANG_PRIMERS[language_choice]
164
-
165
- # Pass 1: raw transcription with user-configured decoding parameters
166
- actual_text = transcribe_once(audio, language_choice, primer_weak,
167
- pass1_beam, pass1_temp, pass1_condition)
168
 
169
- # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
170
- strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
171
- corrected_text = transcribe_once(audio, language_choice, strict_prompt,
172
- beam_size=5, temperature=0.0, condition_on_previous_text=False)
173
-
174
- # Compute WER and CER
175
- wer_val = jiwer.wer(intended_sentence, actual_text)
176
- cer_val = jiwer.cer(intended_sentence, actual_text)
177
-
178
- # Transliteration of Pass 1 output
179
- hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"
180
-
181
- # Highlight word-level and character-level differences
182
- diff_html = highlight_differences(intended_sentence, actual_text)
183
- char_html = char_level_highlight(intended_sentence, actual_text)
184
-
185
- return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
186
- diff_html, char_html, intended_sentence)
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  # ---------------- UI ---------------- #
189
- with gr.Blocks(title="Pronunciation Comparator") as demo:
190
- gr.Markdown("## 🎙 Pronunciation Comparator - English, Tamil & Malayalam")
191
- gr.Markdown("Practice pronunciation with specialized Whisper models for each language!")
 
 
 
 
 
 
 
 
192
 
193
  with gr.Row():
194
- lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
195
- gen_btn = gr.Button("🎲 Generate Sentence")
196
-
197
- intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)
 
 
 
 
 
 
 
 
 
 
198
 
199
  with gr.Row():
200
- audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record your pronunciation")
201
-
202
- with gr.Column():
203
- gr.Markdown("### Transcription Parameters")
204
- pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
205
- pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
206
- pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")
207
-
208
- submit_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary")
 
 
 
209
 
210
  with gr.Row():
211
- pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
212
- pass2_out = gr.Textbox(label="Pass 2: Target-Biased Output")
213
-
214
- with gr.Row():
215
- hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
216
- wer_out = gr.Textbox(label="Word Error Rate")
217
- cer_out = gr.Textbox(label="Character Error Rate")
218
 
219
- gr.Markdown("### Visual Feedback")
220
- diff_html_box = gr.HTML(label="Word Differences Highlighted")
221
- char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
222
 
223
  # Event handlers
224
- gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
 
 
 
 
225
 
226
- submit_btn.click(
227
  fn=compare_pronunciation,
228
- inputs=[audio_input, language_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
229
- outputs=[
230
- pass1_out, pass2_out, hk_translit, wer_out, cer_out,
231
- diff_html_box, char_html_box, intended_display
232
- ]
233
  )
234
 
235
  if __name__ == "__main__":
 
7
  from transformers import WhisperForConditionalGeneration, WhisperProcessor
8
  from indic_transliteration import sanscript
9
  from indic_transliteration.sanscript import transliterate
10
+ import spaces
11
+ import gc
12
 
13
  # ---------------- CONFIG ---------------- #
14
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
47
  "Learning languages is fun.",
48
  "I like to drink coffee in the morning.",
49
  "Technology helps us communicate better.",
50
+ "Reading books expands our knowledge.",
51
+ "Music brings people together.",
52
+ "Exercise keeps us healthy and strong.",
53
+ "Cooking is both art and science."
54
  ],
55
  "Tamil": [
56
  "இன்று நல்ல வானிலை உள்ளது.",
57
  "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
58
  "எனக்கு புத்தகம் படிக்க விருப்பம்.",
59
  "தமிழ் மொழி மிகவும் அழகானது.",
60
+ "நான் தினமும் பள்ளிக்கு செல்கிறேன்.",
61
+ "எனக்கு இசை கேட்க மிகவும் பிடிக்கும்.",
62
+ "அன்னை தமிழ் எங்கள் தாய்மொழி.",
63
+ "நல்ல உணவு உடல் நலத்திற்கு அவசியம்."
64
  ],
65
  "Malayalam": [
66
  "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
67
  "ഇന്ന് മഴപെയ്യുന്നു.",
68
  "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
69
  "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
70
+ "ഞാൻ മലയാളം പഠിക്കുന്നു.",
71
+ "സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്.",
72
+ "നല്ല ആരോഗ്യം വളരെ പ്രധാനമാണ്.",
73
+ "വിദ്യാഭ്യാസം ജീവിതത്തിൽ അത്യാവശ്യമാണ്."
74
  ]
75
  }
76
 
77
+ # ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
78
+ # Store only currently loaded model to save memory
79
+ current_model = {"language": None, "model": None, "processor": None}
 
80
 
81
def _load_whisper_pair(model_id):
    """Load the Whisper model and processor for *model_id* and move the model to DEVICE."""
    model = WhisperForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
    ).to(DEVICE)
    processor = WhisperProcessor.from_pretrained(model_id)
    return model, processor


def load_model_for_language(language_choice):
    """Return ``(model, processor)`` for *language_choice*, loading on demand.

    Only one model is kept resident at a time in the module-level
    ``current_model`` cache; switching language evicts the previous model
    before the new one is loaded.

    Args:
        language_choice: Key into ``MODEL_CONFIGS``.

    Returns:
        Tuple of the Whisper model (already on ``DEVICE``) and its processor.

    Raises:
        KeyError: If *language_choice* has no entry in ``MODEL_CONFIGS``.
        Exception: Whatever ``from_pretrained`` raises if the fallback
            ``openai/whisper-base`` load fails as well.
    """
    cached_model = current_model.get("model")
    if cached_model is not None and current_model.get("language") == language_choice:
        return cached_model, current_model["processor"]

    # Validate the language before evicting anything, so a bad key does not
    # throw away a perfectly good cached model.
    model_id = MODEL_CONFIGS[language_choice]

    if cached_model is not None:
        # Reset the cache with all keys intact (rather than deleting keys) so
        # that a failed load below cannot leave the dict in a state where the
        # next call raises KeyError on current_model["model"].
        current_model.update({"language": None, "model": None, "processor": None})
        del cached_model  # drop the last local reference before collecting
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    print(f"Loading {language_choice} model: {model_id}")
    try:
        model, processor = _load_whisper_pair(model_id)
        print(f"✓ {language_choice} model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading {language_choice} model: {e}")
        # Fallback to base whisper model
        print(f"Falling back to openai/whisper-base for {language_choice}")
        model, processor = _load_whisper_pair("openai/whisper-base")

    # Mutate in place: the cache is only marked as populated once a model and
    # processor have both been loaded successfully.
    current_model.update({
        "language": language_choice,
        "model": model,
        "processor": processor,
    })
    return model, processor
134
 
135
  # ---------------- HELPERS ---------------- #
136
  def get_random_sentence(language_choice):
 
141
  return bool(pattern.search(text)) if pattern else True
142
 
143
def _indic_source_scheme(lang_choice):
    """Return the sanscript source scheme for *lang_choice*, or None.

    None means "no transliteration applies" (English, or any language this
    app has no scheme for), in which case callers pass the text through.
    """
    if lang_choice == "Tamil":
        return sanscript.TAMIL
    if lang_choice == "Malayalam":
        return sanscript.MALAYALAM
    return None


def _transliterate_or_original(text, source_scheme, target_scheme):
    """Transliterate stripped *text*; fall back to the original text on failure or empty output."""
    try:
        transliterated = transliterate(text.strip(), source_scheme, target_scheme)
    except Exception as e:
        # Best effort: a transliteration failure must not break the feedback UI.
        print(f"Transliteration error: {e}")
        return text
    return transliterated if transliterated else text


def transliterate_to_hk(text, lang_choice):
    """Transliterate *text* to Harvard-Kyoto.

    Returns "" for empty/whitespace-only input, and the text unchanged for
    English or any language without a known source script.
    """
    if not text or not text.strip():
        return ""

    source_scheme = _indic_source_scheme(lang_choice)
    if source_scheme is None:
        return text  # Return as-is for English / unsupported languages

    return _transliterate_or_original(text, source_scheme, sanscript.HK)


def transliterate_to_roman(text, lang_choice):
    """Transliterate *text* to IAST, a more readable Roman script.

    Returns "" for empty/whitespace-only input, and the text unchanged for
    English or any language without a known source script.
    """
    if not text or not text.strip():
        return ""

    source_scheme = _indic_source_scheme(lang_choice)
    if source_scheme is None:
        return text  # Return as-is for English / unsupported languages

    return _transliterate_or_original(text, source_scheme, sanscript.IAST)
188
+
189
+ @spaces.GPU
190
+ def transcribe_once(audio_path, language_choice, beam_size, temperature):
191
  # Get the appropriate model and processor for the language
192
+ model, processor = load_model_for_language(language_choice)
 
193
  lang_code = LANG_CODES[language_choice]
194
 
195
  # Load and process audio
 
221
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
222
  return transcription.strip()
223
 
224
def _feedback_word_cell(word, roman, color=None):
    """Render a table cell with a word above its romanization, both HTML-escaped."""
    from html import escape

    color_css = f" color: {color};" if color else ""
    return (
        "<td style='padding: 10px;'>"
        f"<div style='font-family: monospace; font-size: 16px;{color_css}'>{escape(word)}</div>"
        f"<div style='font-size: 12px; color: #666; font-style: italic;'>{escape(roman)}</div>"
        "</td>"
    )


def _feedback_note_cell(note, color):
    """Render a placeholder cell (fixed text such as "Not spoken") for a side with no word."""
    return (
        f"<td style='padding: 10px; color: {color}; font-style: italic;'>"
        f"<em>{note}</em>"
        "</td>"
    )


def _feedback_row(row_bg, number, expected_cell, actual_cell, badge_bg, badge_fg, badge_text):
    """Render one row of the word-by-word table from pre-rendered cells and a status badge."""
    return (
        f"<tr style='background: {row_bg};'>"
        f"<td style='padding: 10px; text-align: center; font-weight: bold;'>{number}</td>"
        f"{expected_cell}{actual_cell}"
        "<td style='padding: 10px; text-align: center;'>"
        f"<span style='background: {badge_bg}; color: {badge_fg}; padding: 4px 8px; "
        f"border-radius: 12px; font-size: 12px;'>{badge_text}</span>"
        "</td></tr>"
    )


def _word_at(words, index, stop):
    """Return ``words[index]`` if *index* is before both *stop* and the list end, else ""."""
    return words[index] if index < min(stop, len(words)) else ""


def create_tabular_feedback(intended, actual, lang_choice):
    """Create comprehensive tabular feedback with transliteration.

    Args:
        intended: The sentence the user was asked to read.
        actual: The transcription of what the user said.
        lang_choice: Language name passed to the transliteration helpers.

    Returns:
        ``(feedback_html, accuracy)`` where *accuracy* is the percentage of
        intended words matched exactly (0 when *intended* has no words).
    """
    from html import escape  # all interpolated text is escaped before entering the HTML

    # Get transliterations
    intended_roman = transliterate_to_roman(intended, lang_choice)
    actual_roman = transliterate_to_roman(actual, lang_choice)
    intended_hk = transliterate_to_hk(intended, lang_choice)
    actual_hk = transliterate_to_hk(actual, lang_choice)

    # Split into words for comparison
    intended_words = intended.strip().split()
    actual_words = actual.strip().split()
    intended_roman_words = intended_roman.strip().split()
    actual_roman_words = actual_roman.strip().split()

    total_words = len(intended_words)
    correct_words = 0
    word_index = 0  # running row number; extra (inserted) words are not numbered
    rows = []

    # Compare words using difflib
    sm = difflib.SequenceMatcher(None, intended_words, actual_words)
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            # Correct words
            for offset in range(i2 - i1):
                i, j = i1 + offset, j1 + offset
                word_index += 1
                correct_words += 1
                rows.append(_feedback_row(
                    "#d4f6d4", word_index,
                    _feedback_word_cell(intended_words[i], _word_at(intended_roman_words, i, i2)),
                    _feedback_word_cell(actual_words[j], _word_at(actual_roman_words, j, j2), "#27ae60"),
                    "#27ae60", "white", "✓ Correct",
                ))

        elif tag == 'replace':
            # Incorrect words. The two sides may differ in length; the shorter
            # side is padded with empty cells. Lookups are bounded by i2/j2 so
            # a padded row never shows a word (or its romanization) that
            # belongs to the following opcode.
            for offset in range(max(i2 - i1, j2 - j1)):
                i, j = i1 + offset, j1 + offset
                word_index += 1
                rows.append(_feedback_row(
                    "#ffebee", word_index,
                    _feedback_word_cell(_word_at(intended_words, i, i2),
                                        _word_at(intended_roman_words, i, i2)),
                    _feedback_word_cell(_word_at(actual_words, j, j2),
                                        _word_at(actual_roman_words, j, j2), "#e74c3c"),
                    "#e74c3c", "white", "✗ Different",
                ))

        elif tag == 'delete':
            # Missing words
            for i in range(i1, i2):
                word_index += 1
                rows.append(_feedback_row(
                    "#ffeaa7", word_index,
                    _feedback_word_cell(intended_words[i], _word_at(intended_roman_words, i, i2)),
                    _feedback_note_cell("Not spoken", "#e17055"),
                    "#fdcb6e", "#2d3436", "⚠ Missing",
                ))

        elif tag == 'insert':
            # Extra words
            for j in range(j1, j2):
                rows.append(_feedback_row(
                    "#fab1a0", "+",
                    _feedback_note_cell("Not expected", "#636e72"),
                    _feedback_word_cell(actual_words[j], _word_at(actual_roman_words, j, j2), "#e17055"),
                    "#fd79a8", "white", "+ Extra",
                ))

    # Calculate accuracy
    accuracy = (correct_words / total_words * 100) if total_words > 0 else 0

    # Motivational message
    if accuracy >= 95:
        message = "<span>🎉 Outstanding! Perfect pronunciation!</span>"
    elif accuracy >= 85:
        message = "<span>🌟 Excellent work! Very clear pronunciation!</span>"
    elif accuracy >= 70:
        message = "<span>👍 Good job! Keep practicing those tricky words!</span>"
    elif accuracy >= 50:
        message = "<span>📚 Making progress! Focus on the highlighted words!</span>"
    else:
        message = "<span>💪 Keep going! Practice makes perfect!</span>"

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        f"""
<div style='font-family: Arial, sans-serif; padding: 20px; background: #f8f9fa; border-radius: 12px; margin: 10px 0;'>
<h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Pronunciation Analysis</h3>
<div style='margin-bottom: 25px;'>
<h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison</h4>
<table style='width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
<thead>
<tr style='background: #3498db; color: white;'>
<th style='padding: 12px; text-align: left; font-weight: bold;'>Type</th>
<th style='padding: 12px; text-align: left; font-weight: bold;'>Original Script</th>
<th style='padding: 12px; text-align: left; font-weight: bold;'>Roman/IAST</th>
</tr>
</thead>
<tbody>
<tr style='background: #e8f5e8;'>
<td style='padding: 12px; font-weight: bold; color: #27ae60;'>🎯 Target</td>
<td style='padding: 12px; font-family: monospace;'>{escape(intended)}</td>
<td style='padding: 12px; font-family: monospace; font-style: italic;'>{escape(intended_roman)}</td>
</tr>
<tr style='background: #fff3e0;'>
<td style='padding: 12px; font-weight: bold; color: #e67e22;'>🗣️ You Said</td>
<td style='padding: 12px; font-family: monospace;'>{escape(actual)}</td>
<td style='padding: 12px; font-family: monospace; font-style: italic;'>{escape(actual_roman)}</td>
</tr>
</tbody>
</table>
</div>
""",
        """
<div style='margin-bottom: 25px;'>
<h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Word-by-Word Analysis</h4>
<table style='width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
<thead>
<tr style='background: #9b59b6; color: white;'>
<th style='padding: 12px; text-align: center; font-weight: bold;'>#</th>
<th style='padding: 12px; text-align: left; font-weight: bold;'>Expected</th>
<th style='padding: 12px; text-align: left; font-weight: bold;'>You Said</th>
<th style='padding: 12px; text-align: center; font-weight: bold;'>Status</th>
</tr>
</thead>
<tbody>
""",
        "\n".join(rows),
        """
</tbody>
</table>
</div>
""",
        f"""
<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 12px; text-align: center;'>
<h4 style='margin: 0 0 15px 0; font-size: 24px;'>🎯 Performance Summary</h4>
<div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 15px;'>
<div style='background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; min-width: 150px;'>
<div style='font-size: 32px; font-weight: bold;'>{accuracy:.1f}%</div>
<div style='font-size: 14px; opacity: 0.9;'>Word Accuracy</div>
</div>
<div style='background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; min-width: 150px;'>
<div style='font-size: 32px; font-weight: bold;'>{correct_words}/{total_words}</div>
<div style='font-size: 14px; opacity: 0.9;'>Correct Words</div>
</div>
</div>
<div style='margin-top: 15px; font-size: 18px;'>
{message}
</div>
</div>
""",
    ]

    # Add HK transliteration section for reference
    if lang_choice in ["Tamil", "Malayalam"]:
        parts.append(f"""
<div style='margin-top: 20px; padding: 15px; background: #ecf0f1; border-radius: 8px;'>
<h4 style='color: #2c3e50; margin-bottom: 10px;'>🔤 Harvard-Kyoto Transliteration (for reference)</h4>
<div style='display: grid; grid-template-columns: 1fr 1fr; gap: 15px;'>
<div>
<strong>Expected:</strong><br>
<span style='font-family: monospace; background: white; padding: 8px; border-radius: 4px; display: block; margin-top: 5px;'>{escape(intended_hk)}</span>
</div>
<div>
<strong>You said:</strong><br>
<span style='font-family: monospace; background: white; padding: 8px; border-radius: 4px; display: block; margin-top: 5px;'>{escape(actual_hk)}</span>
</div>
</div>
</div>
""")

    parts.append("</div>")

    return "".join(parts), accuracy
456
+
457
 
 
 
 
 
 
 
 
 
 
 
 
458
 
459
  # ---------------- MAIN ---------------- #
460
@spaces.GPU
def compare_pronunciation(audio, lang_choice, intended_sentence, pass1_beam, pass1_temp):
    """Transcribe the recording and compare it against the intended sentence.

    Args:
        audio: Path to the recorded audio, or None if nothing was recorded.
        lang_choice: Language name selected in the UI.
        intended_sentence: The practice sentence the user was asked to read.
        pass1_beam: Beam size forwarded to ``transcribe_once``.
        pass1_temp: Temperature forwarded to ``transcribe_once``.

    Returns:
        A 5-tuple ``(transcription, romanized transcription, WER, CER,
        feedback HTML)``. On missing input, empty transcription or any
        exception, the first element carries a user-facing message and the
        remaining four are empty strings.
    """
    # Guard against None as well as blank text so .strip() cannot raise here.
    if audio is None or not intended_sentence or not intended_sentence.strip():
        return ("⚠️ Please record audio and generate a sentence first.", "", "", "", "")

    try:
        # Single transcription pass with user settings
        actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)

        if not actual_text or not actual_text.strip():
            return ("⚠️ No speech detected. Please try recording again.", "", "", "", "")

        # Compute metrics
        wer_val = jiwer.wer(intended_sentence, actual_text)
        cer_val = jiwer.cer(intended_sentence, actual_text)

        # Only the spoken text's romanization is returned directly; the
        # feedback builder derives the intended sentence's own transliteration.
        actual_roman = transliterate_to_roman(actual_text, lang_choice)

        # Create comprehensive tabular feedback (accuracy is shown inside the HTML)
        feedback_html, _accuracy = create_tabular_feedback(intended_sentence, actual_text, lang_choice)

        return (
            actual_text,
            actual_roman,
            f"{wer_val:.1%}",
            f"{cer_val:.1%}",
            feedback_html,
        )

    except Exception as e:
        # Top-level UI boundary: report the failure in the first output box
        # instead of letting the handler crash.
        error_msg = f"❌ Error during transcription: {str(e)}"
        print(error_msg)
        return (error_msg, "", "", "", "")
495
 
496
  # ---------------- UI ---------------- #
497
+ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
498
+ gr.Markdown("""
499
+ # 🎙️ AI Pronunciation Coach
500
+ ### Practice English, Tamil & Malayalam with AI feedback
501
+
502
+ **How to use:**
503
+ 1. Select your language
504
+ 2. Generate a practice sentence
505
+ 3. Record yourself reading it aloud
506
+ 4. Get instant feedback on your pronunciation!
507
+ """)
508
 
509
  with gr.Row():
510
+ with gr.Column(scale=2):
511
+ lang_choice = gr.Dropdown(
512
+ choices=list(LANG_CODES.keys()),
513
+ value="Malayalam",
514
+ label="🌍 Choose Language"
515
+ )
516
+ with gr.Column(scale=1):
517
+ gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")
518
+
519
+ intended_display = gr.Textbox(
520
+ label="📝 Practice Sentence (Read this aloud)",
521
+ interactive=False,
522
+ placeholder="Click 'Generate Practice Sentence' to get started..."
523
+ )
524
 
525
  with gr.Row():
526
+ with gr.Column():
527
+ audio_input = gr.Audio(
528
+ sources=["microphone"],
529
+ type="filepath",
530
+ label="🎤 Record Your Pronunciation"
531
+ )
532
+ with gr.Column():
533
+ gr.Markdown("### ⚙️ Advanced Settings")
534
+ pass1_beam = gr.Slider(1, 10, value=5, step=1, label="Beam Size (accuracy vs speed)")
535
+ pass1_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature (creativity)")
536
+
537
+ analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")
538
 
539
  with gr.Row():
540
+ with gr.Column():
541
+ pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
542
+ actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Roman)", interactive=False)
543
+ with gr.Column():
544
+ wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
545
+ cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
 
546
 
547
+ gr.Markdown("### 📋 Detailed Analysis")
548
+ feedback_display = gr.HTML()
 
549
 
550
  # Event handlers
551
+ gen_btn.click(
552
+ fn=get_random_sentence,
553
+ inputs=[lang_choice],
554
+ outputs=[intended_display]
555
+ )
556
 
557
+ analyze_btn.click(
558
  fn=compare_pronunciation,
559
+ inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
560
+ outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display]
 
 
 
561
  )
562
 
563
  if __name__ == "__main__":