sudhanm committed on
Commit
be6893d
·
verified ·
1 Parent(s): 59775e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -698
app.py CHANGED
@@ -1,20 +1,20 @@
1
  import gradio as gr
2
  import random
3
  import difflib
4
- import re
5
- import unicodedata
6
  import jiwer
7
  import torch
8
- from transformers import WhisperForConditionalGeneration, WhisperProcessor
9
- from indic_transliteration import sanscript
10
- from indic_transliteration.sanscript import transliterate
 
 
 
11
  import spaces
12
  import gc
13
 
14
  # ---------------- CONFIG ---------------- #
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
- # Updated model configurations for each language
18
  MODEL_CONFIGS = {
19
  "English": "openai/whisper-large-v2",
20
  "Tamil": "vasista22/whisper-tamil-large-v2",
@@ -27,847 +27,364 @@ LANG_CODES = {
27
  "Malayalam": "ml"
28
  }
29
 
30
- LANG_PRIMERS = {
31
- "English": ("The transcript should be in English only.",
32
- "Write only in English without translation. Example: This is an English sentence."),
33
- "Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
34
- "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
35
- "Malayalam": ("ട്രാൻസ്ഖ്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
36
- "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം.")
37
- }
38
-
39
- SCRIPT_PATTERNS = {
40
- "Tamil": re.compile(r"[஀-௿]"),
41
- "Malayalam": re.compile(r"[ഀ-ൿ]"),
42
- "English": re.compile(r"[A-Za-z]")
43
- }
44
-
45
  SENTENCE_BANK = {
46
  "English": [
47
  "The sun sets over the horizon.",
48
  "Learning languages is fun.",
49
  "I like to drink coffee in the morning.",
50
  "Technology helps us communicate better.",
51
- "Reading books expands our knowledge.",
52
- "Music brings people together.",
53
- "Exercise keeps us healthy and strong.",
54
- "Cooking is both art and science."
55
  ],
56
  "Tamil": [
57
  "இன்று நல்ல வானிலை உள்ளது.",
58
  "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
59
  "எனக்கு புத்தகம் படிக்க விருப்பம்.",
60
  "தமிழ் மொழி மிகவும் அழகானது.",
61
- "நான் தினமும் பள்ளிக்கு செல்கிறேன்.",
62
- "எனக்கு இசை கேட்க மிகவும் பிடிக்கும்.",
63
- "அன்னை தமிழ் எங்கள் தாய்மொழி.",
64
- "நல்ல உணவு உடல் நலத்திற்கு அவசியம்."
65
  ],
66
  "Malayalam": [
67
  "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
68
  "ഇന്ന് മഴപെയ്യുന്നു.",
69
  "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
70
  "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
71
- "ഞാൻ മലയാളം പഠിക്കുന്നു.",
72
- "സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്.",
73
- "നല്ല ആരോഗ്യം വളരെ പ്രധാനമാണ്.",
74
- "വിദ്യാഭ്യാസം ജീവിതത്തിൽ അത്യാവശ്യമാണ്."
75
  ]
76
  }
77
 
78
- # ---------------- IMPROVED TRANSLITERATION SYSTEM ---------------- #
79
-
80
- def transliterate_to_natural_roman(text, lang_choice):
81
- """
82
- Generalizable transliteration to natural romanization (Thanglish/Manglish)
83
- using systematic phonetic rules instead of manual dictionaries
84
- """
85
- if not text or not text.strip():
86
- return ""
87
-
88
- if lang_choice == "English":
89
- return text
90
-
91
- try:
92
- # Step 1: Convert to ISO 15919 (more systematic than IAST)
93
- if lang_choice == "Tamil":
94
- iso_text = transliterate(text, sanscript.TAMIL, sanscript.ISO)
95
- elif lang_choice == "Malayalam":
96
- iso_text = transliterate(text, sanscript.MALAYALAM, sanscript.ISO)
97
- else:
98
- return text
99
-
100
- # Step 2: Apply systematic phonetic conversion
101
- romanized = apply_systematic_phonetic_rules(iso_text)
102
-
103
- # Step 3: Apply language-specific natural patterns
104
- romanized = apply_natural_language_patterns(romanized, lang_choice)
105
-
106
- # Step 4: Final phonetic cleanup and flow optimization
107
- romanized = optimize_natural_flow(romanized)
108
-
109
- return romanized if romanized else text
110
-
111
- except Exception as e:
112
- print(f"Transliteration error: {e}")
113
- return text
114
 
115
- def apply_systematic_phonetic_rules(iso_text):
116
- """
117
- Apply systematic phonetic rules based on linguistic principles
118
- rather than manual character mappings
119
- """
120
- result = iso_text
121
-
122
- # === VOWEL SYSTEM ===
123
- # Long vowels -> natural doubling (how native speakers type)
124
- vowel_rules = [
125
- (r'ā', 'aa'), # long a
126
- (r'ī', 'ii'), # long i
127
- (r'ū', 'uu'), # long u
128
- (r'ē', 'ee'), # long e (some prefer 'e', but 'ee' is clearer)
129
- (r'ō', 'oo'), # long o (some prefer 'o', but 'oo' is clearer)
130
- (r'ai', 'ai'), # diphthong ai
131
- (r'au', 'au'), # diphthong au
132
- (r'r̥', 'ru'), # vocalic r
133
- (r'r̥̄', 'ruu'), # long vocalic r
134
- (r'l̥', 'lu'), # vocalic l
135
- (r'l̥̄', 'luu'), # long vocalic l
136
- ]
137
-
138
- # === CONSONANT SYSTEM ===
139
- # Systematic consonant conversion based on phonetic properties
140
- consonant_rules = [
141
- # Nasals - context-sensitive
142
- (r'ṅ', 'ng'), # velar nasal
143
- (r'ñ', 'nj'), # palatal nasal (natural in South Indian typing)
144
- (r'ṇ', 'n'), # retroflex nasal -> dental (natural simplification)
145
- (r'n̆', 'n'), # any other nasal variants
146
-
147
- # Stops - systematic by place of articulation
148
- (r'([kg])h', r'\1h'), # keep aspirated velars
149
- (r'([cj])h', r'\1h'), # keep aspirated palatals
150
- (r'([ṭḍ])h', r'th'), # retroflex aspirated -> dental aspirated (natural)
151
- (r'([td])h', r'\1h'), # keep dental aspirated
152
- (r'([pb])h', r'\1h'), # keep labial aspirated
153
-
154
- # Retroflex simplification (how native speakers naturally type)
155
- (r'ṭ', 't'), # retroflex t -> dental t
156
- (r'ḍ', 'd'), # retroflex d -> dental d
157
- (r'ṇ', 'n'), # retroflex n -> dental n (already covered above)
158
-
159
- # Liquids and approximants
160
- (r'ṟ', 'r'), # Tamil/Malayalam retroflex r -> simple r
161
- (r'ṛ', 'r'), # any other retroflex r -> simple r
162
- (r'ḷ', 'l'), # retroflex l -> simple l (except for special cases)
163
- (r'ḻ', 'zh'), # Tamil/Malayalam special l -> zh (important!)
164
-
165
- # Sibilants - systematic
166
- (r'ś', 'sh'), # palatal sibilant
167
- (r'ṣ', 'sh'), # retroflex sibilant
168
- (r's', 's'), # dental sibilant (unchanged)
169
-
170
- # Fricatives and others
171
- (r'ḥ', 'h'), # visarga -> simple h
172
- (r'ḫ', 'h'), # any other h variants
173
- (r'×', ''), # multiplication sign sometimes appears
174
-
175
- # Common combinations (compound consonants)
176
- (r'kṣ', 'ksh'), # kṣa combination
177
- (r'jñ', 'gn'), # jña combination (natural pronunciation)
178
- (r'śr', 'shr'), # śra combination
179
- ]
180
-
181
- # Apply vowel rules first
182
- for pattern, replacement in vowel_rules:
183
- result = re.sub(pattern, replacement, result)
184
-
185
- # Apply consonant rules
186
- for pattern, replacement in consonant_rules:
187
- result = re.sub(pattern, replacement, result)
188
-
189
- return result
190
-
191
- def apply_natural_language_patterns(text, lang_choice):
192
- """
193
- Apply language-specific patterns that reflect how native speakers
194
- naturally romanize their languages
195
- """
196
- if lang_choice == "Tamil":
197
- return apply_tamil_natural_patterns(text)
198
- elif lang_choice == "Malayalam":
199
- return apply_malayalam_natural_patterns(text)
200
 
201
- return text
202
-
203
- def apply_tamil_natural_patterns(text):
204
- """Tamil-specific natural romanization patterns"""
205
-
206
- tamil_patterns = [
207
- # Tamil-specific sounds
208
- (r'ḻ', 'zh'), # Tamil zh sound (crucial)
209
- (r'ṟ', 'r'), # Tamil r sound
210
-
211
- # Natural doubling patterns in Tamil
212
- (r'([kgcjṭḍtdpb])\1', r'\1\1'), # Keep natural gemination
213
-
214
- # Tamil word-final patterns
215
- (r'um$', 'um'), # Tamil suffix -um
216
- (r'an$', 'an'), # Tamil suffix -an
217
- (r'al$', 'al'), # Tamil suffix -al
218
-
219
- # Natural vowel harmony adjustments
220
- (r'([aeiou])u([mnlr])', r'\1\2u'), # Vowel + u + liquid/nasal
221
- ]
222
-
223
- for pattern, replacement in tamil_patterns:
224
- text = re.sub(pattern, replacement, text)
225
-
226
- return text
227
-
228
- def apply_malayalam_natural_patterns(text):
229
- """Malayalam-specific natural romanization patterns"""
230
-
231
- malayalam_patterns = [
232
- # Malayalam-specific sounds
233
- (r'ḻ', 'zh'), # Malayalam zh sound (very important!)
234
- (r'ṟ', 'r'), # Malayalam r sound
235
-
236
- # Natural gemination in Malayalam
237
- (r'([kgcjṭḍtdpb])\1', r'\1\1'), # Keep natural gemination
238
-
239
- # Malayalam word patterns
240
- (r'aanu$', 'aanu'), # Malayalam copula ending
241
- (r'unnu$', 'unnu'), # Malayalam verb ending
242
- (r'aam$', 'aam'), # Malayalam suffix
243
-
244
- # Natural flow adjustments for Malayalam
245
- (r'([aeiou])([mnlr])([aeiou])', r'\1\2\3'), # Vowel-liquid-vowel unchanged
246
-
247
- # Handle Malayalam specific consonant clusters
248
- (r'ngh', 'ngh'), # Keep ngh clusters
249
- (r'mph', 'mph'), # Keep mph clusters
250
- ]
251
-
252
- for pattern, replacement in malayalam_patterns:
253
- text = re.sub(pattern, replacement, text)
254
-
255
- return text
256
-
257
- def optimize_natural_flow(text):
258
- """
259
- Final optimization for natural reading flow -
260
- how native speakers would actually type/read
261
- """
262
-
263
- # Remove any remaining diacritical marks using Unicode normalization
264
- text = ''.join(c for c in unicodedata.normalize('NFD', text)
265
- if unicodedata.category(c) != 'Mn')
266
-
267
- # Natural flow optimization rules
268
- flow_rules = [
269
- # Vowel optimization for readability
270
- (r'([aeiou])\1{2,}', r'\1\1'), # Max 2 repeated vowels
271
- (r'aaa+', 'aa'), # Multiple a's -> aa
272
- (r'iii+', 'ii'), # Multiple i's -> ii
273
- (r'uuu+', 'uu'), # Multiple u's -> uu
274
- (r'eee+', 'ee'), # Multiple e's -> ee
275
- (r'ooo+', 'oo'), # Multiple o's -> oo
276
-
277
- # Consonant cluster optimization
278
- (r'([bcdfghjklmnpqrstvwxyz])\1{2,}', r'\1\1'), # Max 2 repeated consonants
279
-
280
- # Natural word boundaries and spacing
281
- (r'\s+', ' '), # Normalize spaces
282
- (r'^\s+|\s+$', ''), # Trim leading/trailing spaces
283
-
284
- # Handle common awkward sequences
285
- (r'([aeiou])h([aeiou])', r'\1\2'), # Remove h between vowels if awkward
286
- (r'([bcdfghjklmnpqrstvwxyz])y([bcdfghjklmnpqrstvwxyz])', r'\1i\2'), # y->i in consonant clusters
287
-
288
- # Ensure readability of common endings
289
- (r'([mnlr])u$', r'\1u'), # Keep natural endings
290
- (r'([kgt])u$', r'\1u'), # Keep natural endings
291
- ]
292
-
293
- for pattern, replacement in flow_rules:
294
- text = re.sub(pattern, replacement, text)
295
-
296
- return text
297
-
298
- def enhanced_phonetic_similarity_check(intended_roman, actual_roman):
299
- """
300
- Enhanced similarity check that accounts for natural variations
301
- in how people might romanize the same sounds
302
- """
303
-
304
- # Define phonetically equivalent mappings
305
- phonetic_equivalents = {
306
- 'aa': ['a', 'aa'],
307
- 'ii': ['i', 'ii'],
308
- 'uu': ['u', 'uu'],
309
- 'ee': ['e', 'ee'],
310
- 'oo': ['o', 'oo'],
311
- 'zh': ['zh', 'z', 'l'], # Common variations for zh sound
312
- 'sh': ['sh', 's'], # sh vs s variations
313
- 'ch': ['ch', 'c'], # ch vs c variations
314
- 'th': ['th', 't'], # th vs t variations
315
- 'dh': ['dh', 'd'], # dh vs d variations
316
- 'ksh': ['ksh', 'ksh', 'ks'], # ksh variations
317
- 'gn': ['gn', 'ny', 'nj'], # gn/ny/nj variations
318
- }
319
-
320
- # Normalize both strings for comparison
321
- intended_normalized = normalize_for_comparison(intended_roman, phonetic_equivalents)
322
- actual_normalized = normalize_for_comparison(actual_roman, phonetic_equivalents)
323
-
324
- return intended_normalized, actual_normalized
325
-
326
- def normalize_for_comparison(text, equivalents):
327
- """Normalize text for phonetic comparison"""
328
 
329
- text = text.lower().strip()
330
-
331
- # Replace equivalents with canonical forms
332
- for canonical, variants in equivalents.items():
333
- for variant in variants:
334
- text = text.replace(variant, canonical)
335
-
336
- return text
337
-
338
- # ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
339
- # Store only currently loaded model to save memory
340
- current_model = {"language": None, "model": None, "processor": None}
341
-
342
- def load_model_for_language(language_choice):
343
- """Load model on-demand and clear previous model from memory"""
344
- global current_model
345
-
346
- # If same language is already loaded, return current model
347
- if current_model["language"] == language_choice and current_model["model"] is not None:
348
- return current_model["model"], current_model["processor"]
349
-
350
- # Clear previous model from memory
351
- if current_model["model"] is not None:
352
- del current_model["model"]
353
- del current_model["processor"]
354
  gc.collect()
355
  if DEVICE == "cuda":
356
  torch.cuda.empty_cache()
357
 
358
  # Load new model
359
  model_id = MODEL_CONFIGS[language_choice]
360
- print(f"Loading {language_choice} model: {model_id}")
361
 
362
  try:
363
  model = WhisperForConditionalGeneration.from_pretrained(
364
- model_id,
365
- torch_dtype=torch.float32
366
  ).to(DEVICE)
367
  processor = WhisperProcessor.from_pretrained(model_id)
368
 
369
- current_model = {
370
  "language": language_choice,
371
  "model": model,
372
  "processor": processor
373
  }
374
 
375
- print(f"✓ {language_choice} model loaded successfully")
376
  return model, processor
377
 
378
  except Exception as e:
379
- print(f"✗ Error loading {language_choice} model: {e}")
380
- # Fallback to base whisper model
381
- print(f"Falling back to openai/whisper-base for {language_choice}")
382
  model = WhisperForConditionalGeneration.from_pretrained(
383
- "openai/whisper-base",
384
- torch_dtype=torch.float32
385
  ).to(DEVICE)
386
  processor = WhisperProcessor.from_pretrained("openai/whisper-base")
387
 
388
- current_model = {
389
  "language": language_choice,
390
  "model": model,
391
  "processor": processor
392
  }
393
-
394
  return model, processor
395
 
396
- # ---------------- HELPERS ---------------- #
397
- def get_random_sentence(language_choice):
398
- return random.choice(SENTENCE_BANK[language_choice])
399
-
400
- def get_random_sentence_with_transliteration(language_choice):
401
- sentence = random.choice(SENTENCE_BANK[language_choice])
402
- if language_choice in ["Tamil", "Malayalam"]:
403
- # Use the new improved transliteration system
404
- transliteration = transliterate_to_natural_roman(sentence, language_choice)
405
- # Combine sentence with transliteration in the same box
406
- combined_sentence = f"{sentence}\n\n🔤 {transliteration}"
407
- return combined_sentence, transliteration
408
- else:
409
- return sentence, ""
410
-
411
- def is_script(text, lang_name):
412
- pattern = SCRIPT_PATTERNS.get(lang_name)
413
- return bool(pattern.search(text)) if pattern else True
414
-
415
- def transliterate_to_hk(text, lang_choice):
416
- """Improved transliteration with better handling"""
417
- if not text or not text.strip():
418
- return ""
419
 
420
- mapping = {
421
- "Tamil": sanscript.TAMIL,
422
- "Malayalam": sanscript.MALAYALAM,
423
- "English": None
424
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
 
426
- if mapping[lang_choice] is None:
427
- return text # Return as-is for English
 
428
 
429
  try:
430
- # Clean the text and transliterate
431
- cleaned_text = text.strip()
432
- transliterated = transliterate(cleaned_text, mapping[lang_choice], sanscript.HK)
433
- return transliterated if transliterated else text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  except Exception as e:
435
- print(f"Transliteration error: {e}")
436
  return text
437
 
438
- # Updated function that uses the new transliteration system
439
- def transliterate_to_simple_roman(text, lang_choice):
440
- """
441
- IMPROVED VERSION: Natural transliteration using systematic phonetic rules
442
- """
443
- return transliterate_to_natural_roman(text, lang_choice)
444
 
445
  @spaces.GPU
446
- def transcribe_once(audio_path, language_choice, beam_size, temperature):
447
- # Get the appropriate model and processor for the language
448
- model, processor = load_model_for_language(language_choice)
449
  lang_code = LANG_CODES[language_choice]
450
 
451
- # Load and process audio
452
  import librosa
453
  audio, sr = librosa.load(audio_path, sr=16000)
454
 
455
- # Process audio with the specific model's processor
456
  input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
 
457
 
458
- # Ensure input tensor matches model dtype
459
- model_dtype = next(model.parameters()).dtype
460
- input_features = input_features.to(device=DEVICE, dtype=model_dtype)
461
-
462
- # Generate transcription with fallback for different model capabilities
463
  with torch.no_grad():
464
  try:
465
- # Try with forced decoder ids first (standard Whisper models)
466
  forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
467
  predicted_ids = model.generate(
468
  input_features,
469
  forced_decoder_ids=forced_decoder_ids,
470
  max_length=448,
471
- num_beams=beam_size,
472
- temperature=temperature if temperature > 0 else None,
473
- do_sample=temperature > 0,
474
  )
475
- except (TypeError, ValueError) as e:
476
- # Fallback for models that don't support forced_decoder_ids (like some fine-tuned models)
477
- print(f"Fallback generation for {language_choice}: {e}")
478
  predicted_ids = model.generate(
479
  input_features,
480
  max_length=448,
481
- num_beams=beam_size,
482
- temperature=temperature if temperature > 0 else None,
483
- do_sample=temperature > 0,
484
  )
485
 
486
- # Decode the transcription
487
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
488
  return transcription.strip()
489
 
490
- def normalize_word(word):
491
- """Remove punctuation and normalize word for comparison"""
492
- import string
493
- # Remove punctuation and whitespace
494
- return word.strip().translate(str.maketrans('', '', string.punctuation)).lower()
495
 
496
- def create_enhanced_tabular_feedback(intended, actual, lang_choice):
497
- """
498
- Enhanced feedback system with better phonetic comparison
499
- """
 
500
 
501
- # Get natural transliterations using the new system
502
- intended_roman = transliterate_to_natural_roman(intended, lang_choice)
503
- actual_roman = transliterate_to_natural_roman(actual, lang_choice)
504
-
505
- # Split into words for comparison
506
  intended_words = intended.strip().split()
507
  actual_words = actual.strip().split()
508
- intended_roman_words = intended_roman.strip().split()
509
- actual_roman_words = actual_roman.strip().split()
510
-
511
- # Calculate accuracy with phonetic awareness
512
- correct_words = 0
513
- total_words = len(intended_words)
514
-
515
- # Create word-by-word comparison table
516
- feedback_html = """
517
- <div style='font-family: Arial, sans-serif; padding: 20px; margin: 10px 0;'>
518
- <h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Enhanced Pronunciation Analysis</h3>
519
- """
520
-
521
- # Overview table with improved romanization
522
- feedback_html += """
523
- <div style='margin-bottom: 25px;'>
524
- <h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison (Improved Natural Romanization)</h4>
525
- <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
526
- <thead>
527
- <tr style='border-bottom: 2px solid #ddd;'>
528
- <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
529
- <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
530
- <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Natural Romanization</th>
531
- </tr>
532
- </thead>
533
- <tbody>
534
- <tr style='border-bottom: 1px solid #ddd;'>
535
- <td style='padding: 15px; font-weight: bold; color: #27ae60; border-right: 1px solid #ddd;'>🎯 Target</td>
536
- <td style='padding: 15px; font-family: monospace; font-size: 18px; border-right: 1px solid #ddd;'>{}</td>
537
- <td style='padding: 15px; font-family: monospace; font-size: 16px; color: #555;'>{}</td>
538
- </tr>
539
- <tr>
540
- <td style='padding: 15px; font-weight: bold; color: #e67e22; border-right: 1px solid #ddd;'>🗣️ You Said</td>
541
- <td style='padding: 15px; font-family: monospace; font-size: 18px; border-right: 1px solid #ddd;'>{}</td>
542
- <td style='padding: 15px; font-family: monospace; font-size: 16px; color: #555;'>{}</td>
543
- </tr>
544
- </tbody>
545
- </table>
546
- </div>
547
- """.format(intended, intended_roman, actual, actual_roman)
548
-
549
- # Enhanced word-by-word analysis with phonetic awareness
550
- feedback_html += """
551
- <div style='margin-bottom: 25px;'>
552
- <h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Enhanced Word-by-Word Analysis</h4>
553
- <table style='width: 100%; border-collapse: collapse; border: 2px solid #ddd;'>
554
- <thead>
555
- <tr style='border-bottom: 2px solid #ddd;'>
556
- <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>#</th>
557
- <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Expected Word</th>
558
- <th style='padding: 12px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>What You Said</th>
559
- <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Phonetic Match</th>
560
- <th style='padding: 12px; text-align: center; font-weight: bold; color: #2c3e50;'>Result</th>
561
- </tr>
562
- </thead>
563
- <tbody>
564
- """
565
 
566
- # Enhanced word comparison with phonetic similarity
567
  sm = difflib.SequenceMatcher(None, intended_words, actual_words)
568
- word_index = 0
569
 
570
- for tag, i1, i2, j1, j2 in sm.get_opcodes():
571
- if tag == 'equal':
572
- # Correct words
573
- for idx, word in enumerate(intended_words[i1:i2]):
574
- word_index += 1
575
- correct_words += 1
576
- roman_word = intended_roman_words[i1 + idx] if (i1 + idx) < len(intended_roman_words) else ""
577
- actual_word = actual_words[j1 + idx] if (j1 + idx) < len(actual_words) else ""
578
- actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
579
-
580
- feedback_html += f"""
581
- <tr style='border-bottom: 1px solid #eee;'>
582
- <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
583
- <td style='padding: 12px; border-right: 1px solid #ddd;'>
584
- <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px;'>{word}</div>
585
- <div style='font-size: 13px; color: #888;'>({roman_word})</div>
586
- </td>
587
- <td style='padding: 12px; border-right: 1px solid #ddd;'>
588
- <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #27ae60;'>{actual_word}</div>
589
- <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
590
- </td>
591
- <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
592
- <span style='color: #27ae60; font-weight: bold;'>Perfect</span>
593
- </td>
594
- <td style='padding: 12px; text-align: center;'>
595
- <span style='color: #27ae60; font-weight: bold; font-size: 20px;'>✓</span>
596
- <div style='font-size: 12px; color: #27ae60; margin-top: 2px;'>Exact</div>
597
- </td>
598
- </tr>
599
- """
600
 
601
- elif tag == 'replace':
602
- # Check for phonetic similarity in replacements
603
- max_words = max(i2-i1, j2-j1)
604
- for idx in range(max_words):
605
- word_index += 1
606
- expected_word = intended_words[i1 + idx] if (i1 + idx) < i2 else ""
607
- expected_roman = intended_roman_words[i1 + idx] if (i1 + idx) < len(intended_roman_words) else ""
608
- actual_word = actual_words[j1 + idx] if (j1 + idx) < j2 else ""
609
- actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
610
-
611
- # Check phonetic similarity
612
- if expected_roman and actual_roman_word:
613
- norm_expected, norm_actual = enhanced_phonetic_similarity_check(expected_roman, actual_roman_word)
614
- similarity_ratio = difflib.SequenceMatcher(None, norm_expected, norm_actual).ratio()
615
-
616
- if similarity_ratio > 0.8: # High phonetic similarity
617
- phonetic_match = "Very Close"
618
- phonetic_color = "#f39c12"
619
- result_icon = "≈"
620
- result_text = "Similar"
621
- correct_words += 0.8 # Partial credit
622
- elif similarity_ratio > 0.6: # Moderate similarity
623
- phonetic_match = "Close"
624
- phonetic_color = "#e67e22"
625
- result_icon = "~"
626
- result_text = "Close"
627
- correct_words += 0.5 # Partial credit
628
- else:
629
- phonetic_match = "Different"
630
- phonetic_color = "#e74c3c"
631
- result_icon = "✗"
632
- result_text = "Different"
633
- else:
634
- phonetic_match = "Different"
635
- phonetic_color = "#e74c3c"
636
- result_icon = "✗"
637
- result_text = "Different"
638
-
639
- feedback_html += f"""
640
- <tr style='border-bottom: 1px solid #eee;'>
641
- <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
642
- <td style='padding: 12px; border-right: 1px solid #ddd;'>
643
- <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px;'>{expected_word}</div>
644
- <div style='font-size: 13px; color: #888;'>({expected_roman})</div>
645
- </td>
646
- <td style='padding: 12px; border-right: 1px solid #ddd;'>
647
- <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: {phonetic_color};'>{actual_word}</div>
648
- <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
649
- </td>
650
- <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
651
- <span style='color: {phonetic_color}; font-weight: bold;'>{phonetic_match}</span>
652
- </td>
653
- <td style='padding: 12px; text-align: center;'>
654
- <span style='color: {phonetic_color}; font-weight: bold; font-size: 20px;'>{result_icon}</span>
655
- <div style='font-size: 12px; color: {phonetic_color}; margin-top: 2px;'>{result_text}</div>
656
- </td>
657
- </tr>
658
- """
659
-
660
- elif tag == 'delete':
661
- # Missing words
662
- for idx, word in enumerate(intended_words[i1:i2]):
663
- word_index += 1
664
- roman_word = intended_roman_words[i1 + idx] if (i1 + idx) < len(intended_roman_words) else ""
665
- feedback_html += f"""
666
- <tr style='border-bottom: 1px solid #eee;'>
667
- <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>{word_index}</td>
668
- <td style='padding: 12px; border-right: 1px solid #ddd;'>
669
- <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px;'>{word}</div>
670
- <div style='font-size: 13px; color: #888;'>({roman_word})</div>
671
- </td>
672
- <td style='padding: 12px; color: #f39c12; font-style: italic; border-right: 1px solid #ddd;'>
673
- <em>Not spoken</em>
674
- </td>
675
- <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
676
- <span style='color: #f39c12; font-weight: bold;'>Missing</span>
677
- </td>
678
- <td style='padding: 12px; text-align: center;'>
679
- <span style='color: #f39c12; font-weight: bold; font-size: 20px;'>⚠</span>
680
- <div style='font-size: 12px; color: #f39c12; margin-top: 2px;'>Missing</div>
681
- </td>
682
- </tr>
683
- """
684
-
685
- elif tag == 'insert':
686
- # Extra words
687
- for idx, word in enumerate(actual_words[j1:j2]):
688
- actual_roman_word = actual_roman_words[j1 + idx] if (j1 + idx) < len(actual_roman_words) else ""
689
- feedback_html += f"""
690
- <tr style='border-bottom: 1px solid #eee;'>
691
- <td style='padding: 12px; text-align: center; font-weight: bold; color: #666; border-right: 1px solid #ddd;'>+</td>
692
- <td style='padding: 12px; color: #9b59b6; font-style: italic; border-right: 1px solid #ddd;'>
693
- <em>Not expected</em>
694
- </td>
695
- <td style='padding: 12px; border-right: 1px solid #ddd;'>
696
- <div style='font-family: monospace; font-size: 16px; margin-bottom: 4px; color: #9b59b6;'>{word}</div>
697
- <div style='font-size: 13px; color: #888;'>({actual_roman_word})</div>
698
- </td>
699
- <td style='padding: 12px; text-align: center; border-right: 1px solid #ddd;'>
700
- <span style='color: #9b59b6; font-weight: bold;'>Extra</span>
701
- </td>
702
- <td style='padding: 12px; text-align: center;'>
703
- <span style='color: #9b59b6; font-weight: bold; font-size: 20px;'>+</span>
704
- <div style='font-size: 12px; color: #9b59b6; margin-top: 2px;'>Extra</div>
705
- </td>
706
- </tr>
707
- """
708
-
709
- feedback_html += """
710
- </tbody>
711
  </table>
712
- </div>
713
- """
714
-
715
- # Calculate enhanced accuracy
716
- accuracy = (correct_words / total_words * 100) if total_words > 0 else 0
717
-
718
- # Enhanced summary section
719
- feedback_html += f"""
720
- <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 25px; border-radius: 12px; text-align: center; margin-top: 20px;'>
721
- <h4 style='margin: 0 0 20px 0; font-size: 24px;'>🎯 Enhanced Pronunciation Score</h4>
722
- <div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 20px;'>
723
- <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
724
- <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{accuracy:.0f}%</div>
725
- <div style='font-size: 16px; opacity: 0.9;'>Phonetic Accuracy</div>
726
- </div>
727
- <div style='background: rgba(255,255,255,0.15); padding: 20px; border-radius: 12px; min-width: 160px;'>
728
- <div style='font-size: 40px; font-weight: bold; margin-bottom: 8px;'>{correct_words:.1f}/{total_words}</div>
729
- <div style='font-size: 16px; opacity: 0.9;'>Words Matched</div>
730
  </div>
731
  </div>
732
- <div style='margin-top: 15px; font-size: 14px; opacity: 0.8;'>
733
- ✨ Now with enhanced phonetic matching for better accuracy!
734
- </div>
735
  """
736
 
737
- # Enhanced motivational message
738
- if accuracy >= 95:
739
- feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>🎉 Outstanding! Perfect natural pronunciation!</span></div>"
740
- elif accuracy >= 85:
741
- feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>🌟 Excellent! Very natural sounding!</span></div>"
742
- elif accuracy >= 70:
743
- feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>👍 Good job! Your pronunciation is improving!</span></div>"
744
- elif accuracy >= 50:
745
- feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>📚 Getting there! Focus on the highlighted sounds!</span></div>"
746
- else:
747
- feedback_html += "<div style='margin-top: 15px; font-size: 18px;'><span>💪 Keep practicing! Every attempt makes you better!</span></div>"
748
-
749
- feedback_html += "</div></div>"
750
-
751
  return feedback_html, accuracy
752
 
753
- # ---------------- MAIN ---------------- #
 
754
@spaces.GPU
def compare_pronunciation(audio, lang_choice, intended_display_text, pass1_beam, pass1_temp):
    """Transcribe a recording and score it against the practice sentence.

    Returns a 5-tuple for the UI: transcription, its romanization, WER,
    CER (both formatted as percentages) and the feedback HTML. On missing
    input, silence, or any exception the first element carries the message
    and the remaining four are empty strings.
    """
    empty_tail = ("", "", "", "")

    if audio is None or not intended_display_text.strip():
        return ("⚠️ Please record audio and generate a sentence first.",) + empty_tail

    try:
        # Everything before the 🔤 marker is the sentence itself; when the
        # marker is absent, partition() yields the whole text.
        intended_sentence = intended_display_text.partition("🔤")[0].strip()

        # One transcription pass using the beam size / temperature from the UI.
        actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)

        if not actual_text.strip():
            return ("⚠️ No speech detected. Please try recording again.",) + empty_tail

        # Word- and character-level error rates.
        wer_val = jiwer.wer(intended_sentence, actual_text)
        cer_val = jiwer.cer(intended_sentence, actual_text)

        # Romanize both the target and what was actually said.
        intended_roman = transliterate_to_natural_roman(intended_sentence, lang_choice)
        actual_roman = transliterate_to_natural_roman(actual_text, lang_choice)

        # Tabular feedback; the numeric accuracy is not surfaced separately.
        feedback_html, accuracy = create_enhanced_tabular_feedback(
            intended_sentence, actual_text, lang_choice
        )

        wer_text = f"{wer_val:.1%}"
        cer_text = f"{cer_val:.1%}"
        return (actual_text, actual_roman, wer_text, cer_text, feedback_html)

    except Exception as e:
        error_msg = f"❌ Error during transcription: {str(e)}"
        print(error_msg)
        return (error_msg,) + empty_tail
795
 
796
def get_sentence_for_display(language_choice):
    """Pick a practice sentence for the UI, discarding its transliteration."""
    sentence, _transliteration = get_random_sentence_with_transliteration(language_choice)
    return sentence
799
 
800
  # ---------------- UI ---------------- #
801
# Gradio layout for the earlier revision of the app: language picker,
# practice sentence, recorder with decoding controls, and result panels.
with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
    # Header and usage instructions.
    gr.Markdown("""
    # 🎙️ AI Pronunciation Coach (Enhanced)
    ### Practice English, Tamil & Malayalam with AI feedback

    **New Features:**
    - ✨ **Natural Romanization**: Improved Thanglish/Manglish that looks like how you actually type
    - 🎯 **Phonetic Matching**: Gives partial credit for sounds that are close (zh/z/l variations)
    - 📊 **Enhanced Feedback**: More accurate scoring with linguistic awareness

    **How to use:**
    1. Select your language
    2. Generate a practice sentence
    3. Record yourself reading it aloud
    4. Get instant enhanced feedback on your pronunciation!
    """)

    # Language choices come from the keys of LANG_CODES.
    with gr.Row():
        with gr.Column(scale=2):
            lang_choice = gr.Dropdown(
                choices=list(LANG_CODES.keys()),
                value="Malayalam",
                label="🌍 Choose Language"
            )
        with gr.Column(scale=1):
            gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

    # Filled by gen_btn; also passed back in as the reference text for analysis.
    intended_display = gr.Textbox(
        label="📝 Practice Sentence (Read this aloud)",
        interactive=False,
        placeholder="Click 'Generate Practice Sentence' to get started...",
        lines=3
    )

    with gr.Row():
        with gr.Column():
            # type="filepath": the handler receives a path to the recording.
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="🎤 Record Your Pronunciation"
            )
        with gr.Column():
            # Decoding parameters forwarded to compare_pronunciation.
            gr.Markdown("### ⚙️ Advanced Settings")
            pass1_beam = gr.Slider(1, 10, value=5, step=1, label="Beam Size (accuracy vs speed)")
            pass1_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature (creativity)")

    analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")

    # Output panels, in the same order as compare_pronunciation's return tuple.
    with gr.Row():
        with gr.Column():
            pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
            actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Natural Romanized)", interactive=False)
        with gr.Column():
            wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
            cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)

    gr.Markdown("### 📋 Enhanced Detailed Analysis")
    feedback_display = gr.HTML()

    # Event handlers
    gen_btn.click(
        fn=get_sentence_for_display,
        inputs=[lang_choice],
        outputs=[intended_display]
    )

    analyze_btn.click(
        fn=compare_pronunciation,
        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
        outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display]
    )
872
 
873
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import random
3
  import difflib
 
 
4
  import jiwer
5
  import torch
6
+ from transformers import (
7
+ WhisperForConditionalGeneration,
8
+ WhisperProcessor,
9
+ AutoModelForCausalLM,
10
+ AutoTokenizer
11
+ )
12
  import spaces
13
  import gc
14
 
15
  # ---------------- CONFIG ---------------- #
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
 
 
18
  MODEL_CONFIGS = {
19
  "English": "openai/whisper-large-v2",
20
  "Tamil": "vasista22/whisper-tamil-large-v2",
 
27
  "Malayalam": "ml"
28
  }
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
# Practice sentences per language, written in each language's native script.
# The keys are the language names used for lookup elsewhere in this file.
SENTENCE_BANK = {
    "English": [
        "The sun sets over the horizon.",
        "Learning languages is fun.",
        "I like to drink coffee in the morning.",
        "Technology helps us communicate better.",
        "Reading books expands our knowledge."
    ],
    "Tamil": [
        "இன்று நல்ல வானிலை உள்ளது.",
        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
        "எனக்கு புத்தகம் படிக்க விருப்பம்.",
        "தமிழ் மொழி மிகவும் அழகானது.",
        "அன்னை தமிழ் எங்கள் தாய்மொழி."
    ],
    "Malayalam": [
        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
        "ഇന്ന് മഴപെയ്യുന്നു.",
        "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
        "കേരളം എന്റെ സ്വന്തം നാടാണ്.",
        "സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്."
    ]
}
53
 
54
# ---------------- MODELS ---------------- #
# Module-level caches so models are loaded lazily and reused across calls.
# current_whisper_model remembers which language its model was loaded for.
current_whisper_model = {"language": None, "model": None, "processor": None}
# Holds the transliteration model and tokenizer once they have been loaded.
qwen_model = {"model": None, "tokenizer": None}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
def _load_whisper_checkpoint(model_id):
    """Load one Whisper checkpoint (float32, on DEVICE) and its processor."""
    model = WhisperForConditionalGeneration.from_pretrained(
        model_id, torch_dtype=torch.float32
    ).to(DEVICE)
    processor = WhisperProcessor.from_pretrained(model_id)
    return model, processor


def load_whisper_model(language_choice):
    """Return the Whisper ``(model, processor)`` pair for ``language_choice``.

    The pair is cached in ``current_whisper_model``; a repeated request for
    the same language returns the cached objects. Switching language drops
    the previous model before loading the new one. If the language-specific
    checkpoint from ``MODEL_CONFIGS`` cannot be loaded, ``openai/whisper-base``
    is loaded instead and cached under the requested language.

    Raises:
        KeyError: if ``language_choice`` is not a key of ``MODEL_CONFIGS``.
        Exception: whatever the fallback load raises, if it fails as well.
    """
    global current_whisper_model

    if (
        current_whisper_model["language"] == language_choice
        and current_whisper_model["model"] is not None
    ):
        return current_whisper_model["model"], current_whisper_model["processor"]

    # Clear previous model. The entries are reset to None instead of being
    # deleted so the cache keeps its keys even if every load below fails;
    # deleting them made the next call raise KeyError on the lookup above.
    if current_whisper_model["model"] is not None:
        current_whisper_model["language"] = None
        current_whisper_model["model"] = None
        current_whisper_model["processor"] = None
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    # Load new model
    model_id = MODEL_CONFIGS[language_choice]
    print(f"Loading Whisper model: {model_id}")

    try:
        model, processor = _load_whisper_checkpoint(model_id)
        print("✓ Whisper model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading Whisper model: {e}")
        # Fallback to base model
        model, processor = _load_whisper_checkpoint("openai/whisper-base")

    current_whisper_model = {
        "language": language_choice,
        "model": model,
        "processor": processor
    }
    return model, processor
106
 
107
def load_qwen_model():
    """Return the cached Qwen2.5-1.5B-Instruct ``(model, tokenizer)`` pair.

    The first successful call loads both objects and stores them in
    ``qwen_model``. Any failure is printed and reported as ``(None, None)``;
    nothing is cached in that case, so a later call will try again.
    """
    global qwen_model

    cached_model = qwen_model["model"]
    if cached_model is not None:
        return cached_model, qwen_model["tokenizer"]

    try:
        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
        print(f"Loading Qwen model: {model_name}")

        on_gpu = DEVICE == "cuda"
        # Half precision with automatic placement on GPU, float32 otherwise.
        load_kwargs = {
            "trust_remote_code": True,
            "torch_dtype": torch.float16 if on_gpu else torch.float32,
            "device_map": "auto" if on_gpu else None,
        }

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

        if DEVICE == "cpu":
            model = model.to(DEVICE)

        model.eval()

        qwen_model = {"model": model, "tokenizer": tokenizer}
        print("✓ Qwen model loaded successfully")
        return model, tokenizer

    except Exception as load_error:
        print(f"✗ Failed to load Qwen model: {load_error}")
        return None, None
138
+
139
+ # ---------------- TRANSLITERATION ---------------- #
140
+
141
def transliterate_with_qwen(text, source_lang):
    """Romanize Tamil or Malayalam ``text`` with the Qwen chat model.

    Args:
        text: Sentence in native script.
        source_lang: "Tamil" selects a Thanglish prompt; any other
            non-English value is treated as Malayalam (Manglish).

    Returns:
        The first line of the model's reply. ``text`` is returned unchanged
        when ``source_lang`` is "English", when ``text`` is blank, when the
        model is unavailable, when the reply is empty, or on any error.
    """
    if source_lang == "English" or not text.strip():
        return text

    model, tokenizer = load_qwen_model()
    if model is None or tokenizer is None:
        return text  # Return original if model fails

    try:
        # Create prompts
        if source_lang == "Tamil":
            system_prompt = "Convert Tamil text to natural Thanglish (how Tamil people type on phones). Only output the romanized text."
            user_prompt = f"Tamil: {text}\nThanglish:"
        else:  # Malayalam
            system_prompt = "Convert Malayalam text to natural Manglish (how Malayalam people type on phones). Only output the romanized text."
            user_prompt = f"Malayalam: {text}\nManglish:"

        # Format for Qwen
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        # Follow the model's actual device; with device_map="auto" that is
        # decided at load time rather than by the DEVICE constant.
        inputs = inputs.to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. Slicing the decoded text by
        # len(prompt) is wrong: the prompt string contains chat special tokens
        # that skip_special_tokens=True removes, so the offset overshoots and
        # cuts into (or past) the reply.
        generated_ids = outputs[0][prompt_token_count:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        # Clean response
        response = response.split('\n')[0].strip()  # Take first line only
        return response if response else text

    except Exception as e:
        print(f"Qwen transliteration error: {e}")
        return text
190
 
191
+ # ---------------- SPEECH RECOGNITION ---------------- #
 
 
 
 
 
192
 
193
@spaces.GPU
def transcribe_audio(audio_path, language_choice):
    """Transcribe the recording at ``audio_path`` with the language's Whisper model.

    Decoding is first attempted with decoder prompt ids that force the target
    language and the "transcribe" task. If that attempt raises, the reason is
    printed and decoding is retried without forced ids.

    Args:
        audio_path: Path to an audio file readable by librosa.
        language_choice: Key into ``LANG_CODES`` / ``MODEL_CONFIGS``.

    Returns:
        The decoded transcription with surrounding whitespace stripped.
    """
    model, processor = load_whisper_model(language_choice)
    lang_code = LANG_CODES[language_choice]

    # Load audio (resampled to 16 kHz, the rate passed to the processor below)
    import librosa
    audio, sr = librosa.load(audio_path, sr=16000)

    # Process audio; features are cast to the model's parameter dtype.
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    input_features = input_features.to(DEVICE, dtype=next(model.parameters()).dtype)

    # Decoding settings shared by both attempts.
    generate_kwargs = {"max_length": 448, "num_beams": 5, "temperature": 0.0}

    # Generate transcription
    with torch.no_grad():
        try:
            forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
            predicted_ids = model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids,
                **generate_kwargs
            )
        except Exception as e:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and report why the retry happens.
            print(f"Forced-language decoding failed ({e}); retrying without forced decoder ids")
            predicted_ids = model.generate(input_features, **generate_kwargs)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription.strip()
228
 
229
+ # ---------------- FEEDBACK SYSTEM ---------------- #
 
 
 
 
230
 
231
def create_feedback(intended, actual, lang_choice):
    """Build the HTML comparison card and a word-level accuracy score.

    Args:
        intended: The sentence the user was asked to read.
        actual: The transcription of what the user said.
        lang_choice: Language name passed through to the transliterator.

    Returns:
        ``(feedback_html, accuracy)`` where ``accuracy`` is
        ``difflib.SequenceMatcher.ratio()`` over the whitespace-split words
        of both sentences, scaled to 0-100.
    """
    # Local import keeps this block self-contained.
    from html import escape

    # Get transliterations
    intended_roman = transliterate_with_qwen(intended, lang_choice)
    actual_roman = transliterate_with_qwen(actual, lang_choice)

    # Simple word-level accuracy
    intended_words = intended.strip().split()
    actual_words = actual.strip().split()
    sm = difflib.SequenceMatcher(None, intended_words, actual_words)
    accuracy = sm.ratio() * 100

    if accuracy >= 90:
        verdict = '🎉 Excellent!'
    elif accuracy >= 70:
        verdict = '👍 Good job!'
    else:
        verdict = '📚 Keep practicing!'

    # The transcription and both romanizations are model output; escape all
    # interpolated text so stray "<", ">" or "&" cannot break or inject markup.
    intended_cell = escape(intended)
    intended_roman_cell = escape(intended_roman)
    actual_cell = escape(actual)
    actual_roman_cell = escape(actual_roman)

    # Create feedback HTML
    feedback_html = f"""
    <div style='font-family: Arial, sans-serif; padding: 20px;'>
        <h3 style='color: #2c3e50; text-align: center;'>📊 Pronunciation Analysis</h3>

        <table style='width: 100%; border-collapse: collapse; margin: 20px 0;'>
            <tr style='background: #f8f9fa;'>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Target</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace;'>{intended_cell}</td>
            </tr>
            <tr style='background: #f8f9fa;'>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Romanized</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace; color: #666;'>{intended_roman_cell}</td>
            </tr>
            <tr>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>You Said</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace;'>{actual_cell}</td>
            </tr>
            <tr>
                <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Your Romanized</td>
                <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace; color: #666;'>{actual_roman_cell}</td>
            </tr>
        </table>

        <div style='text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px;'>
            <h4 style='margin: 0 0 10px 0;'>Accuracy Score</h4>
            <div style='font-size: 36px; font-weight: bold;'>{accuracy:.0f}%</div>
            <div style='margin-top: 10px;'>
                {verdict}
            </div>
        </div>
    </div>
    """

    return feedback_html, accuracy
280
 
281
+ # ---------------- MAIN FUNCTION ---------------- #
282
+
283
@spaces.GPU
def analyze_pronunciation(audio, lang_choice, intended_text):
    """Transcribe a recording and compare it with the practice sentence.

    Args:
        audio: Path to the recorded audio, or None if nothing was recorded.
        lang_choice: Selected language name.
        intended_text: Contents of the practice-sentence box; may include a
            "🔤" marker followed by a romanization, which is ignored.

    Returns:
        A 4-tuple ``(transcription, romanized transcription, WER as a
        percentage string, feedback HTML)``. On missing input, silence, or
        an exception the first element carries the message and the other
        three are empty strings.
    """
    if audio is None or not intended_text.strip():
        return "⚠️ Please record audio and generate a sentence first.", "", "", ""

    try:
        # Extract original sentence (remove romanization if present).
        # partition() yields the whole text when the marker is absent.
        intended_sentence = intended_text.partition("🔤")[0].strip()

        # Transcribe audio
        actual_text = transcribe_audio(audio, lang_choice)

        if not actual_text.strip():
            return "⚠️ No speech detected. Please try recording again.", "", "", ""

        # Only WER is shown in the UI, so the character error rate that was
        # previously computed and discarded is no longer calculated.
        wer_val = jiwer.wer(intended_sentence, actual_text)

        # Get romanizations
        # NOTE(review): create_feedback romanizes actual_text again on its
        # own; reusing one result would need a change to its signature.
        actual_roman = transliterate_with_qwen(actual_text, lang_choice)

        # Create feedback (the numeric accuracy is embedded in the HTML)
        feedback_html, _ = create_feedback(intended_sentence, actual_text, lang_choice)

        return actual_text, actual_roman, f"{wer_val:.1%}", feedback_html

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        # Log as well as display, so the failure is visible in the app logs.
        print(error_msg)
        return error_msg, "", "", ""
316
+
317
+ # ---------------- HELPERS ---------------- #
318
 
319
def get_random_sentence_with_transliteration(language_choice):
    """Pick a random practice sentence for ``language_choice``.

    For Tamil and Malayalam the sentence is followed by a blank line and a
    "🔤"-prefixed romanization; for any other language the bare sentence is
    returned.
    """
    sentence = random.choice(SENTENCE_BANK[language_choice])

    # Guard clause: only Tamil and Malayalam get a romanized line.
    if language_choice not in ["Tamil", "Malayalam"]:
        return sentence

    romanized = transliterate_with_qwen(sentence, language_choice)
    return f"{sentence}\n\n🔤 {romanized}"
327
 
328
  # ---------------- UI ---------------- #
329
+
330
# Gradio layout: language picker, practice sentence, recorder, result panels.
with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
    # Header and usage instructions.
    gr.Markdown("""
    # 🎙️ AI Pronunciation Coach
    ### Practice English, Tamil & Malayalam with AI feedback powered by Qwen2.5

    **Features:**
    - ✨ **Smart Transliteration**: Natural Thanglish/Manglish using Qwen2.5-1.5B-Instruct
    - 🎯 **Accurate Recognition**: Language-specific Whisper models
    - 📊 **Instant Feedback**: Real-time pronunciation analysis

    **How to use:**
    1. Select your language
    2. Generate a practice sentence
    3. Record yourself reading it aloud
    4. Get instant feedback!
    """)

    # Language choices come from the keys of LANG_CODES.
    with gr.Row():
        lang_choice = gr.Dropdown(
            choices=list(LANG_CODES.keys()),
            value="Malayalam",
            label="🌍 Choose Language"
        )
        gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")

    # Filled by gen_btn; also passed back in as the reference text for analysis.
    intended_display = gr.Textbox(
        label="📝 Practice Sentence",
        interactive=False,
        placeholder="Click 'Generate Practice Sentence' to get started...",
        lines=3
    )

    # type="filepath": the handler receives a path to the recording.
    audio_input = gr.Audio(
        sources=["microphone"],
        type="filepath",
        label="🎤 Record Your Pronunciation"
    )

    analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")

    # Output panels, in the same order as analyze_pronunciation's return tuple.
    with gr.Row():
        actual_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
        actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
        wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)

    feedback_display = gr.HTML()

    # Event handlers
    gen_btn.click(
        fn=get_random_sentence_with_transliteration,
        inputs=[lang_choice],
        outputs=[intended_display]
    )

    analyze_btn.click(
        fn=analyze_pronunciation,
        inputs=[audio_input, lang_choice, intended_display],
        outputs=[actual_out, actual_roman_out, wer_out, feedback_display]
    )
389
 
390
  if __name__ == "__main__":