sudhanm committed on
Commit
57ea064
·
verified ·
1 Parent(s): d031a29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -19
app.py CHANGED
@@ -188,31 +188,66 @@ def transliterate_to_simple_roman(text, lang_choice):
188
  else:
189
  return text
190
 
191
- # Convert IAST to natural Thanglish/Manglish
192
  natural_map = {
193
- # Remove all diacritics and make it natural
194
- 'ā': 'a', 'ī': 'i', 'ū': 'u', 'ē': 'e', 'ō': 'o',
195
- '': 'ng', 'ñ': 'nj', '': 't', '': 'd', '': 'n',
196
- '': 'r', '': 'n', '': 'l', '': 'zh', '': 'ru',
197
- 'ś': 'sh', '': 'sh', '': 'h', '': 'm', '': 'm',
198
- '': 'ri', '': 'li',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  # Common combinations
200
  'kṣ': 'ksh', 'jñ': 'gn', 'śr': 'shr',
201
- # Remove virama marks
202
- '·': '', 'ŕ': 'r', 'ľ': 'l',
203
- # Handle long vowels naturally
 
 
204
  'aa': 'a', 'ii': 'i', 'uu': 'u', 'ee': 'e', 'oo': 'o'
205
  }
206
 
207
  natural_text = iast_text
 
 
208
  for iast, natural in natural_map.items():
209
  natural_text = natural_text.replace(iast, natural)
210
-
211
- # Additional cleanup for natural flow
 
 
 
 
 
 
212
  natural_text = natural_text.replace('zhz', 'zh') # Double zh fix
213
  natural_text = natural_text.replace('nnn', 'nn') # Triple n fix
214
  natural_text = natural_text.replace('lll', 'll') # Triple l fix
 
 
215
 
 
 
 
 
 
 
216
  return natural_text if natural_text else text
217
 
218
  except Exception as e:
@@ -299,10 +334,10 @@ def create_tabular_feedback(intended, actual, lang_choice):
299
  if lang_choice in ["Tamil", "Malayalam"]:
300
  feedback_html += f"""
301
  <div style='margin-bottom: 25px; padding: 15px; border: 2px solid #3498db; border-radius: 8px; background: #f8f9fa;'>
302
- <h4 style='color: #3498db; margin-bottom: 10px;'>🎯 Target Sentence (How to Read)</h4>
303
  <div style='font-size: 20px; font-family: monospace; color: #2c3e50; line-height: 1.4;'>
304
  <strong>Original:</strong> {intended}<br>
305
- <strong>Thanglish/Manglish:</strong> <span style='color: #e67e22; font-weight: bold;'>{intended_roman}</span>
306
  </div>
307
  </div>
308
  """
@@ -316,7 +351,7 @@ def create_tabular_feedback(intended, actual, lang_choice):
316
  <tr style='border-bottom: 2px solid #ddd;'>
317
  <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
318
  <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
319
- <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Thanglish/Manglish</th>
320
  </tr>
321
  </thead>
322
  <tbody>
@@ -590,10 +625,11 @@ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo
590
  )
591
 
592
  intended_transliteration = gr.Textbox(
593
- label="🔤 How to Read (Thanglish/Manglish)",
594
  interactive=False,
595
- placeholder="Natural romanization will appear here...",
596
- visible=False
 
597
  )
598
 
599
  with gr.Row():
@@ -613,7 +649,7 @@ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo
613
  with gr.Row():
614
  with gr.Column():
615
  pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
616
- actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Thanglish/Manglish)", interactive=False)
617
  with gr.Column():
618
  wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
619
  cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
 
188
  else:
189
  return text
190
 
191
+ # Comprehensive cleanup to remove ALL diacritics and make it natural
192
  natural_map = {
193
+ # Vowels with diacritics
194
+ 'ā': 'a', 'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
195
+ 'ī': 'i', 'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
196
+ 'ū': 'u', 'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
197
+ 'ē': 'e', 'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
198
+ 'ō': 'o', 'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
199
+
200
+ # Consonants with diacritics
201
+ 'ṅ': 'ng', 'ň': 'n', 'ñ': 'nj', 'ń': 'n',
202
+ 'ṭ': 't', 'ť': 't', 'ţ': 't',
203
+ 'ḍ': 'd', 'ď': 'd', 'ḏ': 'd',
204
+ 'ṇ': 'n', 'ņ': 'n', 'ṉ': 'n',
205
+ 'ṟ': 'r', 'ř': 'r', 'ŕ': 'r', 'ṛ': 'ru',
206
+ 'ḷ': 'l', 'ľ': 'l', 'ļ': 'l', 'ḻ': 'zh',
207
+ 'ś': 'sh', 'š': 'sh', 'ṣ': 'sh', 'ş': 's',
208
+ 'ḥ': 'h', 'ḫ': 'h', 'ħ': 'h',
209
+ 'ṃ': 'm', 'ṁ': 'm', 'ḿ': 'm',
210
+ 'ç': 'ch', 'č': 'ch',
211
+
212
+ # Vocalic consonants
213
+ 'r̥': 'ri', 'r̥̄': 'ri',
214
+ 'l̥': 'li', 'l̥̄': 'li',
215
+
216
  # Common combinations
217
  'kṣ': 'ksh', 'jñ': 'gn', 'śr': 'shr',
218
+
219
+ # Remove virama and other marks
220
+ '·': '', '̥': '', '̄': '', '̃': '', '̂': '', '̀': '', '́': '',
221
+
222
+ # Double letters cleanup
223
  'aa': 'a', 'ii': 'i', 'uu': 'u', 'ee': 'e', 'oo': 'o'
224
  }
225
 
226
  natural_text = iast_text
227
+
228
+ # Apply all mappings
229
  for iast, natural in natural_map.items():
230
  natural_text = natural_text.replace(iast, natural)
231
+
232
+ # Additional cleanup passes for any remaining diacritics
233
+ import unicodedata
234
+ # Remove all combining diacritical marks
235
+ natural_text = ''.join(c for c in unicodedata.normalize('NFD', natural_text)
236
+ if unicodedata.category(c) != 'Mn')
237
+
238
+ # Fix common Malayalam/Tamil patterns
239
  natural_text = natural_text.replace('zhz', 'zh') # Double zh fix
240
  natural_text = natural_text.replace('nnn', 'nn') # Triple n fix
241
  natural_text = natural_text.replace('lll', 'll') # Triple l fix
242
+ natural_text = natural_text.replace('tth', 'th') # Simplify aspirated
243
+ natural_text = natural_text.replace('ddh', 'dh') # Simplify aspirated
244
 
245
+ # Make it more natural for Manglish/Thanglish
246
+ if lang_choice == "Malayalam":
247
+ natural_text = natural_text.replace('samgitam', 'sangeetham')
248
+ natural_text = natural_text.replace('jivitattinre', 'jeevitathinte')
249
+ natural_text = natural_text.replace('bhagaman', 'bhagamaanu')
250
+
251
  return natural_text if natural_text else text
252
 
253
  except Exception as e:
 
334
  if lang_choice in ["Tamil", "Malayalam"]:
335
  feedback_html += f"""
336
  <div style='margin-bottom: 25px; padding: 15px; border: 2px solid #3498db; border-radius: 8px; background: #f8f9fa;'>
337
+ <h4 style='color: #3498db; margin-bottom: 10px;'>🎯 Target Sentence (Reading Guide)</h4>
338
  <div style='font-size: 20px; font-family: monospace; color: #2c3e50; line-height: 1.4;'>
339
  <strong>Original:</strong> {intended}<br>
340
+ <strong>Romanized:</strong> <span style='color: #e67e22; font-weight: bold;'>{intended_roman}</span>
341
  </div>
342
  </div>
343
  """
 
351
  <tr style='border-bottom: 2px solid #ddd;'>
352
  <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
353
  <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
354
+ <th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Romanized</th>
355
  </tr>
356
  </thead>
357
  <tbody>
 
625
  )
626
 
627
  intended_transliteration = gr.Textbox(
628
+ label="🔤 Pronunciation Guide",
629
  interactive=False,
630
+ placeholder="Pronunciation guide will appear here...",
631
+ visible=False,
632
+ lines=1
633
  )
634
 
635
  with gr.Row():
 
649
  with gr.Row():
650
  with gr.Column():
651
  pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
652
+ actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
653
  with gr.Column():
654
  wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
655
  cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)