Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -188,31 +188,66 @@ def transliterate_to_simple_roman(text, lang_choice):
|
|
188 |
else:
|
189 |
return text
|
190 |
|
191 |
-
#
|
192 |
natural_map = {
|
193 |
-
#
|
194 |
-
'ā': 'a', '
|
195 |
-
'
|
196 |
-
'
|
197 |
-
'
|
198 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
# Common combinations
|
200 |
'kṣ': 'ksh', 'jñ': 'gn', 'śr': 'shr',
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
|
|
204 |
'aa': 'a', 'ii': 'i', 'uu': 'u', 'ee': 'e', 'oo': 'o'
|
205 |
}
|
206 |
|
207 |
natural_text = iast_text
|
|
|
|
|
208 |
for iast, natural in natural_map.items():
|
209 |
natural_text = natural_text.replace(iast, natural)
|
210 |
-
|
211 |
-
# Additional cleanup for
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
natural_text = natural_text.replace('zhz', 'zh') # Double zh fix
|
213 |
natural_text = natural_text.replace('nnn', 'nn') # Triple n fix
|
214 |
natural_text = natural_text.replace('lll', 'll') # Triple l fix
|
|
|
|
|
215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
return natural_text if natural_text else text
|
217 |
|
218 |
except Exception as e:
|
@@ -299,10 +334,10 @@ def create_tabular_feedback(intended, actual, lang_choice):
|
|
299 |
if lang_choice in ["Tamil", "Malayalam"]:
|
300 |
feedback_html += f"""
|
301 |
<div style='margin-bottom: 25px; padding: 15px; border: 2px solid #3498db; border-radius: 8px; background: #f8f9fa;'>
|
302 |
-
<h4 style='color: #3498db; margin-bottom: 10px;'>🎯 Target Sentence (
|
303 |
<div style='font-size: 20px; font-family: monospace; color: #2c3e50; line-height: 1.4;'>
|
304 |
<strong>Original:</strong> {intended}<br>
|
305 |
-
<strong>
|
306 |
</div>
|
307 |
</div>
|
308 |
"""
|
@@ -316,7 +351,7 @@ def create_tabular_feedback(intended, actual, lang_choice):
|
|
316 |
<tr style='border-bottom: 2px solid #ddd;'>
|
317 |
<th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
|
318 |
<th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
|
319 |
-
<th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>
|
320 |
</tr>
|
321 |
</thead>
|
322 |
<tbody>
|
@@ -590,10 +625,11 @@ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo
|
|
590 |
)
|
591 |
|
592 |
intended_transliteration = gr.Textbox(
|
593 |
-
label="🔤
|
594 |
interactive=False,
|
595 |
-
placeholder="
|
596 |
-
visible=False
|
|
|
597 |
)
|
598 |
|
599 |
with gr.Row():
|
@@ -613,7 +649,7 @@ with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo
|
|
613 |
with gr.Row():
|
614 |
with gr.Column():
|
615 |
pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
|
616 |
-
actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (
|
617 |
with gr.Column():
|
618 |
wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
|
619 |
cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
|
|
|
188 |
else:
|
189 |
return text
|
190 |
|
191 |
+
# Comprehensive cleanup to remove ALL diacritics and make it natural
|
192 |
natural_map = {
|
193 |
+
# Vowels with diacritics
|
194 |
+
'ā': 'a', 'á': 'a', 'à': 'a', 'â': 'a', 'ä': 'a',
|
195 |
+
'ī': 'i', 'í': 'i', 'ì': 'i', 'î': 'i', 'ï': 'i',
|
196 |
+
'ū': 'u', 'ú': 'u', 'ù': 'u', 'û': 'u', 'ü': 'u',
|
197 |
+
'ē': 'e', 'é': 'e', 'è': 'e', 'ê': 'e', 'ë': 'e',
|
198 |
+
'ō': 'o', 'ó': 'o', 'ò': 'o', 'ô': 'o', 'ö': 'o',
|
199 |
+
|
200 |
+
# Consonants with diacritics
|
201 |
+
'ṅ': 'ng', 'ň': 'n', 'ñ': 'nj', 'ń': 'n',
|
202 |
+
'ṭ': 't', 'ť': 't', 'ţ': 't',
|
203 |
+
'ḍ': 'd', 'ď': 'd', 'ḏ': 'd',
|
204 |
+
'ṇ': 'n', 'ņ': 'n', 'ṉ': 'n',
|
205 |
+
'ṟ': 'r', 'ř': 'r', 'ŕ': 'r', 'ṛ': 'ru',
|
206 |
+
'ḷ': 'l', 'ľ': 'l', 'ļ': 'l', 'ḻ': 'zh',
|
207 |
+
'ś': 'sh', 'š': 'sh', 'ṣ': 'sh', 'ş': 's',
|
208 |
+
'ḥ': 'h', 'ḫ': 'h', 'ħ': 'h',
|
209 |
+
'ṃ': 'm', 'ṁ': 'm', 'ḿ': 'm',
|
210 |
+
'ç': 'ch', 'č': 'ch',
|
211 |
+
|
212 |
+
# Vocalic consonants
|
213 |
+
'r̥': 'ri', 'r̥̄': 'ri',
|
214 |
+
'l̥': 'li', 'l̥̄': 'li',
|
215 |
+
|
216 |
# Common combinations
|
217 |
'kṣ': 'ksh', 'jñ': 'gn', 'śr': 'shr',
|
218 |
+
|
219 |
+
# Remove virama and other marks
|
220 |
+
'·': '', '̥': '', '̄': '', '̃': '', '̂': '', '̀': '', '́': '',
|
221 |
+
|
222 |
+
# Double letters cleanup
|
223 |
'aa': 'a', 'ii': 'i', 'uu': 'u', 'ee': 'e', 'oo': 'o'
|
224 |
}
|
225 |
|
226 |
natural_text = iast_text
|
227 |
+
|
228 |
+
# Apply all mappings
|
229 |
for iast, natural in natural_map.items():
|
230 |
natural_text = natural_text.replace(iast, natural)
|
231 |
+
|
232 |
+
# Additional cleanup passes for any remaining diacritics
|
233 |
+
import unicodedata
|
234 |
+
# Remove all combining diacritical marks
|
235 |
+
natural_text = ''.join(c for c in unicodedata.normalize('NFD', natural_text)
|
236 |
+
if unicodedata.category(c) != 'Mn')
|
237 |
+
|
238 |
+
# Fix common Malayalam/Tamil patterns
|
239 |
natural_text = natural_text.replace('zhz', 'zh') # Double zh fix
|
240 |
natural_text = natural_text.replace('nnn', 'nn') # Triple n fix
|
241 |
natural_text = natural_text.replace('lll', 'll') # Triple l fix
|
242 |
+
natural_text = natural_text.replace('tth', 'th') # Simplify aspirated
|
243 |
+
natural_text = natural_text.replace('ddh', 'dh') # Simplify aspirated
|
244 |
|
245 |
+
# Make it more natural for Manglish/Thanglish
|
246 |
+
if lang_choice == "Malayalam":
|
247 |
+
natural_text = natural_text.replace('samgitam', 'sangeetham')
|
248 |
+
natural_text = natural_text.replace('jivitattinre', 'jeevitathinte')
|
249 |
+
natural_text = natural_text.replace('bhagaman', 'bhagamaanu')
|
250 |
+
|
251 |
return natural_text if natural_text else text
|
252 |
|
253 |
except Exception as e:
|
|
|
334 |
if lang_choice in ["Tamil", "Malayalam"]:
|
335 |
feedback_html += f"""
|
336 |
<div style='margin-bottom: 25px; padding: 15px; border: 2px solid #3498db; border-radius: 8px; background: #f8f9fa;'>
|
337 |
+
<h4 style='color: #3498db; margin-bottom: 10px;'>🎯 Target Sentence (Reading Guide)</h4>
|
338 |
<div style='font-size: 20px; font-family: monospace; color: #2c3e50; line-height: 1.4;'>
|
339 |
<strong>Original:</strong> {intended}<br>
|
340 |
+
<strong>Romanized:</strong> <span style='color: #e67e22; font-weight: bold;'>{intended_roman}</span>
|
341 |
</div>
|
342 |
</div>
|
343 |
"""
|
|
|
351 |
<tr style='border-bottom: 2px solid #ddd;'>
|
352 |
<th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Type</th>
|
353 |
<th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50; border-right: 1px solid #ddd;'>Original Text</th>
|
354 |
+
<th style='padding: 15px; text-align: left; font-weight: bold; color: #2c3e50;'>Romanized</th>
|
355 |
</tr>
|
356 |
</thead>
|
357 |
<tbody>
|
|
|
625 |
)
|
626 |
|
627 |
intended_transliteration = gr.Textbox(
|
628 |
+
label="🔤 Pronunciation Guide",
|
629 |
interactive=False,
|
630 |
+
placeholder="Pronunciation guide will appear here...",
|
631 |
+
visible=False,
|
632 |
+
lines=1
|
633 |
)
|
634 |
|
635 |
with gr.Row():
|
|
|
649 |
with gr.Row():
|
650 |
with gr.Column():
|
651 |
pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
|
652 |
+
actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
|
653 |
with gr.Column():
|
654 |
wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
|
655 |
cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
|