Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,8 @@ import torch
|
|
7 |
from transformers import WhisperForConditionalGeneration, WhisperProcessor
|
8 |
from indic_transliteration import sanscript
|
9 |
from indic_transliteration.sanscript import transliterate
|
|
|
|
|
10 |
|
11 |
# ---------------- CONFIG ---------------- #
|
12 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
@@ -45,35 +47,90 @@ SENTENCE_BANK = {
|
|
45 |
"Learning languages is fun.",
|
46 |
"I like to drink coffee in the morning.",
|
47 |
"Technology helps us communicate better.",
|
48 |
-
"Reading books expands our knowledge."
|
|
|
|
|
|
|
49 |
],
|
50 |
"Tamil": [
|
51 |
"இன்று நல்ல வானிலை உள்ளது.",
|
52 |
"நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
|
53 |
"எனக்கு புத்தகம் படிக்க விருப்பம்.",
|
54 |
"தமிழ் மொழி மிகவும் அழகானது.",
|
55 |
-
"நான் தினமும் பள்ளிக்கு செல்கிறேன்."
|
|
|
|
|
|
|
56 |
],
|
57 |
"Malayalam": [
|
58 |
"എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
|
59 |
"ഇന്ന് മഴപെയ്യുന്നു.",
|
60 |
"ഞാൻ പുസ്തകം വായിക്കുന്നു.",
|
61 |
"കേരളം എന്റെ സ്വന്തം നാടാണ്.",
|
62 |
-
"ഞാൻ മലയാളം പഠിക്കുന്നു."
|
|
|
|
|
|
|
63 |
]
|
64 |
}
|
65 |
|
66 |
-
# ----------------
|
67 |
-
|
68 |
-
|
69 |
-
whisper_processors = {}
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
# ---------------- HELPERS ---------------- #
|
79 |
def get_random_sentence(language_choice):
|
@@ -84,17 +141,55 @@ def is_script(text, lang_name):
|
|
84 |
return bool(pattern.search(text)) if pattern else True
|
85 |
|
86 |
def transliterate_to_hk(text, lang_choice):
|
|
|
|
|
|
|
|
|
87 |
mapping = {
|
88 |
"Tamil": sanscript.TAMIL,
|
89 |
"Malayalam": sanscript.MALAYALAM,
|
90 |
"English": None
|
91 |
}
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
# Get the appropriate model and processor for the language
|
96 |
-
model =
|
97 |
-
processor = whisper_processors[language_choice]
|
98 |
lang_code = LANG_CODES[language_choice]
|
99 |
|
100 |
# Load and process audio
|
@@ -126,110 +221,343 @@ def transcribe_once(audio_path, language_choice, initial_prompt, beam_size, temp
|
|
126 |
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
127 |
return transcription.strip()
|
128 |
|
129 |
-
def
|
130 |
-
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
134 |
if tag == 'equal':
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
elif tag == 'replace':
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
elif tag == 'delete':
|
140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
elif tag == 'insert':
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
-
def char_level_highlight(ref, hyp):
    """Return an HTML string colouring each character of a ref/hyp diff.

    The two strings are aligned character by character with
    ``difflib.SequenceMatcher``:

    * characters present in both are wrapped in a green span,
    * characters of ``ref`` that were replaced or dropped are wrapped in a
      red, underlined span,
    * characters that only occur in ``hyp`` are wrapped in an orange span.

    Args:
        ref: Reference (expected) text.
        hyp: Hypothesis (recognised) text.

    Returns:
        The concatenated ``<span>`` markup; an empty string when both inputs
        are empty.
    """
    # Local import so the block stays self-contained in this file.
    import html

    def wrap(style, chars):
        # Escape every character: the text ends up inside HTML, so a raw
        # '<' or '&' would otherwise corrupt the markup.
        return [f"<span style='{style}'>{html.escape(c)}</span>" for c in chars]

    sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
    out = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            out.extend(wrap("color:green", ref[i1:i2]))
        elif tag in ('replace', 'delete'):
            out.extend(wrap("color:red;text-decoration:underline", ref[i1:i2]))
        elif tag == 'insert':
            out.extend(wrap("color:orange", hyp[j1:j2]))
    return "".join(out)
|
156 |
|
157 |
# ---------------- MAIN ---------------- #
|
158 |
-
|
159 |
-
|
160 |
if audio is None or not intended_sentence.strip():
|
161 |
-
return ("
|
162 |
-
|
163 |
-
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
164 |
-
|
165 |
-
# Pass 1: raw transcription with user-configured decoding parameters
|
166 |
-
actual_text = transcribe_once(audio, language_choice, primer_weak,
|
167 |
-
pass1_beam, pass1_temp, pass1_condition)
|
168 |
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
# ---------------- UI ---------------- #
|
189 |
-
with gr.Blocks(title="Pronunciation Comparator") as demo:
|
190 |
-
gr.Markdown("
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
193 |
with gr.Row():
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
|
199 |
with gr.Row():
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
209 |
|
210 |
with gr.Row():
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
cer_out = gr.Textbox(label="Character Error Rate")
|
218 |
|
219 |
-
gr.Markdown("###
|
220 |
-
|
221 |
-
char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
|
222 |
|
223 |
# Event handlers
|
224 |
-
gen_btn.click(
|
|
|
|
|
|
|
|
|
225 |
|
226 |
-
|
227 |
fn=compare_pronunciation,
|
228 |
-
inputs=[audio_input,
|
229 |
-
outputs=[
|
230 |
-
pass1_out, pass2_out, hk_translit, wer_out, cer_out,
|
231 |
-
diff_html_box, char_html_box, intended_display
|
232 |
-
]
|
233 |
)
|
234 |
|
235 |
if __name__ == "__main__":
|
|
|
7 |
from transformers import WhisperForConditionalGeneration, WhisperProcessor
|
8 |
from indic_transliteration import sanscript
|
9 |
from indic_transliteration.sanscript import transliterate
|
10 |
+
import spaces
|
11 |
+
import gc
|
12 |
|
13 |
# ---------------- CONFIG ---------------- #
|
14 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
47 |
"Learning languages is fun.",
|
48 |
"I like to drink coffee in the morning.",
|
49 |
"Technology helps us communicate better.",
|
50 |
+
"Reading books expands our knowledge.",
|
51 |
+
"Music brings people together.",
|
52 |
+
"Exercise keeps us healthy and strong.",
|
53 |
+
"Cooking is both art and science."
|
54 |
],
|
55 |
"Tamil": [
|
56 |
"இன்று நல்ல வானிலை உள்ளது.",
|
57 |
"நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
|
58 |
"எனக்கு புத்தகம் படிக்க விருப்பம்.",
|
59 |
"தமிழ் மொழி மிகவும் அழகானது.",
|
60 |
+
"நான் தினமும் பள்ளிக்கு செல்கிறேன்.",
|
61 |
+
"எனக்கு இசை கேட்க மிகவும் பிடிக்கும்.",
|
62 |
+
"அன்னை தமிழ் எங்கள் தாய்மொழி.",
|
63 |
+
"நல்ல உணவு உடல் நலத்திற்கு அவசியம்."
|
64 |
],
|
65 |
"Malayalam": [
|
66 |
"എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
|
67 |
"ഇന്ന് മഴപെയ്യുന്നു.",
|
68 |
"ഞാൻ പുസ്തകം വായിക്കുന്നു.",
|
69 |
"കേരളം എന്റെ സ്വന്തം നാടാണ്.",
|
70 |
+
"ഞാൻ മലയാളം പഠിക്കുന്നു.",
|
71 |
+
"സംഗീതം ജീവിതത്തിന്റെ ഭാഗമാണ്.",
|
72 |
+
"നല്ല ആരോഗ്യം വളരെ പ്രധാനമാണ്.",
|
73 |
+
"വിദ്യാഭ്യാസം ജീവിതത്തിൽ അത്യാവശ്യമാണ്."
|
74 |
]
|
75 |
}
|
76 |
|
77 |
+
# ---------------- MEMORY OPTIMIZED MODEL LOADING ---------------- #
|
78 |
+
# Store only currently loaded model to save memory
|
79 |
+
# Single-slot cache used by load_model_for_language(): the language name plus
# the model and processor objects loaded for it. All entries are None until
# the first successful load.
current_model = {"language": None, "model": None, "processor": None}
|
|
|
80 |
|
81 |
+
def _load_whisper_pair(model_id):
    """Load the Whisper model (float32, moved to DEVICE) and its processor."""
    model = WhisperForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float32
    ).to(DEVICE)
    processor = WhisperProcessor.from_pretrained(model_id)
    return model, processor


def load_model_for_language(language_choice):
    """Load the model for a language on demand, keeping only one in memory.

    If the requested language is already cached in ``current_model`` the
    cached pair is returned. Otherwise the previously cached model is
    released, the checkpoint configured in ``MODEL_CONFIGS`` is loaded and,
    if that fails, ``openai/whisper-base`` is loaded instead.

    Args:
        language_choice: Key into ``MODEL_CONFIGS``.

    Returns:
        A ``(model, processor)`` tuple.

    Raises:
        KeyError: If ``language_choice`` is not in ``MODEL_CONFIGS``.
        Exception: Whatever the fallback load raises if it fails as well.
    """
    global current_model

    # If same language is already loaded, return current model
    if current_model.get("language") == language_choice and current_model.get("model") is not None:
        return current_model["model"], current_model["processor"]

    # Clear previous model from memory. The cache is reset to None values
    # rather than having its keys deleted, so that a failed load below
    # cannot leave the dict without the keys the check above reads.
    if current_model.get("model") is not None:
        current_model = {"language": None, "model": None, "processor": None}
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    # Load new model
    model_id = MODEL_CONFIGS[language_choice]
    print(f"Loading {language_choice} model: {model_id}")

    try:
        model, processor = _load_whisper_pair(model_id)
        print(f"✓ {language_choice} model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading {language_choice} model: {e}")
        # Fallback to base whisper model
        print(f"Falling back to openai/whisper-base for {language_choice}")
        model, processor = _load_whisper_pair("openai/whisper-base")

    # The fallback is cached under the requested language too, so it is
    # reused instead of retrying the failing checkpoint on every call.
    current_model = {
        "language": language_choice,
        "model": model,
        "processor": processor
    }

    return model, processor
|
134 |
|
135 |
# ---------------- HELPERS ---------------- #
|
136 |
def get_random_sentence(language_choice):
|
|
|
141 |
return bool(pattern.search(text)) if pattern else True
|
142 |
|
143 |
def transliterate_to_hk(text, lang_choice):
    """Transliterate Tamil or Malayalam text into the sanscript HK scheme.

    Args:
        text: Text to convert; ``None``, empty or whitespace-only input
            yields an empty string.
        lang_choice: One of ``"Tamil"``, ``"Malayalam"`` or ``"English"``.

    Returns:
        The transliterated text. English input is returned unchanged, and so
        is the original text when the conversion fails or produces nothing.
    """
    is_blank = not text or not text.strip()
    if is_blank:
        return ""

    source_schemes = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "English": None
    }
    source_scheme = source_schemes[lang_choice]

    # English has no source scheme: hand the text back untouched.
    if source_scheme is None:
        return text

    try:
        converted = transliterate(text.strip(), source_scheme, sanscript.HK)
        return converted or text
    except Exception as e:
        # Best effort: report the problem and fall back to the input.
        print(f"Transliteration error: {e}")
        return text
|
165 |
+
|
166 |
+
def transliterate_to_roman(text, lang_choice):
    """Transliterate Tamil or Malayalam text into the sanscript IAST scheme.

    Args:
        text: Text to convert; ``None``, empty or whitespace-only input
            yields an empty string.
        lang_choice: One of ``"Tamil"``, ``"Malayalam"`` or ``"English"``.

    Returns:
        The IAST rendering of the stripped text. English input is returned
        as-is, as is the original text when conversion fails or is empty.
    """
    if not (text and text.strip()):
        return ""

    scheme_by_language = dict(
        Tamil=sanscript.TAMIL,
        Malayalam=sanscript.MALAYALAM,
        English=None,
    )
    origin = scheme_by_language[lang_choice]
    if origin is None:
        # Return as-is for English
        return text

    try:
        stripped = text.strip()
        romanized = transliterate(stripped, origin, sanscript.IAST)
        if romanized:
            return romanized
        return text
    except Exception as e:
        # Conversion problems are logged, never raised to the caller.
        print(f"Transliteration error: {e}")
        return text
|
188 |
+
|
189 |
+
@spaces.GPU
|
190 |
+
def transcribe_once(audio_path, language_choice, beam_size, temperature):
|
191 |
# Get the appropriate model and processor for the language
|
192 |
+
model, processor = load_model_for_language(language_choice)
|
|
|
193 |
lang_code = LANG_CODES[language_choice]
|
194 |
|
195 |
# Load and process audio
|
|
|
221 |
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
222 |
return transcription.strip()
|
223 |
|
224 |
+
def _word_cell(word, roman, color=""):
    """Render a cell with a word and its romanization; inputs must be pre-escaped."""
    color_css = f" color: {color};" if color else ""
    return (
        "<td style='padding: 10px;'>"
        f"<div style='font-family: monospace; font-size: 16px;{color_css}'>{word}</div>"
        f"<div style='font-size: 12px; color: #666; font-style: italic;'>{roman}</div>"
        "</td>"
    )


def _note_cell(note, color):
    """Render an italic placeholder cell such as 'Not spoken'."""
    return f"<td style='padding: 10px; color: {color}; font-style: italic;'><em>{note}</em></td>"


def _status_row(number, row_bg, expected_cell, spoken_cell, badge_css, badge_text):
    """Render one row of the word-by-word table from already-built cells."""
    return (
        f"<tr style='background: {row_bg};'>"
        f"<td style='padding: 10px; text-align: center; font-weight: bold;'>{number}</td>"
        f"{expected_cell}"
        f"{spoken_cell}"
        "<td style='padding: 10px; text-align: center;'>"
        f"<span style='{badge_css} padding: 4px 8px; border-radius: 12px; font-size: 12px;'>{badge_text}</span>"
        "</td>"
        "</tr>"
    )


def create_tabular_feedback(intended, actual, lang_choice):
    """Build the HTML pronunciation report comparing target and spoken text.

    The report contains a text-comparison table (original script plus IAST
    romanization), a word-by-word table derived from a ``difflib`` alignment
    of the whitespace-separated words, a summary with the word accuracy and a
    motivational message, and, for Tamil and Malayalam, a Harvard-Kyoto
    reference section.

    All text taken from ``intended`` and ``actual`` (and their
    transliterations) is HTML-escaped before being placed in the markup.

    Args:
        intended: The sentence the user was asked to read.
        actual: The transcription of what the user said.
        lang_choice: Language name passed to the transliteration helpers.

    Returns:
        A ``(feedback_html, accuracy)`` tuple, where ``accuracy`` is the
        percentage of intended words matched exactly (``0`` when the
        intended text has no words).
    """
    # Local import so the block stays self-contained in this file.
    import html

    esc = html.escape

    # Get transliterations
    intended_roman = transliterate_to_roman(intended, lang_choice)
    actual_roman = transliterate_to_roman(actual, lang_choice)

    # Split into words for comparison
    intended_words = intended.strip().split()
    actual_words = actual.strip().split()
    intended_roman_words = intended_roman.strip().split()
    actual_roman_words = actual_roman.strip().split()

    def pick(words, index, stop):
        # Only return a word that belongs to the current opcode range
        # (index < stop) and exists in the list. Bounding by the opcode range
        # keeps a romanization from a *later* word out of a row whose
        # original-script cell is empty.
        if index < stop and index < len(words):
            return esc(words[index])
        return ""

    correct_words = 0
    total_words = len(intended_words)
    word_index = 0
    rows = []

    # Compare words using difflib
    sm = difflib.SequenceMatcher(None, intended_words, actual_words)
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            # Correct words
            for offset in range(i2 - i1):
                word_index += 1
                correct_words += 1
                rows.append(_status_row(
                    word_index,
                    "#d4f6d4",
                    _word_cell(pick(intended_words, i1 + offset, i2),
                               pick(intended_roman_words, i1 + offset, i2)),
                    _word_cell(pick(actual_words, j1 + offset, j2),
                               pick(actual_roman_words, j1 + offset, j2),
                               "#27ae60"),
                    "background: #27ae60; color: white;",
                    "✓ Correct",
                ))
        elif tag == 'replace':
            # Incorrect words; the two sides may differ in length, so the
            # shorter side is padded with empty cells.
            for offset in range(max(i2 - i1, j2 - j1)):
                word_index += 1
                rows.append(_status_row(
                    word_index,
                    "#ffebee",
                    _word_cell(pick(intended_words, i1 + offset, i2),
                               pick(intended_roman_words, i1 + offset, i2)),
                    _word_cell(pick(actual_words, j1 + offset, j2),
                               pick(actual_roman_words, j1 + offset, j2),
                               "#e74c3c"),
                    "background: #e74c3c; color: white;",
                    "✗ Different",
                ))
        elif tag == 'delete':
            # Missing words
            for offset in range(i2 - i1):
                word_index += 1
                rows.append(_status_row(
                    word_index,
                    "#ffeaa7",
                    _word_cell(pick(intended_words, i1 + offset, i2),
                               pick(intended_roman_words, i1 + offset, i2)),
                    _note_cell("Not spoken", "#e17055"),
                    "background: #fdcb6e; color: #2d3436;",
                    "⚠ Missing",
                ))
        elif tag == 'insert':
            # Extra words: not counted against the intended word numbering.
            for offset in range(j2 - j1):
                rows.append(_status_row(
                    "+",
                    "#fab1a0",
                    _note_cell("Not expected", "#636e72"),
                    _word_cell(pick(actual_words, j1 + offset, j2),
                               pick(actual_roman_words, j1 + offset, j2),
                               "#e17055"),
                    "background: #fd79a8; color: white;",
                    "+ Extra",
                ))

    # Calculate accuracy
    accuracy = (correct_words / total_words * 100) if total_words > 0 else 0

    # Motivational message
    if accuracy >= 95:
        message = "<span>🎉 Outstanding! Perfect pronunciation!</span>"
    elif accuracy >= 85:
        message = "<span>🌟 Excellent work! Very clear pronunciation!</span>"
    elif accuracy >= 70:
        message = "<span>👍 Good job! Keep practicing those tricky words!</span>"
    elif accuracy >= 50:
        message = "<span>📚 Making progress! Focus on the highlighted words!</span>"
    else:
        message = "<span>💪 Keep going! Practice makes perfect!</span>"

    table_css = (
        "width: 100%; border-collapse: collapse; background: white; "
        "border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);"
    )

    parts = [
        f"""
    <div style='font-family: Arial, sans-serif; padding: 20px; background: #f8f9fa; border-radius: 12px; margin: 10px 0;'>
        <h3 style='color: #2c3e50; margin-bottom: 20px; text-align: center;'>📊 Pronunciation Analysis</h3>
    """,
        # Overview table
        f"""
    <div style='margin-bottom: 25px;'>
        <h4 style='color: #34495e; margin-bottom: 15px;'>📝 Text Comparison</h4>
        <table style='{table_css}'>
            <thead>
                <tr style='background: #3498db; color: white;'>
                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Type</th>
                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Original Script</th>
                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Roman/IAST</th>
                </tr>
            </thead>
            <tbody>
                <tr style='background: #e8f5e8;'>
                    <td style='padding: 12px; font-weight: bold; color: #27ae60;'>🎯 Target</td>
                    <td style='padding: 12px; font-family: monospace;'>{esc(intended)}</td>
                    <td style='padding: 12px; font-family: monospace; font-style: italic;'>{esc(intended_roman)}</td>
                </tr>
                <tr style='background: #fff3e0;'>
                    <td style='padding: 12px; font-weight: bold; color: #e67e22;'>🗣️ You Said</td>
                    <td style='padding: 12px; font-family: monospace;'>{esc(actual)}</td>
                    <td style='padding: 12px; font-family: monospace; font-style: italic;'>{esc(actual_roman)}</td>
                </tr>
            </tbody>
        </table>
    </div>
    """,
        # Word-by-word analysis
        f"""
    <div style='margin-bottom: 25px;'>
        <h4 style='color: #34495e; margin-bottom: 15px;'>🔍 Word-by-Word Analysis</h4>
        <table style='{table_css}'>
            <thead>
                <tr style='background: #9b59b6; color: white;'>
                    <th style='padding: 12px; text-align: center; font-weight: bold;'>#</th>
                    <th style='padding: 12px; text-align: left; font-weight: bold;'>Expected</th>
                    <th style='padding: 12px; text-align: left; font-weight: bold;'>You Said</th>
                    <th style='padding: 12px; text-align: center; font-weight: bold;'>Status</th>
                </tr>
            </thead>
            <tbody>
    """,
        "\n".join(rows),
        """
            </tbody>
        </table>
    </div>
    """,
        # Summary section
        f"""
    <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 12px; text-align: center;'>
        <h4 style='margin: 0 0 15px 0; font-size: 24px;'>🎯 Performance Summary</h4>
        <div style='display: flex; justify-content: space-around; flex-wrap: wrap; gap: 15px;'>
            <div style='background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; min-width: 150px;'>
                <div style='font-size: 32px; font-weight: bold;'>{accuracy:.1f}%</div>
                <div style='font-size: 14px; opacity: 0.9;'>Word Accuracy</div>
            </div>
            <div style='background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; min-width: 150px;'>
                <div style='font-size: 32px; font-weight: bold;'>{correct_words}/{total_words}</div>
                <div style='font-size: 14px; opacity: 0.9;'>Correct Words</div>
            </div>
        </div>
        <div style='margin-top: 15px; font-size: 18px;'>
    """,
        message,
        """
        </div>
    </div>
    """,
    ]

    # Add HK transliteration section for reference
    if lang_choice in ["Tamil", "Malayalam"]:
        intended_hk = esc(transliterate_to_hk(intended, lang_choice))
        actual_hk = esc(transliterate_to_hk(actual, lang_choice))
        hk_css = (
            "font-family: monospace; background: white; padding: 8px; "
            "border-radius: 4px; display: block; margin-top: 5px;"
        )
        parts.append(f"""
    <div style='margin-top: 20px; padding: 15px; background: #ecf0f1; border-radius: 8px;'>
        <h4 style='color: #2c3e50; margin-bottom: 10px;'>🔤 Harvard-Kyoto Transliteration (for reference)</h4>
        <div style='display: grid; grid-template-columns: 1fr 1fr; gap: 15px;'>
            <div>
                <strong>Expected:</strong><br>
                <span style='{hk_css}'>{intended_hk}</span>
            </div>
            <div>
                <strong>You said:</strong><br>
                <span style='{hk_css}'>{actual_hk}</span>
            </div>
        </div>
    </div>
    """)

    parts.append("</div>")

    feedback_html = "".join(parts)
    return feedback_html, accuracy
|
456 |
+
|
457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
458 |
|
459 |
# ---------------- MAIN ---------------- #
|
460 |
+
@spaces.GPU
def compare_pronunciation(audio, lang_choice, intended_sentence, pass1_beam, pass1_temp):
    """Transcribe a recording and score it against the intended sentence.

    Args:
        audio: Audio input forwarded to ``transcribe_once``; ``None`` when
            nothing was recorded.
        lang_choice: Language name used for transcription and transliteration.
        intended_sentence: The sentence the user was asked to read.
        pass1_beam: Beam size forwarded to ``transcribe_once``.
        pass1_temp: Temperature forwarded to ``transcribe_once``.

    Returns:
        A 5-tuple ``(transcription, romanized transcription, WER, CER,
        feedback HTML)``. WER and CER are formatted as percentages. When
        input is missing, no speech is recognised, or an error occurs, the
        first element carries the message and the other four are empty
        strings.
    """
    # Guard against a missing sentence as well as a blank one so that
    # .strip() is never called on None.
    if audio is None or not intended_sentence or not intended_sentence.strip():
        return ("⚠️ Please record audio and generate a sentence first.", "", "", "", "")

    try:
        # Single transcription pass with user settings
        actual_text = transcribe_once(audio, lang_choice, pass1_beam, pass1_temp)

        if not actual_text.strip():
            return ("⚠️ No speech detected. Please try recording again.", "", "", "", "")

        # Compute metrics
        wer_val = jiwer.wer(intended_sentence, actual_text)
        cer_val = jiwer.cer(intended_sentence, actual_text)

        # Only the spoken text's romanization is returned; the intended
        # sentence is romanized inside create_tabular_feedback where needed.
        actual_roman = transliterate_to_roman(actual_text, lang_choice)

        # Create comprehensive tabular feedback (the accuracy value is
        # already embedded in the HTML and not returned separately).
        feedback_html, _accuracy = create_tabular_feedback(intended_sentence, actual_text, lang_choice)

        return (
            actual_text,
            actual_roman,
            f"{wer_val:.1%}",
            f"{cer_val:.1%}",
            feedback_html
        )

    except Exception as e:
        # UI boundary: surface the failure in the first output field
        # instead of letting the exception escape.
        error_msg = f"❌ Error during transcription: {str(e)}"
        print(error_msg)
        return (error_msg, "", "", "", "")
|
495 |
|
496 |
# ---------------- UI ---------------- #
|
497 |
+
with gr.Blocks(title="Pronunciation Comparator", theme=gr.themes.Soft()) as demo:
|
498 |
+
gr.Markdown("""
|
499 |
+
# 🎙️ AI Pronunciation Coach
|
500 |
+
### Practice English, Tamil & Malayalam with AI feedback
|
501 |
+
|
502 |
+
**How to use:**
|
503 |
+
1. Select your language
|
504 |
+
2. Generate a practice sentence
|
505 |
+
3. Record yourself reading it aloud
|
506 |
+
4. Get instant feedback on your pronunciation!
|
507 |
+
""")
|
508 |
|
509 |
with gr.Row():
|
510 |
+
with gr.Column(scale=2):
|
511 |
+
lang_choice = gr.Dropdown(
|
512 |
+
choices=list(LANG_CODES.keys()),
|
513 |
+
value="Malayalam",
|
514 |
+
label="🌍 Choose Language"
|
515 |
+
)
|
516 |
+
with gr.Column(scale=1):
|
517 |
+
gen_btn = gr.Button("🎲 Generate Practice Sentence", variant="primary")
|
518 |
+
|
519 |
+
intended_display = gr.Textbox(
|
520 |
+
label="📝 Practice Sentence (Read this aloud)",
|
521 |
+
interactive=False,
|
522 |
+
placeholder="Click 'Generate Practice Sentence' to get started..."
|
523 |
+
)
|
524 |
|
525 |
with gr.Row():
|
526 |
+
with gr.Column():
|
527 |
+
audio_input = gr.Audio(
|
528 |
+
sources=["microphone"],
|
529 |
+
type="filepath",
|
530 |
+
label="🎤 Record Your Pronunciation"
|
531 |
+
)
|
532 |
+
with gr.Column():
|
533 |
+
gr.Markdown("### ⚙️ Advanced Settings")
|
534 |
+
pass1_beam = gr.Slider(1, 10, value=5, step=1, label="Beam Size (accuracy vs speed)")
|
535 |
+
pass1_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature (creativity)")
|
536 |
+
|
537 |
+
analyze_btn = gr.Button("🔍 Analyze My Pronunciation", variant="primary", size="lg")
|
538 |
|
539 |
with gr.Row():
|
540 |
+
with gr.Column():
|
541 |
+
pass1_out = gr.Textbox(label="🗣️ What You Said", interactive=False)
|
542 |
+
actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Roman)", interactive=False)
|
543 |
+
with gr.Column():
|
544 |
+
wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
|
545 |
+
cer_out = gr.Textbox(label="📈 Character Error Rate", interactive=False)
|
|
|
546 |
|
547 |
+
gr.Markdown("### 📋 Detailed Analysis")
|
548 |
+
feedback_display = gr.HTML()
|
|
|
549 |
|
550 |
# Event handlers
|
551 |
+
gen_btn.click(
|
552 |
+
fn=get_random_sentence,
|
553 |
+
inputs=[lang_choice],
|
554 |
+
outputs=[intended_display]
|
555 |
+
)
|
556 |
|
557 |
+
analyze_btn.click(
|
558 |
fn=compare_pronunciation,
|
559 |
+
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp],
|
560 |
+
outputs=[pass1_out, actual_roman_out, wer_out, cer_out, feedback_display]
|
|
|
|
|
|
|
561 |
)
|
562 |
|
563 |
if __name__ == "__main__":
|