Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -313,55 +313,99 @@ def transcribe_audio(audio_path, language_choice):
|
|
313 |
|
314 |
# ---------------- FEEDBACK SYSTEM ---------------- #
|
315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
def create_feedback(intended, actual, lang_choice):
|
317 |
-
"""Create simple feedback comparison"""
|
318 |
# Get transliterations
|
319 |
intended_roman = transliterate_with_qwen(intended, lang_choice)
|
320 |
actual_roman = transliterate_with_qwen(actual, lang_choice)
|
321 |
|
|
|
|
|
|
|
|
|
322 |
# Calculate accuracy
|
323 |
-
intended_words =
|
324 |
-
actual_words =
|
325 |
|
326 |
# Simple word-level accuracy
|
327 |
sm = difflib.SequenceMatcher(None, intended_words, actual_words)
|
328 |
accuracy = sm.ratio() * 100
|
329 |
|
330 |
-
# Create
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
|
364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
|
366 |
# ---------------- MAIN FUNCTION ---------------- #
|
367 |
|
@@ -369,7 +413,7 @@ def create_feedback(intended, actual, lang_choice):
|
|
369 |
def analyze_pronunciation(audio, lang_choice, intended_text):
|
370 |
"""Main function to analyze pronunciation"""
|
371 |
if audio is None or not intended_text.strip():
|
372 |
-
return "⚠️ Please record audio and generate a sentence first.", "", "", ""
|
373 |
|
374 |
try:
|
375 |
# Extract original sentence (remove romanization if present)
|
@@ -382,7 +426,7 @@ def analyze_pronunciation(audio, lang_choice, intended_text):
|
|
382 |
actual_text = transcribe_audio(audio, lang_choice)
|
383 |
|
384 |
if not actual_text.strip():
|
385 |
-
return "⚠️ No speech detected. Please try recording again.", "", "", ""
|
386 |
|
387 |
# Calculate metrics
|
388 |
wer_val = jiwer.wer(intended_sentence, actual_text)
|
@@ -391,13 +435,13 @@ def analyze_pronunciation(audio, lang_choice, intended_text):
|
|
391 |
# Get romanizations
|
392 |
actual_roman = transliterate_with_qwen(actual_text, lang_choice)
|
393 |
|
394 |
-
# Create feedback
|
395 |
-
|
396 |
|
397 |
-
return actual_text, actual_roman, f"{wer_val:.1%}",
|
398 |
|
399 |
except Exception as e:
|
400 |
-
return f"β Error: {str(e)}", "", "", ""
|
401 |
|
402 |
# ---------------- HELPERS ---------------- #
|
403 |
|
@@ -415,18 +459,18 @@ def get_random_sentence_with_transliteration(language_choice):
|
|
415 |
with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
|
416 |
gr.Markdown("""
|
417 |
# 🎙️ AI Pronunciation Coach
|
418 |
-
### Practice English, Tamil & Malayalam with AI feedback powered by
|
419 |
|
420 |
**Features:**
|
421 |
-
- ✨ **
|
422 |
- 🎯 **Accurate Recognition**: Language-specific Whisper models
|
423 |
-
- π **
|
424 |
|
425 |
**How to use:**
|
426 |
1. Select your language
|
427 |
2. Generate a practice sentence
|
428 |
3. Record yourself reading it aloud
|
429 |
-
4. Get instant feedback!
|
430 |
""")
|
431 |
|
432 |
with gr.Row():
|
@@ -457,7 +501,24 @@ with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
|
|
457 |
actual_roman_out = gr.Textbox(label="🎤 Your Pronunciation (Romanized)", interactive=False)
|
458 |
wer_out = gr.Textbox(label="π Word Error Rate", interactive=False)
|
459 |
|
460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
461 |
|
462 |
# Event handlers
|
463 |
gen_btn.click(
|
@@ -469,7 +530,7 @@ with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
|
|
469 |
analyze_btn.click(
|
470 |
fn=analyze_pronunciation,
|
471 |
inputs=[audio_input, lang_choice, intended_display],
|
472 |
-
outputs=[actual_out, actual_roman_out, wer_out,
|
473 |
)
|
474 |
|
475 |
if __name__ == "__main__":
|
|
|
313 |
|
314 |
# ---------------- FEEDBACK SYSTEM ---------------- #
|
315 |
|
316 |
+
def normalize_text_for_comparison(text):
|
317 |
+
"""Remove punctuation and normalize text for fair comparison"""
|
318 |
+
import string
|
319 |
+
# Remove punctuation and extra spaces
|
320 |
+
text = text.translate(str.maketrans('', '', string.punctuation))
|
321 |
+
text = ' '.join(text.split()) # Normalize spaces
|
322 |
+
return text.lower()
|
323 |
+
|
324 |
def create_feedback(intended, actual, lang_choice):
|
325 |
+
"""Create simple feedback comparison with tables"""
|
326 |
# Get transliterations
|
327 |
intended_roman = transliterate_with_qwen(intended, lang_choice)
|
328 |
actual_roman = transliterate_with_qwen(actual, lang_choice)
|
329 |
|
330 |
+
# Normalize for comparison (remove punctuation)
|
331 |
+
intended_normalized = normalize_text_for_comparison(intended)
|
332 |
+
actual_normalized = normalize_text_for_comparison(actual)
|
333 |
+
|
334 |
# Calculate accuracy
|
335 |
+
intended_words = intended_normalized.split()
|
336 |
+
actual_words = actual_normalized.split()
|
337 |
|
338 |
# Simple word-level accuracy
|
339 |
sm = difflib.SequenceMatcher(None, intended_words, actual_words)
|
340 |
accuracy = sm.ratio() * 100
|
341 |
|
342 |
+
# Create comparison data for table
|
343 |
+
comparison_data = [
|
344 |
+
["Target Text", intended],
|
345 |
+
["Target (Romanized)", intended_roman],
|
346 |
+
["Your Speech", actual],
|
347 |
+
["Your Speech (Romanized)", actual_roman],
|
348 |
+
["Accuracy Score", f"{accuracy:.1f}%"]
|
349 |
+
]
|
350 |
+
|
351 |
+
# Find incorrect words for pronunciation table
|
352 |
+
wrong_pronunciations = []
|
353 |
+
|
354 |
+
# Get word-level differences
|
355 |
+
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
356 |
+
if tag == 'replace':
|
357 |
+
# Words that were pronounced differently
|
358 |
+
for idx in range(max(i2-i1, j2-j1)):
|
359 |
+
expected_word = intended_words[i1 + idx] if (i1 + idx) < i2 else ""
|
360 |
+
actual_word = actual_words[j1 + idx] if (j1 + idx) < j2 else ""
|
361 |
+
|
362 |
+
if expected_word and actual_word and expected_word != actual_word:
|
363 |
+
# Get romanized versions
|
364 |
+
expected_roman = transliterate_with_qwen(expected_word, lang_choice)
|
365 |
+
actual_roman = transliterate_with_qwen(actual_word, lang_choice)
|
366 |
+
|
367 |
+
wrong_pronunciations.append([
|
368 |
+
expected_word,
|
369 |
+
expected_roman,
|
370 |
+
actual_word,
|
371 |
+
actual_roman
|
372 |
+
])
|
373 |
+
elif tag == 'delete':
|
374 |
+
# Missing words
|
375 |
+
for idx in range(i2-i1):
|
376 |
+
expected_word = intended_words[i1 + idx]
|
377 |
+
expected_roman = transliterate_with_qwen(expected_word, lang_choice)
|
378 |
+
wrong_pronunciations.append([
|
379 |
+
expected_word,
|
380 |
+
expected_roman,
|
381 |
+
"(Not spoken)",
|
382 |
+
""
|
383 |
+
])
|
384 |
+
elif tag == 'insert':
|
385 |
+
# Extra words
|
386 |
+
for idx in range(j2-j1):
|
387 |
+
actual_word = actual_words[j1 + idx]
|
388 |
+
actual_roman = transliterate_with_qwen(actual_word, lang_choice)
|
389 |
+
wrong_pronunciations.append([
|
390 |
+
"(Not expected)",
|
391 |
+
"",
|
392 |
+
actual_word,
|
393 |
+
actual_roman
|
394 |
+
])
|
395 |
|
396 |
+
# Create motivational message
|
397 |
+
if accuracy >= 95:
|
398 |
+
message = "π Outstanding! Perfect pronunciation!"
|
399 |
+
elif accuracy >= 85:
|
400 |
+
message = "π Excellent! Very natural sounding!"
|
401 |
+
elif accuracy >= 70:
|
402 |
+
message = "π Good job! Your pronunciation is improving!"
|
403 |
+
elif accuracy >= 50:
|
404 |
+
message = "π Getting there! Focus on the highlighted sounds!"
|
405 |
+
else:
|
406 |
+
message = "💪 Keep practicing! Every attempt makes you better!"
|
407 |
+
|
408 |
+
return comparison_data, wrong_pronunciations, message, accuracy
|
409 |
|
410 |
# ---------------- MAIN FUNCTION ---------------- #
|
411 |
|
|
|
413 |
def analyze_pronunciation(audio, lang_choice, intended_text):
|
414 |
"""Main function to analyze pronunciation"""
|
415 |
if audio is None or not intended_text.strip():
|
416 |
+
return "⚠️ Please record audio and generate a sentence first.", "", "", [], [], ""
|
417 |
|
418 |
try:
|
419 |
# Extract original sentence (remove romanization if present)
|
|
|
426 |
actual_text = transcribe_audio(audio, lang_choice)
|
427 |
|
428 |
if not actual_text.strip():
|
429 |
+
return "⚠️ No speech detected. Please try recording again.", "", "", [], [], ""
|
430 |
|
431 |
# Calculate metrics
|
432 |
wer_val = jiwer.wer(intended_sentence, actual_text)
|
|
|
435 |
# Get romanizations
|
436 |
actual_roman = transliterate_with_qwen(actual_text, lang_choice)
|
437 |
|
438 |
+
# Create feedback tables
|
439 |
+
comparison_data, wrong_pronunciations, message, accuracy = create_feedback(intended_sentence, actual_text, lang_choice)
|
440 |
|
441 |
+
return actual_text, actual_roman, f"{wer_val:.1%}", comparison_data, wrong_pronunciations, message
|
442 |
|
443 |
except Exception as e:
|
444 |
+
return f"β Error: {str(e)}", "", "", [], [], ""
|
445 |
|
446 |
# ---------------- HELPERS ---------------- #
|
447 |
|
|
|
459 |
with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
|
460 |
gr.Markdown("""
|
461 |
# 🎙️ AI Pronunciation Coach
|
462 |
+
### Practice English, Tamil & Malayalam with AI feedback powered by top open-source LLMs
|
463 |
|
464 |
**Features:**
|
465 |
+
- ✨ **Advanced Transliteration**: Natural Thanglish/Manglish using Qwen2.5-7B/Llama3.1-8B
|
466 |
- 🎯 **Accurate Recognition**: Language-specific Whisper models
|
467 |
+
- π **Smart Analysis**: Punctuation-aware comparison with correction tables
|
468 |
|
469 |
**How to use:**
|
470 |
1. Select your language
|
471 |
2. Generate a practice sentence
|
472 |
3. Record yourself reading it aloud
|
473 |
+
4. Get instant feedback with detailed analysis!
|
474 |
""")
|
475 |
|
476 |
with gr.Row():
|
|
|
501 |
actual_roman_out = gr.Textbox(label="🎤 Your Pronunciation (Romanized)", interactive=False)
|
502 |
wer_out = gr.Textbox(label="π Word Error Rate", interactive=False)
|
503 |
|
504 |
+
# Analysis tables
|
505 |
+
gr.Markdown("### π Analysis Results")
|
506 |
+
|
507 |
+
with gr.Row():
|
508 |
+
with gr.Column():
|
509 |
+
comparison_table = gr.Dataframe(
|
510 |
+
headers=["Metric", "Value"],
|
511 |
+
label="π Overall Comparison",
|
512 |
+
interactive=False
|
513 |
+
)
|
514 |
+
with gr.Column():
|
515 |
+
pronunciation_table = gr.Dataframe(
|
516 |
+
headers=["Expected Word", "Expected (Romanized)", "You Said", "You Said (Romanized)"],
|
517 |
+
label="β Pronunciation Corrections Needed",
|
518 |
+
interactive=False
|
519 |
+
)
|
520 |
+
|
521 |
+
feedback_message = gr.Textbox(label="💬 Feedback", interactive=False)
|
522 |
|
523 |
# Event handlers
|
524 |
gen_btn.click(
|
|
|
530 |
analyze_btn.click(
|
531 |
fn=analyze_pronunciation,
|
532 |
inputs=[audio_input, lang_choice, intended_display],
|
533 |
+
outputs=[actual_out, actual_roman_out, wer_out, comparison_table, pronunciation_table, feedback_message]
|
534 |
)
|
535 |
|
536 |
if __name__ == "__main__":
|