Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Running on Zero

App Files Files Community

sudhanm committed 4 days ago

Commit

751fdfd

verified ·

1 Parent(s): 5954007

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -49

app.py CHANGED Viewed

@@ -313,55 +313,99 @@ def transcribe_audio(audio_path, language_choice):
 # ---------------- FEEDBACK SYSTEM ---------------- #
 def create_feedback(intended, actual, lang_choice):
     """Create a simple HTML feedback comparison.

     Args:
         intended: The sentence the learner was asked to read.
         actual: The text that was transcribed from the recording.
         lang_choice: Language selector forwarded to ``transliterate_with_qwen``.

     Returns:
         A ``(feedback_html, accuracy)`` tuple: an HTML fragment with a
         comparison table and score banner, and the word-level similarity
         as a percentage in the range 0-100.
     """
     import html

     # Get transliterations
     intended_roman = transliterate_with_qwen(intended, lang_choice)
     actual_roman = transliterate_with_qwen(actual, lang_choice)
     # Calculate accuracy
     intended_words = intended.strip().split()
     actual_words = actual.strip().split()
     # Simple word-level accuracy
     sm = difflib.SequenceMatcher(None, intended_words, actual_words)
     accuracy = sm.ratio() * 100
     # Transcribed and transliterated text is not trusted markup: escape it so
     # characters such as "<" or "&" are displayed literally instead of being
     # interpreted as HTML.
     intended_html = html.escape(intended)
     intended_roman_html = html.escape(intended_roman)
     actual_html = html.escape(actual)
     actual_roman_html = html.escape(actual_roman)
     if accuracy >= 90:
         verdict = '🎉 Excellent!'
     elif accuracy >= 70:
         verdict = '👍 Good job!'
     else:
         verdict = '📚 Keep practicing!'
     # Create feedback HTML
     feedback_html = f"""
     <div style='font-family: Arial, sans-serif; padding: 20px;'>
         <h3 style='color: #2c3e50; text-align: center;'>📊 Pronunciation Analysis</h3>
         <table style='width: 100%; border-collapse: collapse; margin: 20px 0;'>
             <tr style='background: #f8f9fa;'>
                 <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Target</td>
                 <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace;'>{intended_html}</td>
             </tr>
             <tr style='background: #f8f9fa;'>
                 <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Romanized</td>
                 <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace; color: #666;'>{intended_roman_html}</td>
             </tr>
             <tr>
                 <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>You Said</td>
                 <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace;'>{actual_html}</td>
             </tr>
             <tr>
                 <td style='padding: 15px; font-weight: bold; border: 1px solid #ddd;'>Your Romanized</td>
                 <td style='padding: 15px; border: 1px solid #ddd; font-family: monospace; color: #666;'>{actual_roman_html}</td>
             </tr>
         </table>
         <div style='text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px;'>
             <h4 style='margin: 0 0 10px 0;'>Accuracy Score</h4>
             <div style='font-size: 36px; font-weight: bold;'>{accuracy:.0f}%</div>
             <div style='margin-top: 10px;'>
                 {verdict}
             </div>
         </div>
     </div>
     """
     return feedback_html, accuracy
 # ---------------- MAIN FUNCTION ---------------- #
@@ -369,7 +413,7 @@ def create_feedback(intended, actual, lang_choice):
 def analyze_pronunciation(audio, lang_choice, intended_text):
     """Main function to analyze pronunciation"""
     if audio is None or not intended_text.strip():
-        return "⚠️ Please record audio and generate a sentence first.", "", "", ""
     try:
         # Extract original sentence (remove romanization if present)
@@ -382,7 +426,7 @@ def analyze_pronunciation(audio, lang_choice, intended_text):
         actual_text = transcribe_audio(audio, lang_choice)
         if not actual_text.strip():
-            return "⚠️ No speech detected. Please try recording again.", "", "", ""
         # Calculate metrics
         wer_val = jiwer.wer(intended_sentence, actual_text)
@@ -391,13 +435,13 @@ def analyze_pronunciation(audio, lang_choice, intended_text):
         # Get romanizations
         actual_roman = transliterate_with_qwen(actual_text, lang_choice)
-        # Create feedback
-        feedback_html, accuracy = create_feedback(intended_sentence, actual_text, lang_choice)
-        return actual_text, actual_roman, f"{wer_val:.1%}", feedback_html
     except Exception as e:
-        return f"❌ Error: {str(e)}", "", "", ""
 # ---------------- HELPERS ---------------- #
@@ -415,18 +459,18 @@ def get_random_sentence_with_transliteration(language_choice):
 with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🎙️ AI Pronunciation Coach
-    ### Practice English, Tamil & Malayalam with AI feedback powered by Gemma-3-4B
     **Features:**
-    - ✨ **Smart Transliteration**: Natural Thanglish/Manglish using Gemma-3-4B-IT
     - 🎯 **Accurate Recognition**: Language-specific Whisper models
-    - 📊 **Instant Feedback**: Real-time pronunciation analysis
     **How to use:**
     1. Select your language
     2. Generate a practice sentence
     3. Record yourself reading it aloud
-    4. Get instant feedback!
     """)
     with gr.Row():
@@ -457,7 +501,24 @@ with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
         actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
         wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
-    feedback_display = gr.HTML()
     # Event handlers
     gen_btn.click(
@@ -469,7 +530,7 @@ with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
     analyze_btn.click(
         fn=analyze_pronunciation,
         inputs=[audio_input, lang_choice, intended_display],
-        outputs=[actual_out, actual_roman_out, wer_out, feedback_display]
     )
 if __name__ == "__main__":

 # ---------------- FEEDBACK SYSTEM ---------------- #
def normalize_text_for_comparison(text):
    """Remove punctuation and normalize text for fair comparison.

    Drops every ASCII punctuation/symbol character listed in
    ``string.punctuation`` as well as any Unicode punctuation (general
    category ``P*``: curly quotes, ellipsis, danda, dashes, ...), collapses
    runs of whitespace to single spaces and lower-cases the result.
    Combining marks (vowel signs, virama) are not punctuation and are kept.

    Args:
        text: The string to normalize. ``None`` or an empty string yields ``""``.

    Returns:
        The normalized string.
    """
    import string
    import unicodedata

    if not text:
        return ""

    ascii_punctuation = set(string.punctuation)
    kept_chars = [
        ch
        for ch in text
        if ch not in ascii_punctuation
        and not unicodedata.category(ch).startswith("P")
    ]
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join("".join(kept_chars).split()).lower()
 def create_feedback(intended, actual, lang_choice):
     """Create feedback tables and a message comparing intended vs. spoken text.

     Args:
         intended: The sentence the learner was asked to read.
         actual: The text that was transcribed from the recording.
         lang_choice: Language selector forwarded to ``transliterate_with_qwen``.

     Returns:
         A ``(comparison_data, wrong_pronunciations, message, accuracy)`` tuple:

         * ``comparison_data`` - rows of ``[metric, value]`` for the overview table.
         * ``wrong_pronunciations`` - rows of ``[expected word, expected
           romanized, spoken word, spoken romanized]`` for every word that
           differs, is missing ("(Not spoken)") or is extra ("(Not expected)").
         * ``message`` - a motivational message chosen from the accuracy score.
         * ``accuracy`` - word-level similarity as a percentage (0-100).
     """
     from itertools import zip_longest

     # Get transliterations
     intended_roman = transliterate_with_qwen(intended, lang_choice)
     actual_roman = transliterate_with_qwen(actual, lang_choice)

     # Normalize for comparison (remove punctuation), then split into words
     intended_words = normalize_text_for_comparison(intended).split()
     actual_words = normalize_text_for_comparison(actual).split()

     # Simple word-level accuracy
     sm = difflib.SequenceMatcher(None, intended_words, actual_words)
     accuracy = sm.ratio() * 100

     # Create comparison data for table
     comparison_data = [
         ["Target Text", intended],
         ["Target (Romanized)", intended_roman],
         ["Your Speech", actual],
         ["Your Speech (Romanized)", actual_roman],
         ["Accuracy Score", f"{accuracy:.1f}%"],
     ]

     # Transliterate each distinct word once per call; the same word can
     # appear in several rows.
     roman_by_word = {}

     def romanize(word):
         if word not in roman_by_word:
             roman_by_word[word] = transliterate_with_qwen(word, lang_choice)
         return roman_by_word[word]

     # Find incorrect words for pronunciation table
     wrong_pronunciations = []
     for tag, i1, i2, j1, j2 in sm.get_opcodes():
         if tag == 'equal':
             continue
         # 'delete' has an empty spoken slice and 'insert' an empty expected
         # slice; 'replace' may have slices of different length. zip_longest
         # pads the shorter side with None so surplus words are reported as
         # missing/extra instead of being dropped.
         pairs = zip_longest(intended_words[i1:i2], actual_words[j1:j2])
         for expected_word, spoken_word in pairs:
             if expected_word == spoken_word:
                 continue
             if spoken_word is None:
                 # Missing word
                 wrong_pronunciations.append(
                     [expected_word, romanize(expected_word), "(Not spoken)", ""]
                 )
             elif expected_word is None:
                 # Extra word
                 wrong_pronunciations.append(
                     ["(Not expected)", "", spoken_word, romanize(spoken_word)]
                 )
             else:
                 # Word that was pronounced differently
                 wrong_pronunciations.append(
                     [
                         expected_word,
                         romanize(expected_word),
                         spoken_word,
                         romanize(spoken_word),
                     ]
                 )

     # Create motivational message
     if accuracy >= 95:
         message = "🎉 Outstanding! Perfect pronunciation!"
     elif accuracy >= 85:
         message = "🌟 Excellent! Very natural sounding!"
     elif accuracy >= 70:
         message = "👍 Good job! Your pronunciation is improving!"
     elif accuracy >= 50:
         message = "📚 Getting there! Focus on the highlighted sounds!"
     else:
         message = "💪 Keep practicing! Every attempt makes you better!"
     return comparison_data, wrong_pronunciations, message, accuracy
 # ---------------- MAIN FUNCTION ---------------- #
 def analyze_pronunciation(audio, lang_choice, intended_text):
     """Main function to analyze pronunciation"""
     if audio is None or not intended_text.strip():
+        return "⚠️ Please record audio and generate a sentence first.", "", "", [], [], ""
     try:
         # Extract original sentence (remove romanization if present)
         actual_text = transcribe_audio(audio, lang_choice)
         if not actual_text.strip():
+            return "⚠️ No speech detected. Please try recording again.", "", "", [], [], ""
         # Calculate metrics
         wer_val = jiwer.wer(intended_sentence, actual_text)
         # Get romanizations
         actual_roman = transliterate_with_qwen(actual_text, lang_choice)
+        # Create feedback tables
+        comparison_data, wrong_pronunciations, message, accuracy = create_feedback(intended_sentence, actual_text, lang_choice)
+        return actual_text, actual_roman, f"{wer_val:.1%}", comparison_data, wrong_pronunciations, message
     except Exception as e:
+        return f"❌ Error: {str(e)}", "", "", [], [], ""
 # ---------------- HELPERS ---------------- #
 with gr.Blocks(title="AI Pronunciation Coach", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🎙️ AI Pronunciation Coach
+    ### Practice English, Tamil & Malayalam with AI feedback powered by top open-source LLMs
     **Features:**
+    - ✨ **Advanced Transliteration**: Natural Thanglish/Manglish using Qwen2.5-7B/Llama3.1-8B
     - 🎯 **Accurate Recognition**: Language-specific Whisper models
+    - 📊 **Smart Analysis**: Punctuation-aware comparison with correction tables
     **How to use:**
     1. Select your language
     2. Generate a practice sentence
     3. Record yourself reading it aloud
+    4. Get instant feedback with detailed analysis!
     """)
     with gr.Row():
         actual_roman_out = gr.Textbox(label="🔤 Your Pronunciation (Romanized)", interactive=False)
         wer_out = gr.Textbox(label="📊 Word Error Rate", interactive=False)
+    # Analysis tables
+    gr.Markdown("### 📊 Analysis Results")
+    with gr.Row():
+        with gr.Column():
+            comparison_table = gr.Dataframe(
+                headers=["Metric", "Value"],
+                label="📋 Overall Comparison",
+                interactive=False
+            )
+        with gr.Column():
+            pronunciation_table = gr.Dataframe(
+                headers=["Expected Word", "Expected (Romanized)", "You Said", "You Said (Romanized)"],
+                label="❌ Pronunciation Corrections Needed",
+                interactive=False
+            )
+    feedback_message = gr.Textbox(label="💬 Feedback", interactive=False)
     # Event handlers
     gen_btn.click(
     analyze_btn.click(
         fn=analyze_pronunciation,
         inputs=[audio_input, lang_choice, intended_display],
+        outputs=[actual_out, actual_roman_out, wer_out, comparison_table, pronunciation_table, feedback_message]
     )
 if __name__ == "__main__":