| import gradio as gr | |
| from transformers import pipeline | |
| import numpy as np | |
| import os | |
def _load_classifier(model_id):
    """Build an audio-classification pipeline for the given model identifier."""
    return pipeline(task="audio-classification", model=model_id)


# One classifier per scoring dimension, created at import time so that each
# pipeline is constructed once and reused for every scoring request.
accuracy_classifier = _load_classifier("JohnJumon/pronunciation_accuracy")
fluency_classifier = _load_classifier("JohnJumon/fluency_accuracy")
prosodic_classifier = _load_classifier("JohnJumon/prosodic_accuracy")
# Rubric text shown next to each predicted label. These tables are constant,
# so they are built once at import time rather than on every request.
_ACCURACY_DESCRIPTIONS = {
    'Extremely Poor': 'Extremely poor pronunciation and only one or two words are recognizable',
    'Poor': 'Poor, clumsy and rigid pronunciation of the sentence as a whole, with serious pronunciation mistakes',
    'Average': 'The overall pronunciation of the sentence is understandable, with many pronunciation mistakes and accent, but it does not affect the understanding of basic meanings',
    'Good': 'The overall pronunciation of the sentence is good, with a few pronunciation mistakes',
    'Excellent': 'The overall pronunciation of the sentence is excellent, with accurate phonology and no obvious pronunciation mistakes',
}

_FLUENCY_DESCRIPTIONS = {
    'Very Influent': 'Intermittent, very influent speech, with lots of pauses, repetition, and stammering',
    'Influent': 'The speech is a little influent, with many pauses, repetition, and stammering',
    'Average': 'Fluent in general, with a few pauses, repetition, and stammering',
    'Fluent': 'Fluent without noticeable pauses or stammering',
}

_PROSODIC_DESCRIPTIONS = {
    'Poor': 'Poor intonation and lots of stammering and pauses, unable to read a complete sentence',
    'Unstable': 'Unstable speech speed, speak too fast or too slow, without the sense of rhythm',
    'Stable': 'Unstable speech speed, many stammering and pauses with a poor sense of rhythm',
    'Almost': 'Nearly correct intonation at a stable speaking speed, nearly smooth and coherent, but with little stammering and few pauses',
    'Perfect': 'Correct intonation at a stable speaking speed, speak with cadence, and can speak like a native',
}


def _top_label(predictions):
    """Return the ``'label'`` of the entry with the highest ``'score'``."""
    return max(predictions, key=lambda prediction: prediction['score'])['label']


def pronunciation_scoring(audio):
    """Score a recording for pronunciation accuracy, fluency and prosody.

    The recording is passed to the accuracy, fluency and prosodic classifiers
    (in that order); for each one the highest-scoring label is kept and paired
    with its rubric description.

    Args:
        audio: The recording to score, forwarded unchanged to each classifier.
            ``None`` means nothing was recorded.

    Returns:
        A 6-tuple ``(accuracy_label, accuracy_description, fluency_label,
        fluency_description, prosodic_label, prosodic_description)``.

    Raises:
        gr.Error: If ``audio`` is ``None``.
        KeyError: If a classifier returns a label that has no rubric entry.
    """
    # Without this guard a missing recording reaches the pipelines and fails
    # with an opaque error; gr.Error lets the UI show an actionable message.
    if audio is None:
        raise gr.Error("No audio received. Please record your voice and try again.")

    accuracy_label = _top_label(accuracy_classifier(audio))
    fluency_label = _top_label(fluency_classifier(audio))
    prosodic_label = _top_label(prosodic_classifier(audio))

    return (
        accuracy_label,
        _ACCURACY_DESCRIPTIONS[accuracy_label],
        fluency_label,
        _FLUENCY_DESCRIPTIONS[fluency_label],
        prosodic_label,
        _PROSODIC_DESCRIPTIONS[prosodic_label],
    )
def _result_components():
    """Create the output widgets in the order the scoring function returns values.

    Each category gets a Label for the predicted class followed by an
    unlabeled, non-interactive Textbox for its description.
    """
    widgets = []
    for heading in ("Accuracy Result", "Fluency Result", "Prosodic Result"):
        widgets.append(gr.Label(label=heading))
        widgets.append(gr.Textbox(interactive=False, show_label=False))
    return widgets


# Web UI: microphone input delivered as a file path, six outputs, and one
# bundled example clip located next to this script.
gradio_app = gr.Interface(
    fn=pronunciation_scoring,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=_result_components(),
    title="Pronunciation Scoring",
    description="This app will score your pronunciation accuracy, fluency, and prosodic (intonation)",
    examples=[[os.path.join(os.path.dirname(__file__), "audio.wav")]],
)


# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    gradio_app.launch()