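# PhonoLearn: a Gradio app that transcribes a recorded practice sentence with a
# fine-tuned XLS-R ASR model and highlights character-level differences against
# the target sentence (Tâi-lô romanization).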
import gradio as gr
import numpy as np
from difflib import Differ
# import spaces #[uncomment to use ZeroGPU]
import torch
from transformers import pipeline
device = "cuda" if torch.cuda.is_available() else "cpu" | |
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6" | |
if torch.cuda.is_available(): | |
torch_dtype = torch.float16 | |
else: | |
torch_dtype = torch.float32 | |
pipe = pipeline(task="automatic-speech-recognition", model=model_repo_id, device=device) | |
# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(audio, target):
    sampling_rate, wav = audio
    # Mix multi-channel recordings down to mono.
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    # Normalize to [-1, 1]; skip silent clips to avoid division by zero.
    wav = wav.astype(np.float32)
    peak = np.max(np.abs(wav))
    if peak > 0:
        wav /= peak
    # Pass the sampling rate so the pipeline can resample to the model's expected rate.
    user_pron = pipe({"sampling_rate": sampling_rate, "raw": wav})["text"]
    # Character-level diff of target vs. recognized text, as (token, label) pairs for gr.HighlightedText.
    d = Differ()
    d_toks = [(tok[2:], tok[0] if tok[0] != " " else None) for tok in d.compare(target, user_pron)]
    return (user_pron, d_toks)
css = """ | |
#col-container { | |
margin: 0 auto; | |
max-width: 640px; | |
} | |
""" | |
with gr.Blocks(css=css) as demo:
    gr.Markdown(" # PhonoLearn")
    target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
    input_audio = gr.Audio(
        sources=["microphone", "upload"]
    )
    output = gr.Textbox(label='Your Pronunciation')
    diff = gr.HighlightedText(
        label='Comparison',
        combine_adjacent=True,
        show_legend=True,
        color_map={'+': 'red', '-': 'green'}
    )
    input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])
if __name__ == "__main__":
    demo.launch()