import gradio as gr
import numpy as np
from difflib import Differ
# import spaces #[uncomment to use ZeroGPU]
import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"

if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_repo_id,
    torch_dtype=torch_dtype,
    device=device,
)


# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(audio, target):
    sampling_rate, wav = audio

    # Downmix to mono and convert to float32
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    wav = wav.astype(np.float32)

    # Peak-normalize; skip silent clips to avoid dividing by zero
    peak = np.max(np.abs(wav))
    if peak > 0:
        wav /= peak

    # Pass the sampling rate along so the pipeline can resample the audio
    # to the rate the model expects
    user_pron = pipe({"sampling_rate": sampling_rate, "raw": wav})['text']

    # Compare the target sentence with the transcription character by character;
    # unchanged characters get a None label so they are not highlighted
    d = Differ()
    d_toks = [
        (i[2:], i[0] if i[0] != " " else None)
        for i in d.compare(target, user_pron)
    ]

    return (user_pron, d_toks)


css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# PhonoLearn")
    target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
    input_audio = gr.Audio(
        sources=["microphone", "upload"]
    )
    output = gr.Textbox(label='Your Pronunciation')
    diff = gr.HighlightedText(
        label='Comparison',
        combine_adjacent=True,
        show_legend=True,
        color_map={'+': 'red', '-': 'green'}
    )
    input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])

if __name__ == "__main__":
    demo.launch()
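
# The commented-out sketch below shows how infer() could be exercised from a
# Python shell without the Gradio UI. It is illustrative only: the file name,
# sample rate, and practice sentence are hypothetical, and it assumes the
# soundfile package is installed.
#
#   import soundfile as sf
#   wav, sr = sf.read("recording.wav", dtype="float32")  # hypothetical file
#   transcription, diff_tokens = infer((sr, wav), "Guá sī Tâi-uân-lâng.")
#   print(transcription)
#   for token, tag in diff_tokens:
#       print(repr(token), tag)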