import gradio as gr
import numpy as np
from difflib import Differ

# import spaces #[uncomment to use ZeroGPU]
import torch

from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"
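# The repo name suggests an XLS-R 300M wav2vec2 model fine-tuned for
# Taiwanese Southern Min (tsm) ASR with Tâi-lô output (an assumption based
# on the model id and the UI labels below).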

# Half precision on GPU cuts memory use; CPU stays in full precision.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_repo_id,
    torch_dtype=torch_dtype,
    device=device,
)

# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(
    audio,
    target,
):
    """Transcribe the recorded audio and diff it against the target sentence."""
    sampling_rate, wav = audio
    # Gradio delivers (sample_rate, int16 array); downmix stereo to mono
    # and peak-normalize, guarding against silent (all-zero) input.
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    wav = wav.astype(np.float32)
    peak = np.max(np.abs(wav))
    if peak > 0:
        wav /= peak
    # Pass the sampling rate so the pipeline can resample to the model's
    # expected rate (16 kHz for wav2vec2) instead of assuming it.
    user_pron = pipe({"sampling_rate": sampling_rate, "raw": wav})["text"]
    
    # Character-level diff of the target against the transcription: Differ
    # tags characters only in the transcription with '+', characters only
    # in the target with '-', and shared characters with ' '.
    d = Differ()
    d_toks = [
        (tok[2:], tok[0] if tok[0] != " " else None)
        for tok in d.compare(target, user_pron)
    ]
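    # For example, with target "tau" and transcription "taw", d.compare
    # yields ('  t', '  a', '- u', '+ w'), so d_toks becomes
    # [('t', None), ('a', None), ('u', '-'), ('w', '+')], which
    # HighlightedText renders using the color_map below.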
    return (user_pron, d_toks)

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

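# UI: practice-sentence box, recorder, transcription box, and a
# character-level diff. Audio's .input event fires whenever a new recording
# or upload lands, so feedback appears without a submit button.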
with gr.Blocks(css=css) as demo:
    # The #col-container rule above centers the layout; apply it here.
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# PhonoLearn")
        target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
        input_audio = gr.Audio(
            sources=["microphone", "upload"]
        )
        output = gr.Textbox(label='Your Pronunciation')
        diff = gr.HighlightedText(
            label='Comparison',
            combine_adjacent=True,
            show_legend=True,
            color_map={'+': 'red', '-': 'green'}
        )
        input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])

if __name__ == "__main__":
    demo.launch()
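
# A minimal sketch of exercising infer() without the UI (hypothetical input;
# assumes a one-second 16 kHz mono clip as a float array):
#   sr = 16000
#   wav = np.random.uniform(-1, 1, sr).astype(np.float32)  # stand-in audio
#   print(infer((sr, wav), "tâi-lô"))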