|
import gradio as gr |
|
import librosa |
|
import numpy as np |
|
import paddlehub as hub |
|
from paddlenlp import Taskflow |
|
from paddlespeech.cli import ASRExecutor |
|
import soundfile as sf |
|
|
|
|
|
# Models are built once at import time; speech_recognize() reuses these
# module-level instances on every call.

# Speech-to-text executor from the PaddleSpeech CLI package.
asr_executor = ASRExecutor()

# PaddleNLP Taskflow pipeline for the "text_correction" task.
text_correct_model = Taskflow("text_correction")

# PaddleHub module used to add punctuation to the corrected text.
punc_model = hub.Module(name="auto_punc")
|
|
|
|
|
def speech_recognize(file):
    """Transcribe an audio file, then correct and punctuate the transcript.

    The audio is loaded at 16 kHz and written back over ``file`` so that the
    path handed to ``asr_executor`` holds the resampled audio. The transcript
    is then passed through ``text_correct_model`` and ``punc_model``.

    Args:
        file: Path to the audio file to transcribe. ``None`` is accepted
            (presumably what the Gradio audio input sends when nothing was
            recorded -- confirm against the Gradio version in use).

    Returns:
        A three-line string with the raw ASR text (``[ASR]``), the corrected
        text (``[COR]``) and the punctuated text (``[PUN]``); an empty string
        when ``file`` is ``None``.
    """
    if file is None:
        # Nothing to transcribe; avoid failing inside librosa.load(None).
        return ''

    target_sr = 16000

    # Load directly at the target rate. Without an explicit ``sr`` librosa
    # resamples to its 22050 Hz default, which would require a second, lossy
    # resample to reach 16 kHz.
    data, _ = librosa.load(file, sr=target_sr)

    # The ASR executor is given a path, not samples, so persist the
    # resampled audio back to the same file.
    sf.write(file, data, samplerate=target_sr)

    print(f'[Audio Input] shape: {data.shape}, dtype: {data.dtype}, file: {file}')

    text = asr_executor(file)

    # Only the first correction result is used.
    correction = text_correct_model(text)[0]
    cor_text, errors = correction['target'], correction['errors']
    print(f'[Text Correction] errors: {errors}')

    punc_text = punc_model.add_puncs(cor_text, device='cpu')[0]

    # Return every pipeline stage so the correction and punctuation results
    # actually reach the caller instead of being computed and dropped.
    return '\n'.join((
        f'[ASR] {text}',
        f'[COR] {cor_text}',
        f'[PUN] {punc_text}',
    ))
|
|
|
|
|
# Audio is captured from the microphone and handed to the handler as a
# file path (type='filepath').
_mic_input = gr.Audio(source="microphone", type='filepath')

# Web UI wiring: one audio input -> speech_recognize -> one text output.
iface = gr.Interface(fn=speech_recognize, inputs=_mic_input, outputs="text")

# Start the Gradio server as soon as the script runs.
iface.launch()
|
|