Spaces:
Running
Running
File size: 3,111 Bytes
aca9f3d feb2a2b adca0d8 feb2a2b adca0d8 6b93fd2 feb2a2b 1c6b627 adca0d8 1c6b627 adca0d8 1c6b627 adca0d8 1c6b627 adca0d8 1c6b627 feb2a2b 6b93fd2 feb2a2b 44daa8d adca0d8 feb2a2b 44daa8d adca0d8 3702096 44daa8d d71b5df 44daa8d feb2a2b 6b93fd2 5b4ea6e abfa68a 5b4ea6e feb2a2b 1c6b627 adca0d8 1c6b627 adca0d8 1c6b627 44daa8d adca0d8 44daa8d adca0d8 feb2a2b adca0d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
from pprint import pformat
import gradio as gr
import librosa
from huggingface_hub import hf_hub_download
from pipeline import PreTrainedPipeline
# Hugging Face Hub repo id of the fine-tuned wav2vec2 acoustic model.
HF_HUB_URL = "ales/wav2vec2-cv-be"
# Path (inside the repo above) of the KenLM binary used for beam-search decoding.
LM_HUB_FP = "language_model/cv8be_5gram.bin"
# Sampling rate the model was trained on; audio is resampled to this on load.
MODEL_SAMPLING_RATE = 16_000 # 16kHz
# download Language Model from HF Hub (cached locally by huggingface_hub)
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
# init pipeline (project-local class; loads acoustic model + attaches the LM decoder)
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
def main(recorded_audio_fp: str | None, uploaded_audio_fp: str | None):
audio_fp = None
if recorded_audio_fp is not None:
audio_fp = recorded_audio_fp
used_audiofile = "recorded"
elif uploaded_audio_fp is not None:
audio_fp = uploaded_audio_fp
used_audiofile = "uploaded"
else:
return (
"Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.",
"Error! You have to either record or upload an audiofile.",
)
# read audio file
inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]
# recognize speech
pipeline_res = pipeline(inputs=inputs)
text = pipeline_res["text"][0] # unpack batch of size 1
# add technical information to the output
tech_data = pipeline_res
del tech_data["text"]
tech_data["used_audiofile"] = used_audiofile
tech_data["recorded_file_present"] = recorded_audio_fp is not None
tech_data["uploaded_file_present"] = uploaded_audio_fp is not None
tech_data["audiofile_path"] = audio_fp
tech_data["model_sampling_rate"] = MODEL_SAMPLING_RATE
tech_data["inputs_shape"] = inputs.shape
tech_data["inputs_max"] = inputs.max().item()
tech_data["inputs_min"] = inputs.min().item()
tech_data_str = pformat(tech_data)
return text, tech_data_str
# Markdown shown below the interface; links to the model card on the HF Hub.
article = """
The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)

"""
# Build the Gradio UI: two filepath audio inputs (microphone / upload) feed
# `main`, which returns the transcript and a technical-details string.
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Audio(
            sources=["microphone"],
            type="filepath",  # hand `main` a path, not raw samples
            label="Запішыце аўдыяфайл, каб распазнаць маўленьне",
        ),
        gr.Audio(
            sources=["upload"],
            type="filepath",
            label="Альбо загрузіце ўжо запісаны аўдыяфайл сюды",
        ),
    ],
    outputs=[
        gr.Textbox(label="Распазнаны тэкст"),
        gr.Textbox(label="Тэхнічная інфармацыя"),
    ],
    title="wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model",
    description=(
        "Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n"
        "Акустычная мадэль + моўная мадэль."
    ),
    article=article,
)
# Start the web server (blocks until the app is stopped).
iface.launch()
|