File size: 3,111 Bytes
aca9f3d
feb2a2b
 
adca0d8
 
feb2a2b
 
 
adca0d8
 
6b93fd2
feb2a2b
1c6b627
 
 
 
 
 
 
adca0d8
1c6b627
 
 
adca0d8
1c6b627
 
adca0d8
1c6b627
 
adca0d8
 
1c6b627
feb2a2b
6b93fd2
 
feb2a2b
 
44daa8d
adca0d8
feb2a2b
44daa8d
 
adca0d8
 
 
 
 
 
 
 
 
3702096
44daa8d
d71b5df
44daa8d
feb2a2b
6b93fd2
5b4ea6e
 
 
abfa68a
5b4ea6e
feb2a2b
 
 
1c6b627
adca0d8
 
 
 
1c6b627
adca0d8
 
 
 
1c6b627
 
44daa8d
adca0d8
 
44daa8d
adca0d8
 
 
 
 
 
feb2a2b
 
adca0d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from pprint import pformat

import gradio as gr
import librosa
from huggingface_hub import hf_hub_download

from pipeline import PreTrainedPipeline

# Hugging Face Hub repo id; used both as the model path for the pipeline and
# as the repo from which the language model file is downloaded
HF_HUB_URL = "ales/wav2vec2-cv-be"
# path of the language model file inside that Hub repo
LM_HUB_FP = "language_model/cv8be_5gram.bin"
# sampling rate passed to the audio loader before recognition
MODEL_SAMPLING_RATE = 16_000  # 16kHz

# download Language Model from HF Hub; `lm_fp` holds whatever hf_hub_download
# returns (per huggingface_hub docs: the local path of the downloaded file)
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

# init pipeline once, at import time, from the model repo id and the
# downloaded language model file
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)


def main(recorded_audio_fp: str | None, uploaded_audio_fp: str | None):
    audio_fp = None
    if recorded_audio_fp is not None:
        audio_fp = recorded_audio_fp
        used_audiofile = "recorded"
    elif uploaded_audio_fp is not None:
        audio_fp = uploaded_audio_fp
        used_audiofile = "uploaded"
    else:
        return (
            "Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.",
            "Error! You have to either record or upload an audiofile.",
        )

    # read audio file
    inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]

    # recognize speech
    pipeline_res = pipeline(inputs=inputs)
    text = pipeline_res["text"][0]  # unpack batch of size 1

    # add technical information to the output
    tech_data = pipeline_res
    del tech_data["text"]
    tech_data["used_audiofile"] = used_audiofile
    tech_data["recorded_file_present"] = recorded_audio_fp is not None
    tech_data["uploaded_file_present"] = uploaded_audio_fp is not None
    tech_data["audiofile_path"] = audio_fp
    tech_data["model_sampling_rate"] = MODEL_SAMPLING_RATE
    tech_data["inputs_shape"] = inputs.shape
    tech_data["inputs_max"] = inputs.max().item()
    tech_data["inputs_min"] = inputs.min().item()

    tech_data_str = pformat(tech_data)

    return text, tech_data_str


# Markdown rendered with the interface: a link to the model and a
# page-visits badge.
article = """
The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)

![Page Visits](https://visitor-badge.glitch.me/badge?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm&left_color=darkgray&right_color=crimson&left_text=Page%20Visits)
"""

# Two audio inputs, both configured with type="filepath"; their order matches
# the order of `main`'s parameters (recorded first, uploaded second).
microphone_input = gr.Audio(
    sources=["microphone"],
    type="filepath",
    label="Запішыце аўдыяфайл, каб распазнаць маўленьне",
)
upload_input = gr.Audio(
    sources=["upload"],
    type="filepath",
    label="Альбо загрузіце ўжо запісаны аўдыяфайл сюды",
)

# Two text outputs, matching the pair of values returned by `main`.
recognized_text_output = gr.Textbox(label="Распазнаны тэкст")
tech_info_output = gr.Textbox(label="Тэхнічная інфармацыя")

iface = gr.Interface(
    fn=main,
    inputs=[microphone_input, upload_input],
    outputs=[recognized_text_output, tech_info_output],
    title="wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model",
    description=(
        "Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n"
        "Акустычная мадэль + моўная мадэль."
    ),
    article=article,
)

# start serving the interface
iface.launch()