File size: 2,345 Bytes
aca9f3d
feb2a2b
 
 
51f7123
 
feb2a2b
 
 
 
 
 
 
 
 
ded23d4
51f7123
 
 
 
 
 
feb2a2b
 
 
 
 
 
 
 
44daa8d
 
feb2a2b
44daa8d
 
 
 
 
51f7123
 
3702096
44daa8d
d71b5df
44daa8d
feb2a2b
5b4ea6e
 
 
a2498ab
5b4ea6e
feb2a2b
 
 
ded23d4
d71b5df
1022fd5
ded23d4
44daa8d
1022fd5
 
44daa8d
cff8d27
 
 
1022fd5
5b4ea6e
feb2a2b
 
5b4ea6e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from pprint import pformat

from huggingface_hub import hf_hub_download

import datasets as hfd

import gradio as gr

from pipeline import PreTrainedPipeline


# Hugging Face Hub repository id; used both as the `repo_id` for downloading
# the language model and as the `model_path` of the recognition pipeline.
HF_HUB_URL = "ales/wav2vec2-cv-be"

# Path of the language model file inside the repository above.
LM_HUB_FP = "language_model/cv8be_5gram.bin"


# Lazily-created recognition pipeline shared by every call to `main`, so the
# model and the language model are loaded once instead of once per request.
_PIPELINE = None


def _get_pipeline():
    """Return the shared recognition pipeline, building it on first use."""
    global _PIPELINE
    if _PIPELINE is None:
        # download Language Model from HF Hub
        lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

        # init pipeline
        # NOTE(review): assumes PreTrainedPipeline keeps no per-request state
        # and is safe to reuse between calls - confirm against pipeline.py.
        _PIPELINE = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    return _PIPELINE


def main(audio_fp: str):
    """Recognize speech in an audio file and report technical details.

    Args:
        audio_fp: Path to the audio file to transcribe. A falsy value
            (``None`` or an empty string) means no audio was provided.

    Returns:
        A ``(text, tech_data_str)`` tuple: the recognized text and a
        pretty-printed string with the remaining pipeline output plus
        basic statistics of the input signal. When no audio was provided
        the text is empty and the second element describes the problem.
    """
    if not audio_fp:
        # Nothing was recorded/uploaded: report it explicitly instead of
        # failing with an opaque error while decoding a missing file.
        return '', pformat({'error': 'no audio was provided'})

    # read and preprocess audio with huggingface.datasets
    ds = hfd.Dataset.from_dict({'path': [audio_fp]})
    ds = ds.cast_column('path', hfd.Audio(sampling_rate=16_000, mono=True))
    ds = ds.rename_column('path', 'audio')
    audio = ds[0]['audio']  # index the row once instead of once per field
    inputs = audio['array']
    sampling_rate = audio['sampling_rate']

    # recognize speech
    pipeline_res = _get_pipeline()(inputs=inputs)
    text = pipeline_res['text'][0]  # unpack batch of size 1

    # add technical information to the output
    # (the pipeline result is reused in place; only its 'text' entry is dropped)
    tech_data = pipeline_res
    del tech_data['text']
    # NOTE(review): presumably this is the rate after the 16 kHz cast above,
    # not the rate of the original recording, despite the key name - confirm.
    tech_data['sampling_rate_orig'] = sampling_rate
    tech_data['inputs_shape'] = inputs.shape
    tech_data['inputs_max'] = inputs.max().item()
    tech_data['inputs_min'] = inputs.min().item()

    tech_data_str = pformat(tech_data)

    return text, tech_data_str

# Markdown shown through the `article` argument of the Gradio interface:
# a link to the model card followed by a visitor-counter badge.
article = (
    "\n"
    "The model used can be found here: "
    "[ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)\n"
    "\n"
    "![visitors](https://visitor-badge.glitch.me/badge"
    "?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm"
    "&left_color=darkgray&right_color=crimson)\n"
)

# Input widget: audio recorded from the microphone, handed to `main` as a file path.
audio_input = gr.inputs.Audio(
    source='microphone',
    type='filepath',
    label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
)

# Output widgets, in the order of the values returned by `main`:
# the recognized text first, then the technical information string.
recognized_text_box = gr.outputs.Textbox(type='str', label='Распазнаны тэкст')
tech_info_box = gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')

# Short description of the demo displayed by the interface.
description = (
    'Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
    'Акустычная мадэль + моўная мадэль.'
)

# Assemble the web UI around `main`.
iface = gr.Interface(
    fn=main,
    inputs=audio_input,
    outputs=[recognized_text_box, tech_info_box],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=description,
    article=article,
)

# Start serving the interface with request queueing enabled.
iface.launch(enable_queue=True)