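# Live German speech-to-text demo.
# Gradio streams microphone audio in short chunks; Silero VAD decides whether a
# chunk contains speech, and a German wav2vec2 model transcribes the audio
# accumulated for the current utterance. A silent chunk finalises the utterance.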
from transformers import pipeline
import torch
import gradio as gr
import librosa
import numpy as np


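# German ASR pipeline (a wav2vec2 model fine-tuned on German).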
p = pipeline("automatic-speech-recognition", model="aware-ai/wav2vec2-base-german")
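# Silero voice activity detection (VAD) model and its helper utilities.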
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')

(get_speech_timestamps,
 _, read_audio,
 *_) = utils

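# Return True if Silero VAD finds at least one speech segment in the chunk.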
def is_speech(wav, sr):
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
    return len(speech_timestamps) > 0

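# Called by Gradio for every streamed microphone chunk. The state dict tracks:
#   "text"      - transcript of utterances already finalised (ended by silence)
#   "temp_text" - live transcript of the utterance currently being spoken
#   "audio"     - accumulated 16 kHz samples of the current utterance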
def transcribe(audio, state=None):
    # Avoid a mutable default argument; Gradio passes None as the state on the first call.
    if state is None:
        state = {"text": "", "temp_text": "", "audio": None}
    # Resample to 16 kHz, the rate expected by both wav2vec2 and Silero VAD.
    wav_data, sr = librosa.load(audio, sr=16000)
    speech = is_speech(wav_data, sr)
    if speech:
        if state["audio"] is None:
            state["audio"] = wav_data
        else:
            state["audio"] = np.concatenate((state["audio"], wav_data))

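        # Re-run ASR over the whole accumulated utterance so the partial transcript stays coherent.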
        text = p(state["audio"])["text"] + "\n"
        state["temp_text"] = text
    else:
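        # Silence detected: commit the pending transcript and reset the audio buffer.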
        state["text"] += state["temp_text"]
        state["temp_text"] = ""
        state["audio"] = None

    return f'{state["text"]} ( {state["temp_text"]} )', state

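# Live interface: the microphone component streams audio chunks to `transcribe`;
# the "state" input/output carries the transcript dict between calls.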
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath"),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()