File size: 2,578 Bytes
12baae1
8c5a4c5
12b6ee7
 
8c5a4c5
 
 
 
 
12b6ee7
 
 
 
 
 
8c5a4c5
 
12b6ee7
8c5a4c5
12b6ee7
 
 
 
 
 
8c5a4c5
12b6ee7
8c5a4c5
 
12b6ee7
8c5a4c5
12b6ee7
8c5a4c5
 
 
 
0e268b1
 
 
 
 
 
 
 
 
 
12b6ee7
 
 
0e268b1
12b6ee7
 
 
 
 
0e268b1
12b6ee7
 
 
 
 
8c5a4c5
 
12b6ee7
 
8c5a4c5
 
 
 
12b6ee7
 
 
8c5a4c5
 
12b6ee7
 
 
 
 
 
 
0e268b1
12b6ee7
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import tempfile
from urllib.parse import parse_qs, quote, urlparse

import gradio as gr
import pytube as pt
import torch
from transformers import pipeline

# Checkpoint identifier handed to the speech-recognition pipeline below.
MODEL_NAME = "openai/whisper-small"

# Run on the GPU when torch reports CUDA support, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One shared pipeline instance, created at import time and reused by every
# request handler in this file.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)


def transcribe(microphone, file_upload):
    """Transcribe audio from the microphone or from an uploaded file.

    Args:
        microphone: Audio from the microphone input, or ``None`` if unused.
        file_upload: Audio from the upload input, or ``None`` if unused.

    Returns:
        The transcribed text. When both inputs are supplied the microphone
        audio is used and the text is prefixed with a warning. When neither
        is supplied an error message string is returned instead.
    """
    # Guard clause: nothing to transcribe.
    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if microphone is not None:
        # The microphone recording always takes precedence over the upload.
        audio_source = microphone
        if file_upload is not None:
            notice = (
                "WARNING: You've uploaded an audio file and used the microphone. "
                "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
            )
    else:
        audio_source = file_upload

    transcription = pipe(audio_source)["text"]
    return notice + transcription


def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        '<center><iframe width="500" height="320" src="https://www.youtube.com/embed/'
        + video_id
        + '"></iframe></center>'
    )
    return HTML_str


def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed_str, text)`` tuple: the HTML embed snippet for the
        video and the transcribed text.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    # `.first()` yields None instead of raising a bare IndexError when the
    # filtered stream list is empty.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream is available for {yt_url!r}")

    # Download into a per-call temporary directory so concurrent requests do
    # not overwrite each other's file, and so the audio is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Use the path returned by pytube rather than assuming where the file
        # was written.
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(audio_path)["text"]

    return html_embed_str, text


# Top-level container; the tabbed UI is attached to it further down the file.
demo = gr.Blocks()

# Interface for the microphone / file-upload tab, backed by `transcribe`.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        # Both inputs are optional and configured with type="filepath";
        # their order matches transcribe(microphone, file_upload).
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Audio Transcribe",
    description="Transcribe long audio/ microphone input (powered by 🤗transformers) with a click of a button!",
    allow_flagging="never",
)

# Interface for the YouTube tab.
# NOTE: this assignment rebinds the module-level name `yt_transcribe` from the
# function defined earlier to the Interface object. `fn=yt_transcribe` is
# evaluated before the rebinding, so the Interface still wraps the function,
# but after this statement the function is no longer reachable by that name.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.inputs.Textbox(
            lines=1, placeholder="Paste a URL to YT video here", label="yt_url"
        )
    ],
    # Two outputs, matching the (html_embed_str, text) tuple the function returns.
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper YT Transcribe",
    description="Transcribe long YouTube videos (powered by 🤗transformers) with a click of a button!",
    allow_flagging="never",
)

# Attach both interfaces to the Blocks container as two tabs; the tab titles
# are given in the same order as the interfaces.
with demo:
    gr.TabbedInterface(
        [mf_transcribe, yt_transcribe], ["Audio Transcribe", "YouTube Transcribe"]
    )

# NOTE(review): the app is launched at import time (there is no
# `if __name__ == "__main__":` guard), with enable_queue=True.
demo.launch(enable_queue=True)