File size: 10,998 Bytes
fc4ba88 5f95ca3 fc4ba88 4a18e2f a564048 2802277 1ea2340 faf33c0 f79c23e fc4ba88 ec1d635 306f4a4 ec1d635 306f4a4 fc4ba88 daef960 3036933 d51a20c fc4ba88 76c31fd fc4ba88 3036933 fc4ba88 3036933 fc4ba88 3036933 fc4ba88 3036933 8921104 3036933 fc4ba88 c94044f e2df842 b8f37d3 ec1d635 9e3d054 d63c230 e75f078 c29ed6b e75f078 9cb6dfc e75f078 b446ea0 b7f64d1 ec1d635 f99469e ec1d635 b7f64d1 ec1d635 3036933 48be26b b446ea0 d233eb6 45d3a13 3ef4c6e c6e6055 e26865b fc4ba88 3036933 fc4ba88 db5d6fb 2aadead fc4ba88 c016131 fc4ba88 c016131 3814d7c fc4ba88 ced2c71 fc4ba88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
import os
import subprocess
import sys
from pprint import pprint

# Whisper is installed from source at start-up, so this must run before
# `import whisper` below.  pip is invoked through the running interpreter
# (`sys.executable -m pip`) with an argument list instead of a shell string,
# which guarantees the package lands in the environment that is executing this
# script.  check=False keeps the install best-effort: if it fails while
# Whisper is already present the app still starts, otherwise the import below
# reports the problem.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "git+https://github.com/openai/whisper.git",
    ],
    check=False,
)

import gradio as gr
import whisper
from transformers import pipeline
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

from share_btn import community_icon_html, loading_icon_html, share_js

# GPT-2 loaded directly so `inference` can call `generate` and read the
# per-step scores; return_dict_in_generate=True makes `generate` return a
# structured output (with `.sequences` / `.scores`) instead of a bare tensor.
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2", return_dict_in_generate=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Separate text-generation pipeline that produces the continuations returned
# to the UI.
generator = pipeline('text-generation', model='gpt2')

# Whisper speech-recognition model ("tiny" checkpoint).
model = whisper.load_model("tiny")
def buttonValues(value=None):
    """Return the fixed label ``"Hello"``.

    Args:
        value: Ignored. It is optional because the click handler in this file
            registers this callback with an empty ``inputs`` list, so the
            callback has to be callable without any argument.

    Returns:
        The string ``"Hello"``, regardless of ``value``.
    """
    return "Hello"
def _transcribe(audio_path):
    """Run Whisper on the recording at *audio_path* and return the decoding result."""
    samples = whisper.load_audio(audio_path)
    # Fit the waveform to the fixed window length Whisper decodes in one pass.
    samples = whisper.pad_or_trim(samples)
    mel = whisper.log_mel_spectrogram(samples).to(model.device)
    # No language is set in the options, so decode() identifies the language
    # on its own; a separate model.detect_language() call is not needed.
    return whisper.decode(model, mel, whisper.DecodingOptions(fp16=False))


def _log_sampled_token_probs(prompt, num_sequences=3, max_new_tokens=15):
    """Sample GPT-2 continuations of *prompt* and print each sampled token's probability.

    Debugging aid only: everything is printed, and the probability tensor of
    shape [num_sequences, generated_steps] is returned for convenience.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded.input_ids
    print("inputs ", input_ids)

    generated = gpt2.generate(
        input_ids,
        attention_mask=encoded.attention_mask,
        do_sample=True,
        num_return_sequences=num_sequences,
        output_scores=True,
        # Bound the number of *new* tokens; the default length limit also
        # counts the prompt, which the fixed instruction text nearly fills.
        max_new_tokens=max_new_tokens,
        # GPT-2 defines no padding token; reuse end-of-sequence for padding.
        pad_token_id=tokenizer.eos_token_id,
    )
    print("outputs generated ", generated.sequences)

    # Keep only the ids produced after the prompt.
    new_tokens = generated.sequences[:, input_ids.shape[-1]:]
    print("gen sequences: ", new_tokens)

    # One score tensor per generation step; stacking along dim 1 yields
    # [num_sequences, steps, vocab_size], and softmax turns scores into probs.
    step_probs = torch.stack(generated.scores, dim=1).softmax(-1)
    # Pick the probability of the token that was actually sampled at each
    # step (gather needs a trailing dummy dimension on the index tensor).
    token_probs = torch.gather(step_probs, 2, new_tokens[:, :, None]).squeeze(-1)
    print("gen probs result: ", token_probs)
    return token_probs


def inference(audio):
    """Transcribe a recording and propose GPT-2 continuations of the transcript.

    Args:
        audio: Path of the audio file to transcribe (the UI passes the
            recorded file path).

    Returns:
        A 4-tuple: the raw output of the text-generation pipeline for the
        transcript (5 sequences, up to 10 new tokens each), followed by three
        ``gr.update(visible=True)`` objects.

        NOTE(review): the click handler in this file wires a single output
        component (``text``) while four values are returned here -- confirm
        how the installed Gradio version treats the surplus values.
    """
    result = _transcribe(audio)

    # Experimental: inspect how likely GPT-2 finds its own sampled
    # continuations of an instruction-prefixed transcript (printed only).
    input_prompt = "The following is a transcript of someone talking, please predict what they will say next. \n"
    _log_sampled_token_probs(input_prompt + result.text)

    # The continuations handed back to the UI use the bare transcript.
    continuations = generator(result.text, max_new_tokens=10, num_return_sequences=5)
    return continuations, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
# Stylesheet handed to gr.Blocks(css=...).  The focus-ring declaration uses
# calc(3px + var(--tw-ring-offset-width)); calc() requires an operator between
# its operands, otherwise the --tw-ring-shadow value is invalid.
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-color: black;
background: black;
}
input[type='range'] {
accent-color: black;
}
.dark input[type='range'] {
accent-color: #dfdfdf;
}
.container {
max-width: 730px;
margin: auto;
padding-top: 1.5rem;
}
.details:hover {
text-decoration: underline;
}
.gr-button {
white-space: nowrap;
}
.gr-button:focus {
border-color: rgb(147 197 253 / var(--tw-border-opacity));
outline: none;
box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
--tw-border-opacity: 1;
--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
--tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
--tw-ring-opacity: .5;
}
.footer {
margin-bottom: 45px;
margin-top: 35px;
text-align: center;
border-bottom: 1px solid #e5e5e5;
}
.footer>p {
font-size: .8rem;
display: inline-block;
padding: 0 10px;
transform: translateY(10px);
background: white;
}
.dark .footer {
border-color: #303030;
}
.dark .footer>p {
background: #0b0f19;
}
.prompt h4{
margin: 1.25em 0 .25em 0;
font-weight: bold;
font-size: 115%;
}
.animate-spin {
animation: spin 1s linear infinite;
}
@keyframes spin {
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
#share-btn-container {
display: flex; margin-top: 1.5rem !important; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
}
#share-btn {
all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
}
#share-btn * {
all: unset;
}
"""
# Static markup for the page header (logo, title, description, Colab link).
_HEADER_HTML = """
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<svg
width="0.65em"
height="0.65em"
viewBox="0 0 115 115"
fill="none"
xmlns="http://www.w3.org/2000/svg"
>
<rect width="23" height="23" fill="white"></rect>
<rect y="69" width="23" height="23" fill="white"></rect>
<rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
<rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
<rect x="46" width="23" height="23" fill="white"></rect>
<rect x="46" y="69" width="23" height="23" fill="white"></rect>
<rect x="69" width="23" height="23" fill="black"></rect>
<rect x="69" y="69" width="23" height="23" fill="black"></rect>
<rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
<rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
<rect x="115" y="46" width="23" height="23" fill="white"></rect>
<rect x="115" y="115" width="23" height="23" fill="white"></rect>
<rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
<rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
<rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
<rect x="92" y="69" width="23" height="23" fill="white"></rect>
<rect x="69" y="46" width="23" height="23" fill="white"></rect>
<rect x="69" y="115" width="23" height="23" fill="white"></rect>
<rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
<rect x="46" y="46" width="23" height="23" fill="black"></rect>
<rect x="46" y="115" width="23" height="23" fill="black"></rect>
<rect x="46" y="69" width="23" height="23" fill="black"></rect>
<rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
<rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
<rect x="23" y="69" width="23" height="23" fill="black"></rect>
</svg>
<h1 style="font-weight: 900; margin-bottom: 7px;">
Whisper
</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. This demo cuts audio after around 30 secs.
</p>
<p>You can skip the queue by using google colab for the space: <a href="https://colab.research.google.com/drive/1WJ98KHgZxFGrHiMm4TyWZllSew_Af_ff?usp=sharing"><img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667"></a></p>
</div>
"""

# Static markup for the attribution footer.
_FOOTER_HTML = """
<div class="footer">
<p>Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - Gradio Demo by 🤗 Hugging Face
</p>
</div>
"""

# Page layout: header, a recorder row with the "Transcribe" button, the result
# textbox, an extra (blank-labelled) button, and the footer.
# NOTE(review): nesting below is reconstructed from the order of the calls;
# confirm it matches the intended layout.
with gr.Blocks(css=css) as block:
    gr.HTML(_HEADER_HTML)

    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # Microphone recording, handed to the callback as a file path.
                audio = gr.Audio(
                    label="Input Audio",
                    show_label=False,
                    source="microphone",
                    type="filepath",
                )
                btn = gr.Button("Transcribe")

        text = gr.Textbox(show_label=False, elem_id="result-textarea")
        buttonV = gr.Button(" ")

        # Event wiring: the blank button takes no inputs and updates nothing;
        # "Transcribe" feeds the recording to `inference` and shows the result
        # in the textbox.
        buttonV.click(buttonValues, inputs=[], outputs=[])
        btn.click(inference, inputs=[audio], outputs=[text])

    gr.HTML(_FOOTER_HTML)

block.launch()