# app.py
import gradio as gr
import warnings
import torch
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf
import ffmpeg
import os
from huggingface_hub import InferenceClient
from gradio_client import Client, file
import spaces
warnings.filterwarnings("ignore")
# Load tokenizer, model, and processor
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# Move model to device
model.to(device)
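
# Optional sketch (an assumption, not part of the original app): on a CUDA
# device, half precision roughly halves memory use; the processor outputs
# would then also need casting, e.g. inputs.to(device, torch.float16).
# if device.type == "cuda":
#     torch_dtype = torch.float16
#     model.to(dtype=torch_dtype)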

def convert_audio_format(audio_path):
    # Re-encode to 16 kHz mono WAV, the input format Whisper expects
    # (ac=1 forces mono so sf.read returns a 1-D array)
    output_path = "converted_audio.wav"
    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000', ac=1).run(overwrite_output=True)
    return output_path
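
# For reference, the ffmpeg-python call above corresponds roughly to:
#   ffmpeg -y -i <audio_path> -ar 16000 -ac 1 -f wav converted_audio.wav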

@spaces.GPU(duration=120, queue=False)
def transcribe_audio(audio_file, batch_size=4):
    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)
    chunk_size = 16000 * 28  # 28-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    # Process chunks in batches to bound peak GPU memory
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 target positions
                num_beams=7,
                attention_mask=attention_mask
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    return transcription.strip()
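
# Minimal local check (a sketch; "sample.wav" is a hypothetical file, and on
# Spaces the function is normally driven by the Gradio button below):
# if __name__ == "__main__":
#     print(transcribe_audio("sample.wav", batch_size=2))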

# Banner HTML shown at the top of the interface
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="picture" width="50%" height="auto">
</div>
"""

images_path = os.path.dirname(__file__)

# Example multimodal prompt data (currently unused by the interface below)
IMAGES = [
    [
        {
            "text": "What usual stuff happens in this image? :)",
            "files": [f"{images_path}/500x_picture.png"],
        }
    ]
]

# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# 🦾 Audio Transcription @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)

# Launch interface
iface.launch(share=True, debug=True)