import gradio as gr
import warnings
import torch
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf


warnings.filterwarnings("ignore")

# Load tokenizer, model, and processor
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
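# Note: WhisperProcessor already wraps the feature extractor and tokenizer,
# so the separately loaded tokenizer above is not used directly below.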

# set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
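# Note: torch_dtype is defined but never passed anywhere below, so the model keeps its default float32 weights.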

# move model to device
model.to(device)


def transcribe_audio(audio_file):
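    """Transcribe an uploaded audio file with NB-Whisper.

    The audio is read with soundfile, split into ~28-second chunks, and each
    chunk is transcribed separately; the partial transcriptions are concatenated.
    """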
    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 28  # 28-second chunks (at 16 kHz); this length seems to work best
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for chunk in chunks:
        inputs = processor(chunk, sampling_rate=16000, return_tensors="pt")
        inputs = inputs.to(device)
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=1024,  # Increase max_length for longer outputs
                num_beams=5,
                task="transcribe",
                language="no"
            )
        transcription += processor.batch_decode(output, skip_special_tokens=True)[0] + " "

    return transcription.strip()
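
# Note: the chunking above assumes the uploaded audio is already sampled at 16 kHz,
# which is what Whisper's feature extractor expects. A minimal sketch of resampling
# first (assumes librosa is installed; not part of the original app):
#
#   if sample_rate != 16000:
#       import librosa
#       audio_input = librosa.resample(audio_input, orig_sr=sample_rate, target_sr=16000)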

# HTML for the banner image
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" style="width:87%; height:auto;">
</div>
"""

# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# Nvidia A100👋🏼👾🦾⚡☕🧑🏼‍🏫@{NbAiLab/whisper-norwegian-medium}\nUpload audio file (*needs to be in .mp3 format before upload*)")
    audio_input = gr.Audio(type="filepath")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")

    transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)

# Launch interface
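# share=True serves the app on a temporary public URL; debug=True prints errors and tracebacks to the console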
iface.launch(share=True, debug=True)