# app.py

import gradio as gr
import warnings
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf
import ffmpeg
import os
import spaces

warnings.filterwarnings("ignore")

# Load model and processor (the processor bundles the tokenizer and feature extractor)
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
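# NbAiLabBeta/nb-whisper-medium is the National Library of Norway's Norwegian
# fine-tune of Whisper medium: https://huggingface.co/NbAiLabBeta/nb-whisper-medium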

# Set up device and dtype
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Move model to device in the chosen dtype
model.to(device, dtype=torch_dtype)
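# Optional sketch (not part of the original app): on a CUDA GPU you could set
# torch_dtype = torch.float16 to roughly halve memory use; the input_features
# passed to generate() would then need a matching dtype cast.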

def convert_audio_format(audio_path):
    """Resample the input to 16 kHz mono WAV, the format Whisper expects."""
    output_path = "converted_audio.wav"
    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000', ac=1).run(overwrite_output=True)
    return output_path


@spaces.GPU(duration=120, queue=False)
def transcribe_audio(audio_file, batch_size=4):
    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)

    # Split the audio into 28-second chunks, just under Whisper's 30-second window
    chunk_size = 16000 * 28
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        # Pad every chunk to Whisper's fixed 30-second input length
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding="max_length").to(device)
        attention_mask = inputs.attention_mask if "attention_mask" in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder accepts at most 448 tokens
                num_beams=7,
                attention_mask=attention_mask
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    return transcription.strip()
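
# Quick local check without the UI (hypothetical file name):
#   print(transcribe_audio("sample.mp3", batch_size=2))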

# Banner HTML shown at the top of the interface
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="picture" width="50%" height="auto">
</div>
"""

# Example image prompt (defined here but not wired into the interface below)
images_path = os.path.dirname(__file__)
IMAGES = [
    [
        {
            "text": "What usual stuff happens in this image? :)",
            "files": [f"{images_path}/500x_picture.png"],
        }
    ]
]

# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# 𝐍𝐯𝐒𝐝𝐒𝐚 π€πŸπŸŽπŸŽ πŸ‘‹πŸΌπŸ‘ΎπŸ¦Ύβš‘ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:β˜•")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")

    transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)

# Launch interface (share=True is ignored when the app runs on Hugging Face Spaces)
iface.launch(share=True, debug=True)