Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -3,8 +3,7 @@ import warnings
|
|
3 |
import torch
|
4 |
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
|
5 |
import soundfile as sf
|
6 |
-
|
7 |
-
import os
|
8 |
|
9 |
|
10 |
warnings.filterwarnings("ignore")
|
@@ -21,21 +20,27 @@ torch_dtype = torch.float32
|
|
21 |
# move model to device
|
22 |
model.to(device)
|
23 |
|
24 |
-
|
25 |
def transcribe_audio(audio_file):
|
26 |
-
audio_input,
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
# HTML for banner image
|
41 |
banner_html = """
|
@@ -49,7 +54,7 @@ iface = gr.Blocks()
|
|
49 |
|
50 |
with iface:
|
51 |
gr.HTML(banner_html)
|
52 |
-
gr.Markdown("#
|
53 |
audio_input = gr.Audio(type="filepath")
|
54 |
transcription_output = gr.Textbox()
|
55 |
transcribe_button = gr.Button("Transcribe")
|
@@ -57,5 +62,4 @@ with iface:
|
|
57 |
transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
|
58 |
|
59 |
# Launch the interface
|
60 |
-
iface.launch(share=True, debug=True)
|
61 |
-
|
|
|
3 |
import torch
|
4 |
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
|
5 |
import soundfile as sf
|
6 |
+
from huggingface_hub import spaces
|
|
|
7 |
|
8 |
|
9 |
warnings.filterwarnings("ignore")
|
|
|
20 |
# move model to device
|
21 |
model.to(device)
|
22 |
|
23 |
+
|
24 |
# Sample rate passed to the processor for every chunk.
_TARGET_SAMPLE_RATE = 16000
# 28 seconds chunks (seems to work best)
_CHUNK_SECONDS = 28


def _to_mono_target_rate(audio, sample_rate):
    """Return *audio* as a 1-D array sampled at ``_TARGET_SAMPLE_RATE``.

    Args:
        audio: Array returned by ``sf.read``; 1-D for mono files, or 2-D
            ``(frames, channels)`` for multi-channel files.
        sample_rate: Sample rate reported by ``sf.read`` for *audio*.

    Returns:
        A 1-D array at ``_TARGET_SAMPLE_RATE`` Hz.
    """
    if audio.ndim > 1:
        # Down-mix by averaging channels so the processor sees one waveform.
        audio = audio.mean(axis=1)
    if sample_rate != _TARGET_SAMPLE_RATE:
        # Imported lazily: only needed when the file is not already 16 kHz.
        from math import gcd

        from scipy.signal import resample_poly

        divisor = gcd(_TARGET_SAMPLE_RATE, sample_rate)
        audio = resample_poly(
            audio,
            _TARGET_SAMPLE_RATE // divisor,
            sample_rate // divisor,
        )
    return audio


def transcribe_audio(audio_file):
    """Transcribe an audio file chunk by chunk and return the joined text.

    The file is read with ``sf.read``, down-mixed to mono and resampled to
    ``_TARGET_SAMPLE_RATE`` so that the chunk size and the ``sampling_rate``
    given to the processor match the actual data. Each chunk is decoded with
    beam search and the per-chunk texts are joined with single spaces.

    Args:
        audio_file: Path of the audio file to transcribe. A falsy value
            (``None`` or ``""``, e.g. nothing was uploaded) yields ``""``.

    Returns:
        The transcription with leading/trailing whitespace stripped.
    """
    if not audio_file:
        # Nothing to read; avoid crashing inside sf.read.
        return ""

    audio_input, sample_rate = sf.read(audio_file)
    audio_input = _to_mono_target_rate(audio_input, sample_rate)

    chunk_size = _TARGET_SAMPLE_RATE * _CHUNK_SECONDS
    pieces = []
    for start in range(0, len(audio_input), chunk_size):
        chunk = audio_input[start:start + chunk_size]
        inputs = processor(
            chunk,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
        )
        inputs = inputs.to(device)
        with torch.no_grad():
            # NOTE(review): max_length=2048 is kept from the original;
            # confirm it is within the checkpoint's decoder position limit.
            output = model.generate(
                inputs.input_features,
                max_length=2048,  # Increase max_length for longer outputs
                num_beams=10,
                task="transcribe",
                language="no",
            )
        pieces.append(processor.batch_decode(output, skip_special_tokens=True)[0])

    return " ".join(pieces).strip()
|
44 |
|
45 |
# HTML for banner image
|
46 |
banner_html = """
|
|
|
54 |
|
55 |
with iface:
|
56 |
gr.HTML(banner_html)
|
57 |
+
gr.Markdown("# Ola's AudioSwitch2Go ππ§βπ§πΌβπ«@{NbAiLab/whisper-norwegian-medium}\nUpload audio file (if .ma4 ~simply rename it to .mp3 before upload)")
|
58 |
audio_input = gr.Audio(type="filepath")
|
59 |
transcription_output = gr.Textbox()
|
60 |
transcribe_button = gr.Button("Transcribe")
|
|
|
62 |
transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
|
63 |
|
64 |
# Launch the interface
|
65 |
+
iface.launch(share=True, debug=True)
|
|