Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -37,6 +37,14 @@ import gradio as gr
|
|
37 |
from fpdf import FPDF
|
38 |
from PIL import Image
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# Suppress warnings
|
41 |
warnings.filterwarnings("ignore")
|
42 |
|
@@ -104,34 +112,25 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
104 |
if chunk_waveform.shape[0] > 1:
|
105 |
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
106 |
|
107 |
-
# Process chunk with tokenizer
|
108 |
-
inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
|
109 |
-
input_features = inputs.input_features
|
110 |
-
|
111 |
-
# Create attention mask
|
112 |
-
attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
|
113 |
-
|
114 |
-
# -- does not output input_ids (i.e, processor)
|
115 |
-
# input_ids = inputs['input_ids']
|
116 |
-
# attention_mask[input_ids == processor.tokenizer.pad_token_id] = 0
|
117 |
|
|
|
|
|
118 |
|
119 |
-
#
|
120 |
-
attention_mask
|
121 |
-
|
122 |
|
123 |
# ASR model inference on the chunk
|
124 |
with torch.no_grad():
|
125 |
generated_ids = model.generate(
|
126 |
-
input_features=input_features.to(device),
|
127 |
-
attention_mask=attention_mask
|
128 |
**generate_kwargs
|
129 |
-
|
|
|
|
|
|
|
|
|
130 |
|
131 |
-
# new processor object with desired configuration
|
132 |
-
#new_processor = processor.add_special_tokens({'eos_token': '[EOS]'})
|
133 |
-
chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
134 |
-
full_text.append(chunk_text)
|
135 |
|
136 |
# Combine the transcribed text from all chunks
|
137 |
text = " ".join(full_text)
|
@@ -269,8 +268,8 @@ def save_to_pdf(text, summary):
|
|
269 |
iface = gr.Blocks()
|
270 |
|
271 |
with iface:
|
272 |
-
gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt=""
|
273 |
-
gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
|
274 |
|
275 |
with gr.Tabs():
|
276 |
with gr.TabItem("Transcription"):
|
|
|
37 |
from fpdf import FPDF
|
38 |
from PIL import Image
|
39 |
|
40 |
+
|
41 |
+
PLACEHOLDER = """
|
42 |
+
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
|
43 |
+
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93; ">
|
44 |
+
<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktøysett no.1</h1>
|
45 |
+
<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
|
46 |
+
</div>
|
47 |
+
"""
|
48 |
# Suppress warnings
|
49 |
warnings.filterwarnings("ignore")
|
50 |
|
|
|
112 |
if chunk_waveform.shape[0] > 1:
|
113 |
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
+
# Tokenize the input batch with the processor
|
117 |
+
inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", task="transcribe")
|
118 |
|
119 |
+
# Use the attention mask directly from the tokenizer output
|
120 |
+
attention_mask = inputs.attention_mask.to(device)
|
|
|
121 |
|
122 |
# ASR model inference on the chunk
|
123 |
with torch.no_grad():
|
124 |
generated_ids = model.generate(
|
125 |
+
input_features=inputs.input_features.to(device),
|
126 |
+
attention_mask=attention_mask,
|
127 |
**generate_kwargs
|
128 |
+
)
|
129 |
+
|
130 |
+
# Decode the generated IDs to text
|
131 |
+
chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
132 |
+
full_text.append(chunk_text)
|
133 |
|
|
|
|
|
|
|
|
|
134 |
|
135 |
# Combine the transcribed text from all chunks
|
136 |
text = " ".join(full_text)
|
|
|
268 |
iface = gr.Blocks()
|
269 |
|
270 |
with iface:
|
271 |
+
#gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.55;">')
|
272 |
+
#gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
|
273 |
|
274 |
with gr.Tabs():
|
275 |
with gr.TabItem("Transcription"):
|