Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -85,7 +85,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
85 |
|
86 |
# Resample the audio to 16000 Hz if it’s not already
|
87 |
if sample_rate != 16000:
|
88 |
-
resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
89 |
waveform = resampler(waveform)
|
90 |
sample_rate = 16000
|
91 |
|
@@ -105,11 +105,11 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
105 |
if chunk_waveform.shape[0] > 1:
|
106 |
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
107 |
|
108 |
-
# Process the chunk
|
109 |
-
|
|
|
110 |
|
111 |
-
#
|
112 |
-
input_features = audio_input.input_features
|
113 |
attention_mask = torch.ones(input_features.shape, dtype=torch.long)
|
114 |
|
115 |
# ASR model inference on the chunk
|
@@ -136,7 +136,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
136 |
# Format of the result
|
137 |
result = (
|
138 |
f"Time taken: {output_time:.2f} seconds\n"
|
139 |
-
f"Audio duration: {audio_duration / 60:.2f
|
140 |
f"Real-time Factor (RTF): {rtf:.2f}\n"
|
141 |
f"Number of words: {len(text.split())}\n\n"
|
142 |
"Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
|
@@ -259,7 +259,7 @@ def save_to_pdf(text, summary):
|
|
259 |
iface = gr.Blocks()
|
260 |
|
261 |
with iface:
|
262 |
-
gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/
|
263 |
gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
|
264 |
|
265 |
with gr.Tabs():
|
|
|
85 |
|
86 |
# Resample the audio to 16000 Hz if it’s not already
|
87 |
if sample_rate != 16000:
|
88 |
+
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
89 |
waveform = resampler(waveform)
|
90 |
sample_rate = 16000
|
91 |
|
|
|
105 |
if chunk_waveform.shape[0] > 1:
|
106 |
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
107 |
|
108 |
+
# Process the chunk with the processor to extract the model's input features
|
109 |
+
inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
|
110 |
+
input_features = inputs.input_features
|
111 |
|
112 |
+
# Explicitly set the attention mask
|
|
|
113 |
attention_mask = torch.ones(input_features.shape, dtype=torch.long)
|
114 |
|
115 |
# ASR model inference on the chunk
|
|
|
136 |
# Format of the result
|
137 |
result = (
|
138 |
f"Time taken: {output_time:.2f} seconds\n"
|
139 |
+
f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
|
140 |
f"Real-time Factor (RTF): {rtf:.2f}\n"
|
141 |
f"Number of words: {len(text.split())}\n\n"
|
142 |
"Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
|
|
|
259 |
iface = gr.Blocks()
|
260 |
|
261 |
with iface:
|
262 |
+
gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/lol.webp" alt="Banner Image" />')
|
263 |
gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
|
264 |
|
265 |
with gr.Tabs():
|