Spaces:

camparchimedes
/

nb

Build error

camparchimedes committed on Aug 22, 2024

Commit

6e9c279

verified ·

1 Parent(s): 46546e2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -104,16 +104,16 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
-        # Process the chunk with the tokenizer
         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
         input_features = inputs.input_features
         # Create attention mask
         attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
-        # Check the dimensions and values of the attention mask
-        assert attention_mask.shape == (1, input_features.shape[1]), "Attention mask dimensions do not match the input features."
-        assert (attention_mask.sum().item() == input_features.shape[1]), "Attention mask has incorrect values."
         # ASR model inference on the chunk
         with torch.no_grad():
@@ -264,7 +264,7 @@ def save_to_pdf(text, summary):
 iface = gr.Blocks()
 with iface:
-    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/banner_trans.png" width="100%" height="auto"/>')
     gr.Markdown("**Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
     with gr.Tabs():

         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
+        # Process chunk with tokenizer
         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
         input_features = inputs.input_features
         # Create attention mask
         attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
+        # Set the attention mask to zero for padding tokens
+        attention_mask[input_features == processor.tokenizer.pad_token_id] = 0
         # ASR model inference on the chunk
         with torch.no_grad():
 iface = gr.Blocks()
 with iface:
+    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/banner_trans.png" width="100%" height="auto"/>')
     gr.Markdown("**Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
     with gr.Tabs():