camparchimedes committed on
Commit
6e9c279
·
verified ·
1 Parent(s): 46546e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -104,16 +104,16 @@ def transcribe_audio(audio_file, chunk_length_s=30):
104
  if chunk_waveform.shape[0] > 1:
105
  chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
106
 
107
- # Process the chunk with the tokenizer
108
  inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
109
  input_features = inputs.input_features
110
 
111
  # Create attention mask
112
  attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
113
 
114
- # Check the dimensions and values of the attention mask
115
- assert attention_mask.shape == (1, input_features.shape[1]), "Attention mask dimensions do not match the input features."
116
- assert (attention_mask.sum().item() == input_features.shape[1]), "Attention mask has incorrect values."
117
 
118
  # ASR model inference on the chunk
119
  with torch.no_grad():
@@ -264,7 +264,7 @@ def save_to_pdf(text, summary):
264
  iface = gr.Blocks()
265
 
266
  with iface:
267
- gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/banner_trans.png" width="100%" height="auto"/>')
268
  gr.Markdown("**Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
269
 
270
  with gr.Tabs():
 
104
  if chunk_waveform.shape[0] > 1:
105
  chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
106
 
107
+ # Process chunk with the processor to obtain the model's input features
108
  inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
109
  input_features = inputs.input_features
110
 
111
  # Create attention mask
112
  attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
113
 
114
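+ # Set the attention mask to zero for padding tokens (NOTE(review): input_features is built from the audio waveform, not token ids, and has more dims than the shape[:2] mask — confirm this comparison/indexing is valid)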
115
+ attention_mask[input_features == processor.tokenizer.pad_token_id] = 0
116
+
117
 
118
  # ASR model inference on the chunk
119
  with torch.no_grad():
 
264
  iface = gr.Blocks()
265
 
266
  with iface:
267
+ gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/resolve/main/banner_trans.png" width="100%" height="auto"/>')
268
  gr.Markdown("**Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
269
 
270
  with gr.Tabs():