camparchimedes committed on
Commit
8158ef0
·
verified ·
1 Parent(s): 9983004

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -5
app.py CHANGED
@@ -83,7 +83,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
83
  if waveform.shape[0] > 1:
84
  waveform = torch.mean(waveform, dim=0, keepdim=True)
85
 
86
- # Resample the audio to 16000 Hz if it’s not already
87
  if sample_rate != 16000:
88
  resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
89
  waveform = resampler(waveform)
@@ -109,9 +109,10 @@ def transcribe_audio(audio_file, chunk_length_s=30):
109
  inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
110
  input_features = inputs.input_features
111
 
112
- # Explicitly set the attention mask
113
- attention_mask = torch.ones(input_features.shape, dtype=torch.long)
114
-
 
115
  # ASR model inference on the chunk
116
  with torch.no_grad():
117
  generated_ids = model.generate(
@@ -258,7 +259,7 @@ def save_to_pdf(text, summary):
258
  iface = gr.Blocks()
259
 
260
  with iface:
261
- gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/lol.webp" alt="Banner Image" />')
262
  gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
263
 
264
  with gr.Tabs():
 
83
  if waveform.shape[0] > 1:
84
  waveform = torch.mean(waveform, dim=0, keepdim=True)
85
 
86
+ # Resample audio to 16000 Hz if it’s not already
87
  if sample_rate != 16000:
88
  resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
89
  waveform = resampler(waveform)
 
109
  inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
110
  input_features = inputs.input_features
111
 
112
+ # Create attention mask
113
+ attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
114
+ attention_mask[inputs.input_features == processor.pad_token_id] = 0
115
+
116
  # ASR model inference on the chunk
117
  with torch.no_grad():
118
  generated_ids = model.generate(
 
259
  iface = gr.Blocks()
260
 
261
  with iface:
262
+ gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image">')
263
  gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
264
 
265
  with gr.Tabs():