camparchimedes committed
Commit 8d12d9b · verified · 1 Parent(s): 05fd026

Update app.py

Files changed (1): app.py (+7 -7)
app.py CHANGED
@@ -85,7 +85,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
 
     # Resample the audio to 16000 Hz if it’s not already
     if sample_rate != 16000:
-        resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
         sample_rate = 16000
 
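For context, a minimal sketch of the load-and-resample step this hunk touches, assuming torchaudio is installed and audio_file is a local file path; the helper name load_as_16khz_mono is illustrative rather than taken from app.py.

# Illustrative sketch only; mirrors the resampling logic shown in the hunk.
import torch
import torchaudio

def load_as_16khz_mono(audio_file: str) -> torch.Tensor:
    # torchaudio.load returns (waveform, sample_rate), waveform shaped (channels, frames)
    waveform, sample_rate = torchaudio.load(audio_file)

    # Resample to 16000 Hz only when the source rate differs, as in the diff
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    # Downmix multi-channel audio to mono so downstream chunks are single-channel
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    return waveform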
 
@@ -105,11 +105,11 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
 
-        # Process the chunk
-        audio_input = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
+        # Process the chunk with the tokenizer
+        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
+        input_features = inputs.input_features
 
-        # Generate attention mask
-        input_features = audio_input.input_features
+        # Explicitly set the attention mask
         attention_mask = torch.ones(input_features.shape, dtype=torch.long)
 
         # ASR model inference on the chunk
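A hedged sketch of the per-chunk inference path around this hunk, assuming a Whisper-style processor and model pair. The NbAiLab/nb-whisper-large checkpoint name comes from the app description further down; the function name and the plain greedy generate() call are assumptions, and the diff's explicit all-ones attention mask is omitted here.

# Illustrative per-chunk transcription sketch; not the app's exact code.
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("NbAiLab/nb-whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLab/nb-whisper-large")

def transcribe_chunk(chunk_waveform: torch.Tensor, sample_rate: int = 16000) -> str:
    # The processor turns the raw 1-D waveform into log-mel input features
    inputs = processor(chunk_waveform.squeeze(0).numpy(),
                       sampling_rate=sample_rate,
                       return_tensors="pt")
    input_features = inputs.input_features

    # Greedy decoding of the chunk; generation options are kept minimal here
    predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]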
@@ -136,7 +136,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
     # Format of the result
     result = (
         f"Time taken: {output_time:.2f} seconds\n"
-        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
+        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
         f"Real-time Factor (RTF): {rtf:.2f}\n"
         f"Number of words: {len(text.split())}\n\n"
         "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
@@ -259,7 +259,7 @@ def save_to_pdf(text, summary):
 iface = gr.Blocks()
 
 with iface:
-    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')
+    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/lol.webp" alt="Banner Image" />')
     gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
 
     with gr.Tabs():
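A minimal, illustrative gr.Blocks skeleton in the shape this hunk edits; the banner URL, labels, and tab contents below are placeholders, not the app's real ones.

# Minimal, illustrative Gradio layout; URLs, labels, and tabs are placeholders.
import gradio as gr

iface = gr.Blocks()

with iface:
    gr.HTML('<img src="https://example.com/banner.png" alt="Banner Image" />')
    gr.Markdown("*Web app for transcribing audio files to Norwegian text.*")

    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_in = gr.Audio(type="filepath", label="Upload audio")
            text_out = gr.Textbox(label="Transcription")

if __name__ == "__main__":
    iface.launch()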
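The result string in the transcription hunk above reports a Real-time Factor. As a worked illustration of that arithmetic, RTF is processing time divided by audio duration, so values below 1.0 mean faster-than-real-time transcription; the helper below is illustrative, not part of app.py.

# Worked example of the RTF arithmetic referenced in the result string.
def real_time_factor(output_time: float, audio_duration: float) -> float:
    # RTF = seconds spent transcribing / seconds of audio transcribed
    return output_time / audio_duration

# 90 s of processing for a 300 s recording gives RTF = 0.30,
# i.e. roughly 3.3x faster than real time.
print(f"RTF: {real_time_factor(90.0, 300.0):.2f}")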
 