camparchimedes committed on
Commit
05fd026
·
verified ·
1 Parent(s): 7ef26c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -76,10 +76,14 @@ def transcribe_audio(audio_file, chunk_length_s=30):
76
 
77
  start_time = time.time()
78
 
79
- # Load waveform/torchaudio
80
  waveform, sample_rate = torchaudio.load(audio_file)
81
 
82
- # Resample to 16000 Hz if 32K
 
 
 
 
83
  if sample_rate != 16000:
84
  resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
85
  waveform = resampler(waveform)
@@ -97,8 +101,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
97
  end = min((i + 1) * chunk_size, waveform.shape[1])
98
  chunk_waveform = waveform[:, start:end]
99
 
 
 
 
 
100
  # Process the chunk
101
- audio_input = processor(chunk_waveform, sampling_rate=sample_rate, return_tensors="pt")
102
 
103
  # Generate attention mask
104
  input_features = audio_input.input_features
@@ -114,12 +122,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
114
  chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
115
  full_text.append(chunk_text)
116
 
117
- # Combine transcribed text/all chunks
118
  text = " ".join(full_text)
119
 
120
  output_time = time.time() - start_time
121
 
122
- # Dduration (secs)
123
  audio_duration = waveform.shape[1] / sample_rate
124
 
125
  # Real-time Factor (RTF)
@@ -139,6 +147,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
139
  return text, result
140
 
141
 
 
142
  # Clean and preprocess/@summarization
143
  def clean_text(text):
144
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
@@ -250,7 +259,7 @@ def save_to_pdf(text, summary):
250
  iface = gr.Blocks()
251
 
252
  with iface:
253
- gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/banner_trans.png" alt="Banner Image" />')
254
  gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
255
 
256
  with gr.Tabs():
 
76
 
77
  start_time = time.time()
78
 
79
+ # Load the audio waveform using torchaudio
80
  waveform, sample_rate = torchaudio.load(audio_file)
81
 
82
+ # Convert to mono if the audio has more than one channel
83
+ if waveform.shape[0] > 1:
84
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
85
+
86
+ # Resample the audio to 16000 Hz if it’s not already
87
  if sample_rate != 16000:
88
  resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
89
  waveform = resampler(waveform)
 
101
  end = min((i + 1) * chunk_size, waveform.shape[1])
102
  chunk_waveform = waveform[:, start:end]
103
 
104
+ # Ensure the chunk waveform is properly shaped
105
+ if chunk_waveform.shape[0] > 1:
106
+ chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
107
+
108
  # Process the chunk
109
+ audio_input = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
110
 
111
  # Generate attention mask
112
  input_features = audio_input.input_features
 
122
  chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
123
  full_text.append(chunk_text)
124
 
125
+ # Combine the transcribed text from all chunks
126
  text = " ".join(full_text)
127
 
128
  output_time = time.time() - start_time
129
 
130
+ # Audio duration (in seconds)
131
  audio_duration = waveform.shape[1] / sample_rate
132
 
133
  # Real-time Factor (RTF)
 
147
  return text, result
148
 
149
 
150
+
151
  # Clean and preprocess/@summarization
152
  def clean_text(text):
153
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
 
259
  iface = gr.Blocks()
260
 
261
  with iface:
262
+ gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')
263
  gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
264
 
265
  with gr.Tabs():