Spaces:

camparchimedes
/

nb

Build error

App Files Files

camparchimedes committed on Aug 19, 2024

Commit

05fd026

verified ·

1 Parent(s): 7ef26c1

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -6

app.py CHANGED Viewed

@@ -76,10 +76,14 @@ def transcribe_audio(audio_file, chunk_length_s=30):
     start_time = time.time()
-    # Load waveform/torchaudio
     waveform, sample_rate = torchaudio.load(audio_file)
-    # Resample to 16000 Hz if 32K
     if sample_rate != 16000:
         resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
@@ -97,8 +101,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         end = min((i + 1) * chunk_size, waveform.shape[1])
         chunk_waveform = waveform[:, start:end]
         # Process the chunk
-        audio_input = processor(chunk_waveform, sampling_rate=sample_rate, return_tensors="pt")
         # Generate attention mask
         input_features = audio_input.input_features
@@ -114,12 +122,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
             chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
             full_text.append(chunk_text)
-    # Combine transcribed text/all chunks
     text = " ".join(full_text)
     output_time = time.time() - start_time
-    # Dduration (secs)
     audio_duration = waveform.shape[1] / sample_rate
     # Real-time Factor (RTF)
@@ -139,6 +147,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
     return text, result
 # Clean and preprocess/@summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
@@ -250,7 +259,7 @@ def save_to_pdf(text, summary):
 iface = gr.Blocks()
 with iface:
-    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/banner_trans.png" alt="Banner Image" />')
     gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
     with gr.Tabs():

     start_time = time.time()
+    # Load the audio waveform using torchaudio
     waveform, sample_rate = torchaudio.load(audio_file)
+    # Convert to mono if the audio has more than one channel
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    # Resample the audio to 16000 Hz if it’s not already
     if sample_rate != 16000:
         resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
         end = min((i + 1) * chunk_size, waveform.shape[1])
         chunk_waveform = waveform[:, start:end]
+        # Ensure the chunk waveform is properly shaped
+        if chunk_waveform.shape[0] > 1:
+            chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
         # Process the chunk
+        audio_input = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
         # Generate attention mask
         input_features = audio_input.input_features
             chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
             full_text.append(chunk_text)
+    # Combine the transcribed text from all chunks
     text = " ".join(full_text)
     output_time = time.time() - start_time
+    # Audio duration (in seconds)
     audio_duration = waveform.shape[1] / sample_rate
     # Real-time Factor (RTF)
     return text, result
 # Clean and preprocess/@summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
 iface = gr.Blocks()
 with iface:
+    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')
     gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
     with gr.Tabs():