Michael Natanael committed
Commit 0080f77 · 1 Parent(s): 8a79172

change transcribe mechanism when uploading audio

Files changed (2):
  1. app.py (+19 -11)
  2. requirements.txt (+1 -0)
app.py CHANGED
@@ -1,6 +1,6 @@
 from flask import Flask, render_template, request
-from pydub import AudioSegment
 # import whisper
+import torchaudio
 import tempfile
 import os
 import time
@@ -100,20 +100,28 @@ def transcribe():
 
     audio_file = request.files['file']
     if audio_file:
-        # Read the uploaded mp3
-        audio = AudioSegment.from_file(audio_file, format="mp3")
-        audio = audio.set_channels(1)  # ensure mono
-        audio = audio.set_frame_rate(16000)  # ensure 16kHz sample rate
-
-        # Save to temp .wav
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-            audio.export(temp_audio, format="wav")
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+            temp_audio.write(audio_file.read())
             temp_audio_path = temp_audio.name
 
+        # Load audio (torchaudio can handle mp3 natively without ffmpeg)
+        waveform, sample_rate = torchaudio.load(temp_audio_path)
+
+        # Resample to 16kHz if needed
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+            waveform = resampler(waveform)
+
+        # Make sure audio is mono
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+
+        os.remove(temp_audio_path)  # cleanup temp file
+
         # Step 1: Transcribe
         # transcription = whisper_model.transcribe(temp_audio_path, language="id")
-        transcription = whisper_api(temp_audio_path)
-        os.remove(temp_audio_path)
+        transcription = whisper_api({"raw": waveform.numpy(), "sampling_rate": 16000})
         transcribed_text = transcription["text"]
 
         # Step 2: BERT Prediction
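The commit replaces the pydub/ffmpeg decode step with torchaudio and feeds the decoded waveform straight to whisper_api. Below is a minimal, hypothetical sketch of the resulting flow, assuming whisper_api wraps a Hugging Face automatic-speech-recognition pipeline; the pipeline setup, model name, and the transcribe_upload helper are illustrative and not part of this commit.

# Sketch only: whisper_api is assumed to wrap a transformers ASR pipeline.
import os
import tempfile

import torchaudio
from transformers import pipeline

# Assumption: the Space serves Whisper through the transformers ASR pipeline.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

def whisper_api(inputs):
    # The ASR pipeline accepts {"raw": 1-D float array, "sampling_rate": int}.
    return asr(inputs)

def transcribe_upload(file_storage):
    # Persist the upload so torchaudio can read it from disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        temp_audio.write(file_storage.read())
        temp_audio_path = temp_audio.name

    try:
        waveform, sample_rate = torchaudio.load(temp_audio_path)  # (channels, samples)
    finally:
        os.remove(temp_audio_path)  # clean up the temp file either way

    # Resample to the 16 kHz rate Whisper expects.
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    # Downmix to mono, then drop the channel axis: the pipeline expects a 1-D array.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    raw = waveform.squeeze(0).numpy()

    return whisper_api({"raw": raw, "sampling_rate": 16000})["text"]

Note that the committed code passes waveform.numpy(), which still carries a leading channel dimension of shape (1, samples); if the downstream pipeline expects a 1-D array, squeezing that axis as in the sketch may be needed.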
requirements.txt CHANGED
@@ -13,6 +13,7 @@ Werkzeug==1.0.1
 # ffmpeg-python
 # imageio[ffmpeg]
 pydub
+torchaudio
 accelerate
 pytorch-lightning==2.2.1
 lightning==2.4.0
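Whether torchaudio can decode mp3 without ffmpeg depends on the audio backend available in the deployment image, so a quick sanity check before dropping the ffmpeg-based path entirely can be useful; the snippet below is illustrative and the file name is a placeholder.

import torchaudio

# Show which decoding backends torchaudio detects in this environment.
print(torchaudio.list_audio_backends())

# Try decoding a known mp3 to confirm the backend handles the format.
waveform, sample_rate = torchaudio.load("sample.mp3")  # placeholder file name
print(waveform.shape, sample_rate)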