Michael Natanael committed
Commit 0080f77 · 1 Parent(s): 8a79172

change transcribe mechanism when uploading audio

Files changed (2):
  1. app.py (+19 -11)
  2. requirements.txt (+1 -0)
app.py CHANGED
@@ -1,6 +1,6 @@
 from flask import Flask, render_template, request
-from pydub import AudioSegment
 # import whisper
+import torchaudio
 import tempfile
 import os
 import time
@@ -100,20 +100,28 @@ def transcribe():
 
     audio_file = request.files['file']
     if audio_file:
-        # Read the uploaded mp3
-        audio = AudioSegment.from_file(audio_file, format="mp3")
-        audio = audio.set_channels(1)  # ensure mono
-        audio = audio.set_frame_rate(16000)  # ensure 16kHz sample rate
-
-        # Save to temp .wav
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-            audio.export(temp_audio, format="wav")
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+            temp_audio.write(audio_file.read())
             temp_audio_path = temp_audio.name
 
+        # Load audio (torchaudio can handle mp3 natively without ffmpeg)
+        waveform, sample_rate = torchaudio.load(temp_audio_path)
+
+        # Resample to 16kHz if needed
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+            waveform = resampler(waveform)
+
+        # Make sure audio is mono
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+
+        os.remove(temp_audio_path)  # cleanup temp file
+
         # Step 1: Transcribe
         # transcription = whisper_model.transcribe(temp_audio_path, language="id")
-        transcription = whisper_api(temp_audio_path)
-        os.remove(temp_audio_path)
+        transcription = whisper_api({"raw": waveform.numpy(), "sampling_rate": 16000})
         transcribed_text = transcription["text"]
 
         # Step 2: BERT Prediction
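The commit replaces the pydub/ffmpeg decode step with torchaudio and feeds the decoded waveform straight to whisper_api. Below is a minimal, hypothetical sketch of the resulting flow, assuming whisper_api wraps a Hugging Face automatic-speech-recognition pipeline; the pipeline setup, model name, and the transcribe_upload helper are illustrative and not part of this commit.

# Sketch only: whisper_api is assumed to wrap a transformers ASR pipeline.
import os
import tempfile

import torchaudio
from transformers import pipeline

# Assumption: the Space serves Whisper through the transformers ASR pipeline.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

def whisper_api(inputs):
    # The ASR pipeline accepts {"raw": 1-D float array, "sampling_rate": int}.
    return asr(inputs)

def transcribe_upload(file_storage):
    # Persist the upload so torchaudio can read it from disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        temp_audio.write(file_storage.read())
        temp_audio_path = temp_audio.name

    try:
        waveform, sample_rate = torchaudio.load(temp_audio_path)  # (channels, samples)
    finally:
        os.remove(temp_audio_path)  # clean up the temp file either way

    # Resample to the 16 kHz rate Whisper expects.
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    # Downmix to mono, then drop the channel axis: the pipeline expects a 1-D array.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    raw = waveform.squeeze(0).numpy()

    return whisper_api({"raw": raw, "sampling_rate": 16000})["text"]

Note that the committed code passes waveform.numpy(), which still carries a leading channel dimension of shape (1, samples); if the downstream pipeline expects a 1-D array, squeezing that axis as in the sketch may be needed.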
requirements.txt CHANGED
@@ -13,6 +13,7 @@ Werkzeug==1.0.1
 # ffmpeg-python
 # imageio[ffmpeg]
 pydub
+torchaudio
 accelerate
 pytorch-lightning==2.2.1
 lightning==2.4.0
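Whether torchaudio can decode mp3 without ffmpeg depends on the audio backend available in the deployment image, so a quick sanity check before dropping the ffmpeg-based path entirely can be useful; the snippet below is illustrative and the file name is a placeholder.

import torchaudio

# Show which decoding backends torchaudio detects in this environment.
print(torchaudio.list_audio_backends())

# Try decoding a known mp3 to confirm the backend handles the format.
waveform, sample_rate = torchaudio.load("sample.mp3")  # placeholder file name
print(waveform.shape, sample_rate)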