Spaces:
Running
Running
Michael Natanael
committed on
Commit
·
0080f77
1
Parent(s):
8a79172
change transcribe mechanism when uploading audio
Browse files
- app.py +19 -11
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from flask import Flask, render_template, request
|
2 |
-
from pydub import AudioSegment
|
3 |
# import whisper
|
|
|
4 |
import tempfile
|
5 |
import os
|
6 |
import time
|
@@ -100,20 +100,28 @@ def transcribe():
|
|
100 |
|
101 |
audio_file = request.files['file']
|
102 |
if audio_file:
|
103 |
-
#
|
104 |
-
|
105 |
-
|
106 |
-
audio = audio.set_frame_rate(16000) # ensure 16kHz sample rate
|
107 |
-
|
108 |
-
# Save to temp .wav
|
109 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
110 |
-
audio.export(temp_audio, format="wav")
|
111 |
temp_audio_path = temp_audio.name
|
112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
# Step 1: Transcribe
|
114 |
# transcription = whisper_model.transcribe(temp_audio_path, language="id")
|
115 |
-
transcription = whisper_api(
|
116 |
-
os.remove(temp_audio_path)
|
117 |
transcribed_text = transcription["text"]
|
118 |
|
119 |
# Step 2: BERT Prediction
|
|
|
1 |
from flask import Flask, render_template, request
|
|
|
2 |
# import whisper
|
3 |
+
import torchaudio
|
4 |
import tempfile
|
5 |
import os
|
6 |
import time
|
|
|
100 |
|
101 |
audio_file = request.files['file']
|
102 |
if audio_file:
|
103 |
+
# Save uploaded file temporarily
|
104 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
|
105 |
+
temp_audio.write(audio_file.read())
|
|
|
|
|
|
|
|
|
|
|
106 |
temp_audio_path = temp_audio.name
|
107 |
|
108 |
+
# Load audio (torchaudio can handle mp3 natively without ffmpeg)
|
109 |
+
waveform, sample_rate = torchaudio.load(temp_audio_path)
|
110 |
+
|
111 |
+
# Resample to 16kHz if needed
|
112 |
+
if sample_rate != 16000:
|
113 |
+
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
114 |
+
waveform = resampler(waveform)
|
115 |
+
|
116 |
+
# Make sure audio is mono
|
117 |
+
if waveform.shape[0] > 1:
|
118 |
+
waveform = waveform.mean(dim=0, keepdim=True)
|
119 |
+
|
120 |
+
os.remove(temp_audio_path) # cleanup temp file
|
121 |
+
|
122 |
# Step 1: Transcribe
|
123 |
# transcription = whisper_model.transcribe(temp_audio_path, language="id")
|
124 |
+
transcription = whisper_api({"raw": waveform.numpy(), "sampling_rate": 16000})
|
|
|
125 |
transcribed_text = transcription["text"]
|
126 |
|
127 |
# Step 2: BERT Prediction
|
requirements.txt
CHANGED
@@ -13,6 +13,7 @@ Werkzeug==1.0.1
|
|
13 |
# ffmpeg-python
|
14 |
# imageio[ffmpeg]
|
15 |
pydub
|
|
|
16 |
accelerate
|
17 |
pytorch-lightning==2.2.1
|
18 |
lightning==2.4.0
|
|
|
13 |
# ffmpeg-python
|
14 |
# imageio[ffmpeg]
|
15 |
pydub
|
16 |
+
torchaudio
|
17 |
accelerate
|
18 |
pytorch-lightning==2.2.1
|
19 |
lightning==2.4.0
|