Spaces:
Runtime error
Runtime error
Michael Natanael
committed on
Commit
·
0080f77
1
Parent(s):
8a79172
change transcribe mechanism when uploading audio
Browse files- app.py +19 -11
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from flask import Flask, render_template, request
|
| 2 |
-
from pydub import AudioSegment
|
| 3 |
# import whisper
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
import time
|
|
@@ -100,20 +100,28 @@ def transcribe():
|
|
| 100 |
|
| 101 |
audio_file = request.files['file']
|
| 102 |
if audio_file:
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
audio = audio.set_frame_rate(16000) # ensure 16kHz sample rate
|
| 107 |
-
|
| 108 |
-
# Save to temp .wav
|
| 109 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
| 110 |
-
audio.export(temp_audio, format="wav")
|
| 111 |
temp_audio_path = temp_audio.name
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# Step 1: Transcribe
|
| 114 |
# transcription = whisper_model.transcribe(temp_audio_path, language="id")
|
| 115 |
-
transcription = whisper_api(
|
| 116 |
-
os.remove(temp_audio_path)
|
| 117 |
transcribed_text = transcription["text"]
|
| 118 |
|
| 119 |
# Step 2: BERT Prediction
|
|
|
|
| 1 |
from flask import Flask, render_template, request
|
|
|
|
| 2 |
# import whisper
|
| 3 |
+
import torchaudio
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
import time
|
|
|
|
| 100 |
|
| 101 |
audio_file = request.files['file']
|
| 102 |
if audio_file:
|
| 103 |
+
# Save uploaded file temporarily
|
| 104 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
|
| 105 |
+
temp_audio.write(audio_file.read())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
temp_audio_path = temp_audio.name
|
| 107 |
|
| 108 |
+
# Load audio (torchaudio can handle mp3 natively without ffmpeg)
|
| 109 |
+
waveform, sample_rate = torchaudio.load(temp_audio_path)
|
| 110 |
+
|
| 111 |
+
# Resample to 16kHz if needed
|
| 112 |
+
if sample_rate != 16000:
|
| 113 |
+
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
| 114 |
+
waveform = resampler(waveform)
|
| 115 |
+
|
| 116 |
+
# Make sure audio is mono
|
| 117 |
+
if waveform.shape[0] > 1:
|
| 118 |
+
waveform = waveform.mean(dim=0, keepdim=True)
|
| 119 |
+
|
| 120 |
+
os.remove(temp_audio_path) # cleanup temp file
|
| 121 |
+
|
| 122 |
# Step 1: Transcribe
|
| 123 |
# transcription = whisper_model.transcribe(temp_audio_path, language="id")
|
| 124 |
+
transcription = whisper_api({"raw": waveform.numpy(), "sampling_rate": 16000})
|
|
|
|
| 125 |
transcribed_text = transcription["text"]
|
| 126 |
|
| 127 |
# Step 2: BERT Prediction
|
requirements.txt
CHANGED
|
@@ -13,6 +13,7 @@ Werkzeug==1.0.1
|
|
| 13 |
# ffmpeg-python
|
| 14 |
# imageio[ffmpeg]
|
| 15 |
pydub
|
|
|
|
| 16 |
accelerate
|
| 17 |
pytorch-lightning==2.2.1
|
| 18 |
lightning==2.4.0
|
|
|
|
| 13 |
# ffmpeg-python
|
| 14 |
# imageio[ffmpeg]
|
| 15 |
pydub
|
| 16 |
+
torchaudio
|
| 17 |
accelerate
|
| 18 |
pytorch-lightning==2.2.1
|
| 19 |
lightning==2.4.0
|