Michael Natanael committed on
Commit 20332fc
1 Parent(s): db65dc2

change transcribe mechanism when uploading audio
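In short: instead of decoding the upload with torchaudio and handing whisper_api an {"array", "sampling_rate"} dict, the route now writes the upload to a temp file and passes the path straight to the transformers ASR pipeline, which decodes the file itself (ffmpeg must be available on the system for that). A minimal sketch of the two calling conventions, using the model id from the card linked in the code; "audio.wav" stands in for the saved upload:

from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

# Old mechanism: decode the audio yourself, pass raw samples plus their rate.
# result = pipe({"array": audio_array, "sampling_rate": sample_rate})

# New mechanism: pass a file path; the pipeline decodes and resamples it.
result = pipe("audio.wav", generate_kwargs={"language": "indonesian"})
print(result["text"])

Dropping chunk_length_s and batch_size also reverts to the pipeline's default long-form decoding, which is simpler but can be slower on long recordings.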

Files changed (1)
  1. app.py +6 -17
app.py CHANGED
@@ -1,6 +1,5 @@
 from flask import Flask, render_template, request
 # import whisper
-import torchaudio
 import tempfile
 import os
 import time
@@ -51,7 +50,7 @@ model = MultiClassModel.load_from_checkpoint(
 model.eval()
 
 
-def whisper_api(input_audio):
+def whisper_api(temp_audio_path):
     # https://huggingface.co/openai/whisper-large-v3
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@@ -70,13 +69,11 @@ def whisper_api(input_audio):
         model=model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
-        chunk_length_s=30,
-        batch_size=16,  # batch size for inference - set based on your device
         torch_dtype=torch_dtype,
         device=device,
     )
 
-    result = pipe(input_audio, return_timestamps=False, generate_kwargs={"language": "indonesian"})
+    result = pipe(temp_audio_path, return_timestamps=False, generate_kwargs={"language": "indonesian"})
     print(result["text"])
     return result
 
@@ -100,23 +97,15 @@ def transcribe():
 
     audio_file = request.files['file']
     if audio_file:
-        # Save uploaded file temporarily
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+        # Save uploaded audio to temp file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
             temp_audio.write(audio_file.read())
             temp_audio_path = temp_audio.name
 
-        # Load audio from bytes directly
-        waveform, sample_rate = torchaudio.load(temp_audio_path)
-        # Convert to mono if it is stereo
-        waveform = waveform.mean(dim=0, keepdim=True) if waveform.shape[0] > 1 else waveform
-        # Convert waveform to numpy
-        audio_array = waveform.squeeze(0).numpy()
-
-        os.remove(temp_audio_path)  # cleanup temp file
-
         # Step 1: Transcribe
         # transcription = whisper_model.transcribe(temp_audio_path, language="id")
-        transcription = whisper_api({"array": audio_array, "sampling_rate": sample_rate})
+        transcription = whisper_api(temp_audio_path)
+        os.remove(temp_audio_path)
         transcribed_text = transcription["text"]
 
         # Step 2: BERT Prediction
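Taken together, the new upload flow reads roughly as the self-contained sketch below. It is a trimmed reconstruction from the hunks above, not the full app.py: the checkpointed BERT model and the Step 2 prediction are elided, and the pipeline is built once at startup rather than inside whisper_api on every request as in the committed layout:

import os
import tempfile

import torch
from flask import Flask, request
from transformers import pipeline

app = Flask(__name__)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# https://huggingface.co/openai/whisper-large-v3
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)


@app.route("/transcribe", methods=["POST"])
def transcribe():
    audio_file = request.files["file"]
    if not audio_file:
        return "No file uploaded", 400

    # Save the upload to a temp file so the pipeline can decode it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio.write(audio_file.read())
        temp_audio_path = temp_audio.name

    try:
        # Step 1: Transcribe (Step 2, the BERT prediction, is elided here)
        result = pipe(temp_audio_path, return_timestamps=False,
                      generate_kwargs={"language": "indonesian"})
    finally:
        os.remove(temp_audio_path)  # clean up the temp file either way

    return result["text"]

Removing the temp file in a finally block is a small hardening over the committed code, which deletes it only after whisper_api returns successfully.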