Michael Natanael committed on
Commit 20332fc
1 Parent(s): db65dc2

change transcribe mechanism when uploading audio
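In short: instead of decoding the upload with torchaudio and handing whisper_api an {"array", "sampling_rate"} dict, the route now writes the upload to a temp file and passes the path straight to the transformers ASR pipeline, which decodes the file itself (ffmpeg must be available on the system for that). A minimal sketch of the two calling conventions, using the model id from the card linked in the code; "audio.wav" stands in for the saved upload:

from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

# Old mechanism: decode the audio yourself, pass raw samples plus their rate.
# result = pipe({"array": audio_array, "sampling_rate": sample_rate})

# New mechanism: pass a file path; the pipeline decodes and resamples it.
result = pipe("audio.wav", generate_kwargs={"language": "indonesian"})
print(result["text"])

Dropping chunk_length_s and batch_size also reverts to the pipeline's default long-form decoding, which is simpler but can be slower on long recordings.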

Files changed (1)
  1. app.py +6 -17
app.py CHANGED
@@ -1,6 +1,5 @@
 from flask import Flask, render_template, request
 # import whisper
-import torchaudio
 import tempfile
 import os
 import time
@@ -51,7 +50,7 @@ model = MultiClassModel.load_from_checkpoint(
 model.eval()
 
 
-def whisper_api(input_audio):
+def whisper_api(temp_audio_path):
     # https://huggingface.co/openai/whisper-large-v3
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@@ -70,13 +69,11 @@ def whisper_api(input_audio):
         model=model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
-        chunk_length_s=30,
-        batch_size=16,  # batch size for inference - set based on your device
         torch_dtype=torch_dtype,
         device=device,
     )
 
-    result = pipe(input_audio, return_timestamps=False, generate_kwargs={"language": "indonesian"})
+    result = pipe(temp_audio_path, return_timestamps=False, generate_kwargs={"language": "indonesian"})
     print(result["text"])
     return result
 
@@ -100,23 +97,15 @@ def transcribe():
 
     audio_file = request.files['file']
     if audio_file:
-        # Save uploaded file temporarily
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+        # Save uploaded audio to temp file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
             temp_audio.write(audio_file.read())
             temp_audio_path = temp_audio.name
 
-        # Load audio from bytes directly
-        waveform, sample_rate = torchaudio.load(temp_audio_path)
-        # Convert to mono if it is stereo
-        waveform = waveform.mean(dim=0, keepdim=True) if waveform.shape[0] > 1 else waveform
-        # Convert waveform to numpy
-        audio_array = waveform.squeeze(0).numpy()
-
-        os.remove(temp_audio_path)  # cleanup temp file
-
         # Step 1: Transcribe
         # transcription = whisper_model.transcribe(temp_audio_path, language="id")
-        transcription = whisper_api({"array": audio_array, "sampling_rate": sample_rate})
+        transcription = whisper_api(temp_audio_path)
+        os.remove(temp_audio_path)
         transcribed_text = transcription["text"]
 
         # Step 2: BERT Prediction
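Taken together, the new upload flow reads roughly as the self-contained sketch below. It is a trimmed reconstruction from the hunks above, not the full app.py: the checkpointed BERT model and the Step 2 prediction are elided, and the pipeline is built once at startup rather than inside whisper_api on every request as in the committed layout:

import os
import tempfile

import torch
from flask import Flask, request
from transformers import pipeline

app = Flask(__name__)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# https://huggingface.co/openai/whisper-large-v3
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)


@app.route("/transcribe", methods=["POST"])
def transcribe():
    audio_file = request.files["file"]
    if not audio_file:
        return "No file uploaded", 400

    # Save the upload to a temp file so the pipeline can decode it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio.write(audio_file.read())
        temp_audio_path = temp_audio.name

    try:
        # Step 1: Transcribe (Step 2, the BERT prediction, is elided here)
        result = pipe(temp_audio_path, return_timestamps=False,
                      generate_kwargs={"language": "indonesian"})
    finally:
        os.remove(temp_audio_path)  # clean up the temp file either way

    return result["text"]

Removing the temp file in a finally block is a small hardening over the committed code, which deletes it only after whisper_api returns successfully.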