Spaces:
Build error
Build error
Commit
·
39e4af1
1
Parent(s):
c65d563
added clean whisper asr implementation
Browse files- app.py +28 -24
- requirements.txt +1 -5
app.py
CHANGED
|
@@ -4,7 +4,8 @@ from scipy.io.wavfile import write
|
|
| 4 |
import gradio as gr
|
| 5 |
import os
|
| 6 |
from transformers import AutoProcessor, pipeline
|
| 7 |
-
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
|
|
|
|
| 8 |
from glob import glob
|
| 9 |
load_model()
|
| 10 |
|
|
@@ -12,27 +13,29 @@ BASE_PATH = os.path.dirname(os.path.abspath(__file__))
|
|
| 12 |
os.makedirs('input', exist_ok=True)
|
| 13 |
os.makedirs('separated', exist_ok=True)
|
| 14 |
|
| 15 |
-
print("Loading ASR model...")
|
| 16 |
-
processor = AutoProcessor.from_pretrained("openai/whisper-small")
|
| 17 |
-
if not os.path.exists("whisper_checkpoint"):
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
else:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
print("Whisper ASR model loaded.")
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def separator(audio, rec_audio, example):
|
| 38 |
outputs= {}
|
|
@@ -51,8 +54,9 @@ def separator(audio, rec_audio, example):
|
|
| 51 |
separated_files = [f for f in separated_files if "original.wav" not in f]
|
| 52 |
outputs['transcripts'] = []
|
| 53 |
for file in sorted(separated_files):
|
| 54 |
-
separated_audio = sio.wavfile.read(file)
|
| 55 |
-
outputs['transcripts'].append(speech_recognition_pipeline(separated_audio[1])['text'])
|
|
|
|
| 56 |
return sorted(separated_files) + outputs['transcripts']
|
| 57 |
|
| 58 |
def set_example_audio(example: list) -> dict:
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import os
|
| 6 |
from transformers import AutoProcessor, pipeline
|
| 7 |
+
# from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
|
| 8 |
+
import whisper
|
| 9 |
from glob import glob
|
| 10 |
load_model()
|
| 11 |
|
|
|
|
| 13 |
os.makedirs('input', exist_ok=True)
|
| 14 |
os.makedirs('separated', exist_ok=True)
|
| 15 |
|
| 16 |
+
# print("Loading ASR model...")
|
| 17 |
+
# processor = AutoProcessor.from_pretrained("openai/whisper-small")
|
| 18 |
+
# if not os.path.exists("whisper_checkpoint"):
|
| 19 |
+
# model = ORTModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small", from_transformers=True)
|
| 20 |
+
# speech_recognition_pipeline = pipeline(
|
| 21 |
+
# "automatic-speech-recognition",
|
| 22 |
+
# model=model,
|
| 23 |
+
# feature_extractor=processor.feature_extractor,
|
| 24 |
+
# tokenizer=processor.tokenizer,
|
| 25 |
+
# )
|
| 26 |
+
# os.makedirs('whisper_checkpoint', exist_ok=True)
|
| 27 |
+
# model.save_pretrained("whisper_checkpoint")
|
| 28 |
+
# else:
|
| 29 |
+
# model = ORTModelForSpeechSeq2Seq.from_pretrained("whisper_checkpoint", from_transformers=False)
|
| 30 |
+
# speech_recognition_pipeline = pipeline(
|
| 31 |
+
# "automatic-speech-recognition",
|
| 32 |
+
# model=model,
|
| 33 |
+
# feature_extractor=processor.feature_extractor,
|
| 34 |
+
# tokenizer=processor.tokenizer,
|
| 35 |
+
# )
|
| 36 |
+
# print("Whisper ASR model loaded.")
|
| 37 |
+
|
| 38 |
+
model = whisper.load_model("base")
|
| 39 |
|
| 40 |
def separator(audio, rec_audio, example):
|
| 41 |
outputs= {}
|
|
|
|
| 54 |
separated_files = [f for f in separated_files if "original.wav" not in f]
|
| 55 |
outputs['transcripts'] = []
|
| 56 |
for file in sorted(separated_files):
|
| 57 |
+
# separated_audio = sio.wavfile.read(file)
|
| 58 |
+
# outputs['transcripts'].append(speech_recognition_pipeline(separated_audio[1])['text'])
|
| 59 |
+
# Transcribe with the model loaded at import time: the module-level whisper.transcribe() expects the model as its first argument.
outputs['transcripts'].append(model.transcribe(file)["text"])
|
| 60 |
return sorted(separated_files) + outputs['transcripts']
|
| 61 |
|
| 62 |
def set_example_audio(example: list) -> dict:
|
requirements.txt
CHANGED
|
@@ -6,13 +6,9 @@ pystoi==0.3.3
|
|
| 6 |
librosa==0.7.1
|
| 7 |
numba==0.48
|
| 8 |
numpy
|
| 9 |
-
flask
|
| 10 |
-
flask-cors
|
| 11 |
-
uvicorn[standard]
|
| 12 |
asgiref
|
| 13 |
gradio
|
| 14 |
-
transformers==4.24.0
|
| 15 |
torch
|
| 16 |
torchvision
|
| 17 |
torchaudio
|
| 18 |
-
|
|
|
|
| 6 |
librosa==0.7.1
|
| 7 |
numba==0.48
|
| 8 |
numpy
|
|
|
|
|
|
|
|
|
|
| 9 |
asgiref
|
| 10 |
gradio
|
|
|
|
| 11 |
torch
|
| 12 |
torchvision
|
| 13 |
torchaudio
|
| 14 |
+
openai-whisper
|