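"""Gradio demo for onnx-asr: compare several ASR models on short phrases and transcribe long audio with Silero VAD."""
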
import logging
from importlib.metadata import version
from timeit import default_timer as timer
import gradio as gr
import numpy as np
import onnx_asr

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.info("onnx_asr version: %s", version("onnx_asr"))
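
# Load the Silero VAD model and all ASR models once at import time,
# so each request only pays for inference.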
vad = onnx_asr.load_vad("silero")
whisper = {name: onnx_asr.load_model(name) for name in ["whisper-base"]}
models_ru = {
name: onnx_asr.load_model(name)
for name in [
"gigaam-v2-ctc",
"gigaam-v2-rnnt",
"nemo-fastconformer-ru-ctc",
"nemo-fastconformer-ru-rnnt",
"alphacep/vosk-model-ru",
"alphacep/vosk-model-small-ru",
]
}
models_en = {
name: onnx_asr.load_model(name)
for name in [
"nemo-parakeet-ctc-0.6b",
"nemo-parakeet-rnnt-0.6b",
"nemo-parakeet-tdt-0.6b-v2",
]
}
models_vad = whisper | models_ru | models_en
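
# Run each model in `models` on the recorded audio and collect
# [model name, transcript, elapsed time] rows for the results table.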
def recognize(audio: tuple[int, np.ndarray], models, language):
if audio is None:
return None
sample_rate, waveform = audio
length = waveform.shape[-1] / sample_rate
logger.debug("recognize: length %.3f, sample_rate %s, waveform.shape %s.", length, sample_rate, waveform.shape)
try:
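        # Gradio delivers integer PCM samples; scale by 2**(bits - 1) to get
        # float32 in [-1, 1], then average the channels to downmix stereo to mono.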
waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
if waveform.ndim == 2:
waveform = waveform.mean(axis=1)
results = []
for name, model in models.items():
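            # Skip the small Vosk model on audio longer than 20 seconds.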
if length > 20 and name == "alphacep/vosk-model-small-ru":
continue
start = timer()
result = model.recognize(waveform, sample_rate=sample_rate, language=language)
time = timer() - start
logger.debug("recognized by %s: result '%s', time %.3f s.", name, result, time)
results.append([name, result, f"{time:.3f} s."])
except Exception as e:
raise gr.Error(f"{e} Audio: sample_rate: {sample_rate}, waveform.shape: {waveform.shape}.") from e
else:
return results

def recognize_ru(audio: tuple[int, np.ndarray]):
    return recognize(audio, models_ru | whisper, "ru")

def recognize_en(audio: tuple[int, np.ndarray]):
    return recognize(audio, models_en | whisper, "en")

def recognize_with_vad(audio: tuple[int, np.ndarray], name: str):
if audio is None:
return None
sample_rate, waveform = audio
logger.debug("recognize: sample_rate %s, waveform.shape %s.", sample_rate, waveform.shape)
try:
waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
if waveform.ndim == 2:
waveform = waveform.mean(axis=1)
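        # Wrap the selected model with Silero VAD and stream out one line
        # per detected speech segment, with its start/end timestamps.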
model = models_vad[name].with_vad(vad, batch_size=1)
results = ""
for res in model.recognize(waveform, sample_rate=sample_rate):
logger.debug("recognized by %s: result '%s'.", name, res)
results += f"[{res.start:5.1f}, {res.end:5.1f}]: {res.text}\n"
yield results
except Exception as e:
raise gr.Error(f"{e} Audio: sample_rate: {sample_rate}, waveform.shape: {waveform.shape}.") from e
with gr.Blocks() as recognize_short:
audio = gr.Audio(min_length=1, max_length=30)
with gr.Row():
gr.ClearButton(audio)
btn_ru = gr.Button("Recognize (ru)", variant="primary")
btn_en = gr.Button("Recognize (en)", variant="primary")
output = gr.Dataframe(headers=["model", "result", "time"], wrap=True)
btn_ru.click(fn=recognize_ru, inputs=audio, outputs=output)
btn_en.click(fn=recognize_en, inputs=audio, outputs=output)
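
# Tab 2: recognize a long phrase (up to 5 min) with one model plus VAD.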
with gr.Blocks() as recognize_long:
name = gr.Dropdown(models_vad.keys(), label="Model")
# lang = gr.Label()
audio = gr.Audio(min_length=1, max_length=300)
with gr.Row():
gr.ClearButton(audio)
btn = gr.Button("Recognize", variant="primary")
output = gr.TextArea(label="result")
btn.click(fn=recognize_with_vad, inputs=[audio, name], outputs=output)
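
    # Show a language hint on the audio input when a monolingual model is selected.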
def on_model_change(name: str):
if name in models_ru:
label = f"Model {name} support only Russian language"
elif name in models_en:
label = f"Model {name} support only English language"
else:
label = None
return gr.Audio(min_length=1, max_length=300, label=label)
name.change(on_model_change, inputs=name, outputs=audio)
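
# Assemble the full demo: intro text, both tabs, and the model reference list.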
with gr.Blocks() as demo:
gr.Markdown("""
# ASR demo using onnx-asr
**[onnx-asr](https://github.com/istupakov/onnx-asr)** is a Python package for Automatic Speech Recognition using ONNX models.
The package is written in pure Python with minimal dependencies (no `pytorch` or `transformers`).
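
    A minimal sketch of calling onnx-asr directly (the same calls this demo makes; the model name, the 16 kHz rate, and the silent `waveform` array are only illustrative):

    ```python
    import numpy as np
    import onnx_asr

    model = onnx_asr.load_model("gigaam-v2-ctc")
    waveform = np.zeros(16000, dtype=np.float32)  # one second of silence as a stand-in
    text = model.recognize(waveform, sample_rate=16000)
    ```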
""")
gr.TabbedInterface(
[recognize_short, recognize_long],
[
"Recognition of a short phrase (up to 30 sec.)",
"Recognition of a long phrase with VAD (up to 5 min.)",
],
)
with gr.Accordion("Models used in this demo:", open=False):
gr.Markdown("""
## Russian ASR models
* `gigaam-v2-ctc` - Sber GigaAM v2 CTC ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
* `gigaam-v2-rnnt` - Sber GigaAM v2 RNN-T ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
* `nemo-fastconformer-ru-ctc` - Nvidia FastConformer-Hybrid Large (ru) with CTC decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
* `nemo-fastconformer-ru-rnnt` - Nvidia FastConformer-Hybrid Large (ru) with RNN-T decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
* `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
* `alphacep/vosk-model-ru` - Alpha Cephei Vosk 0.54-ru ([origin](https://huggingface.co/alphacep/vosk-model-ru))
* `alphacep/vosk-model-small-ru` - Alpha Cephei Vosk 0.52-small-ru ([origin](https://huggingface.co/alphacep/vosk-model-small-ru))
## English ASR models
* `nemo-parakeet-ctc-0.6b` - Nvidia Parakeet CTC 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-ctc-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-ctc-0.6b-onnx))
* `nemo-parakeet-rnnt-0.6b` - Nvidia Parakeet RNNT 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-rnnt-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-rnnt-0.6b-onnx))
* `nemo-parakeet-tdt-0.6b-v2` - Nvidia Parakeet TDT 0.6B V2 (en) ([origin](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2), [onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx))
* `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
## VAD models
* `silero` - Silero VAD ([origin](https://github.com/snakers4/silero-vad), [onnx](https://huggingface.co/onnx-community/silero-vad))
""")

demo.launch()