import os
from pathlib import Path

import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC
from huggingface_hub import hf_hub_download

# Ensure the Hugging Face cache lives in a user-writable directory
CACHE_DIR = Path(__file__).parent / "hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Convert recognizer output from Simplified to Traditional Chinese
converter = OpenCC('s2t')

# Streaming Zipformer model registry: file paths are relative to each HF repo root
STREAMING_ZIPFORMER_MODELS = {
    "pfluo/k2fsa-zipformer-chinese-english-mixed": {
        "tokens": "data/lang_char_bpe/tokens.txt",
        "encoder_fp32": "exp/encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "exp/encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "exp/decoder-epoch-99-avg-1.onnx",
        "decoder_int8": None,
        "joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx",
    },
    "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
    "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-20-avg-1-chunk-16-left-128.onnx",
        "encoder_int8": "encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
        "decoder_fp32": "decoder-epoch-20-avg-1-chunk-16-left-128.onnx",
        "decoder_int8": "decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
        "joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx",
        "joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
    },
    "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
        "tokens": "data/lang_char/tokens.txt",
        "encoder_fp32": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx",
        "encoder_int8": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
        "decoder_fp32": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx",
        "decoder_int8": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
        "joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
        "joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1-chunk-16-left-128.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1-chunk-16-left-128.onnx",
        "decoder_int8": None,
        "joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
    "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-29-avg-9-with-averaged-model.onnx",
        "encoder_int8": "encoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
        "decoder_fp32": "decoder-epoch-29-avg-9-with-averaged-model.onnx",
        "decoder_int8": "decoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
        "joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx",
        "joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
    "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
        "tokens": "tokens.txt",
        "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
        "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
        "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
        "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
        "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
        "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
    },
}

# Audio resampling utility (polyphase resampling to the recognizer's sample rate)
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    return scipy.signal.resample_poly(audio, target_sr, orig_sr)

# Create an online recognizer for a given model and precision.
#   model_id: full HF repo ID registered in STREAMING_ZIPFORMER_MODELS
#   precision: "int8" or "fp32"
def create_recognizer(model_id: str, precision: str):
    if model_id not in STREAMING_ZIPFORMER_MODELS:
        raise ValueError(f"Model '{model_id}' is not registered.")
    entry = STREAMING_ZIPFORMER_MODELS[model_id]

    tokens_file = entry['tokens']
    encoder_file = entry['encoder_int8'] if precision == 'int8' else entry['encoder_fp32']
    # The fp32 decoder is used for both precisions; some repos ship no int8 decoder
    # (their "decoder_int8" entry above is None).
    decoder_file = entry['decoder_fp32']
    joiner_file = entry['joiner_int8'] if precision == 'int8' else entry['joiner_fp32']

    tokens_path = hf_hub_download(repo_id=model_id, filename=tokens_file, cache_dir=str(CACHE_DIR))
    encoder_path = hf_hub_download(repo_id=model_id, filename=encoder_file, cache_dir=str(CACHE_DIR))
    decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR))
    joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR))

    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=tokens_path,
        encoder=encoder_path,
        decoder=decoder_path,
        joiner=joiner_path,
        provider="cpu",
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method="greedy_search",
    )
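
# Example usage (a sketch only; the repo ID and precision below are illustrative
# picks from the registry, and the front-end wiring that calls this in the Space
# is not shown here):
#   recognizer = create_recognizer(
#       "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20",
#       "int8",
#   )
#   stream = recognizer.create_stream()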

def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    # Interpret the incoming chunk as float32 PCM
    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0
    resampled = resample_audio(audio, orig_sr, 16000)
    # RMS of the resampled chunk, returned alongside the partial transcript
    rms = float(np.sqrt(np.mean(resampled ** 2)))
    stream.accept_waveform(16000, resampled)
    # Drain every frame that is ready so decoding keeps pace with incoming audio
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])
    result = recognizer.get_result(stream)
    return converter.convert(result), rms

def finalize_stream(stream, recognizer):
    # Pad with ~0.66 s of silence so the last buffered frames are flushed through the model
    tail = np.zeros(int(0.66 * 16000), dtype=np.float32)
    stream.accept_waveform(16000, tail)
    stream.input_finished()
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])
    result = recognizer.get_result(stream)
    return converter.convert(result)
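
# Minimal local smoke test (a sketch, not part of the Space's UI wiring): it builds a
# recognizer from one registered repo, pushes a synthetic 48 kHz tone through
# stream_audio in 100 ms chunks, then finalizes the stream. The repo ID, precision,
# chunk size, and test signal are illustrative assumptions; a real caller would feed
# microphone PCM from the front end instead.
def _smoke_test():
    recognizer = create_recognizer(
        "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20",
        "int8",
    )
    stream = recognizer.create_stream()

    sr = 48000
    t = np.arange(sr, dtype=np.float32) / sr  # one second of samples
    tone = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)
    chunk = sr // 10  # 100 ms per call

    for start in range(0, tone.size, chunk):
        text, rms = stream_audio(
            tone[start:start + chunk].tobytes(), stream, recognizer, sr
        )
        print(f"partial: {text!r} rms={rms:.4f}")
    print("final:", finalize_stream(stream, recognizer))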