# Spaces: Running
import numpy as np
import scipy.signal
import sherpa_onnx
from huggingface_hub import hf_hub_download
from opencc import OpenCC
# Simplified-to-Traditional Chinese converter ('s2t' OpenCC config), applied
# to the recognizer's text output.
converter = OpenCC('s2t')

# ASR model repository and file paths
REPO_ID = "pfluo/k2fsa-zipformer-chinese-english-mixed"
FILES = {
    "tokens": "data/lang_char_bpe/tokens.txt",
    "encoder": "exp/encoder-epoch-99-avg-1.int8.onnx",
    "decoder": "exp/decoder-epoch-99-avg-1.onnx",
    "joiner": "exp/joiner-epoch-99-avg-1.int8.onnx",
}

# Download and cache each file via HuggingFace Hub.
# LOCAL_PATHS maps the same keys as FILES to the local path returned by
# hf_hub_download for that file.
LOCAL_PATHS = {}
for key, path in FILES.items():
    LOCAL_PATHS[key] = hf_hub_download(
        repo_id=REPO_ID,
        filename=path,
    )
# Audio resampling utility
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` with a polyphase filter.

    Args:
        audio: Samples to resample.
        orig_sr: Sample rate of ``audio``, used as the downsampling factor.
        target_sr: Desired sample rate, used as the upsampling factor.

    Returns:
        The resampled signal as returned by ``scipy.signal.resample_poly``.
    """
    # resample_poly(x, up, down): upsample by target_sr, downsample by orig_sr.
    return scipy.signal.resample_poly(audio, target_sr, orig_sr)
# Build the online recognizer with int8 weights
def create_recognizer():
    """Create a streaming transducer recognizer from the downloaded model files.

    The model paths come from the module-level ``LOCAL_PATHS`` mapping.

    Returns:
        The object returned by
        ``sherpa_onnx.OnlineRecognizer.from_transducer``, configured for the
        CPU provider with one thread, a 16000 Hz sample rate, 80-dimensional
        features and greedy-search decoding.
    """
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=LOCAL_PATHS['tokens'],
        encoder=LOCAL_PATHS['encoder'],
        decoder=LOCAL_PATHS['decoder'],
        joiner=LOCAL_PATHS['joiner'],
        provider="cpu",
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method="greedy_search",
    )
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    """Feed one chunk of raw PCM audio to the recognizer and return its text.

    Args:
        raw_pcm_bytes: Buffer interpreted as float32 samples. Its length must
            be a multiple of 4 bytes, otherwise ``np.frombuffer`` raises
            ``ValueError``.
        stream: Recognizer stream that receives the resampled waveform.
        recognizer: Recognizer used to decode ``stream`` and read its result.
        orig_sr: Sample rate of the incoming audio.

    Returns:
        A ``(text, rms)`` tuple: the current recognition result passed through
        ``converter.convert``, and the root-mean-square level of the resampled
        chunk. An empty chunk returns ``("", 0.0)`` without touching the
        stream.
    """
    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0

    resampled = resample_audio(audio, orig_sr, 16000)
    rms = float(np.sqrt(np.mean(resampled ** 2)))

    stream.accept_waveform(16000, resampled)
    # Drain every frame that is ready, not just one: a single chunk can hold
    # enough audio for several decode steps, and decoding only once would let
    # a backlog build up so the returned text lags behind the audio.
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    result = recognizer.get_result(stream)
    return converter.convert(result), rms
def finalize_stream(stream, recognizer, tail_seconds=0.66):
    """Flush the stream with trailing silence and return the final text.

    Args:
        stream: Recognizer stream to finish.
        recognizer: Recognizer used to decode ``stream`` and read its result.
        tail_seconds: Length of the zero-valued padding appended before the
            input is marked finished, in seconds at 16000 Hz. Defaults to
            0.66, the value previously hard-coded here.

    Returns:
        The recognition result after all remaining frames are decoded, passed
        through ``converter.convert``.
    """
    # Append silence so the last spoken frames can still be decoded
    # (presumably to cover the model's right context -- confirm against the
    # sherpa-onnx streaming examples).
    tail = np.zeros(int(tail_seconds * 16000), dtype=np.float32)
    stream.accept_waveform(16000, tail)
    stream.input_finished()

    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    result = recognizer.get_result(stream)
    return converter.convert(result)