import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC
from huggingface_hub import hf_hub_download

# OpenCC converter using the 's2t' configuration
# (Simplified Chinese -> Traditional Chinese).
converter = OpenCC('s2t')

# Sample rate (Hz) that audio is resampled to before being fed to the
# recognizer; also passed to the recognizer as its expected sample rate.
TARGET_SAMPLE_RATE = 16000

# Seconds of silence appended by finalize_stream() before the stream is
# marked as finished.
TAIL_PADDING_SECONDS = 0.66

# ASR model repository and file paths
REPO_ID = "pfluo/k2fsa-zipformer-chinese-english-mixed"
FILES = {
    "tokens": "data/lang_char_bpe/tokens.txt",
    "encoder": "exp/encoder-epoch-99-avg-1.int8.onnx",
    "decoder": "exp/decoder-epoch-99-avg-1.onnx",
    "joiner": "exp/joiner-epoch-99-avg-1.int8.onnx",
}

# Download and cache each file via HuggingFace Hub.
# NOTE: this runs at import time and maps each key of FILES to the local
# path returned by hf_hub_download().
LOCAL_PATHS = {}
for key, path in FILES.items():
    LOCAL_PATHS[key] = hf_hub_download(
        repo_id=REPO_ID,
        filename=path,
    )


# Audio resampling utility
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample *audio* from ``orig_sr`` Hz to ``target_sr`` Hz.

    Thin wrapper around ``scipy.signal.resample_poly`` with
    ``up=target_sr`` and ``down=orig_sr``.

    Args:
        audio: Samples to resample.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired output sample rate in Hz.

    Returns:
        The resampled samples.
    """
    return scipy.signal.resample_poly(audio, target_sr, orig_sr)


# Build the online recognizer (int8 encoder/joiner, see FILES)
def create_recognizer():
    """Create a streaming transducer recognizer from the downloaded files.

    Returns:
        The object returned by
        ``sherpa_onnx.OnlineRecognizer.from_transducer``, configured for CPU
        inference with one thread, 80-dim features, greedy search and a
        sample rate of ``TARGET_SAMPLE_RATE``.
    """
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=LOCAL_PATHS['tokens'],
        encoder=LOCAL_PATHS['encoder'],
        decoder=LOCAL_PATHS['decoder'],
        joiner=LOCAL_PATHS['joiner'],
        provider="cpu",
        num_threads=1,
        sample_rate=TARGET_SAMPLE_RATE,
        feature_dim=80,
        decoding_method="greedy_search",
    )


def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    """Feed one chunk of audio into *stream* and return the current result.

    Args:
        raw_pcm_bytes: Buffer that is interpreted as float32 samples via
            ``np.frombuffer``.
        stream: Recognizer stream that receives the resampled audio.
        recognizer: Recognizer used to decode ``stream``.
        orig_sr: Sample rate of the incoming samples in Hz.

    Returns:
        A ``(text, rms)`` tuple: the recognizer's current result for the
        stream passed through ``converter``, and the RMS level of the
        resampled chunk. An empty buffer yields ``("", 0.0)`` without
        touching the stream.
    """
    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0

    resampled = resample_audio(audio, orig_sr, TARGET_SAMPLE_RATE)
    rms = float(np.sqrt(np.mean(resampled ** 2)))

    stream.accept_waveform(TARGET_SAMPLE_RATE, resampled)

    # Drain everything that is ready, not just a single decode step;
    # otherwise a chunk longer than one decode step leaves frames queued and
    # the returned text lags behind the audio. This mirrors the loop in
    # finalize_stream().
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    result = recognizer.get_result(stream)
    return converter.convert(result), rms


def finalize_stream(stream, recognizer):
    """Flush *stream* and return the final recognition result.

    Appends ``TAIL_PADDING_SECONDS`` of zero-valued samples, marks the input
    as finished, decodes until the recognizer reports nothing left to do,
    and returns the result passed through ``converter``.

    Args:
        stream: Recognizer stream to finish.
        recognizer: Recognizer used to decode ``stream``.

    Returns:
        The converted final result for the stream.
    """
    tail = np.zeros(
        int(TAIL_PADDING_SECONDS * TARGET_SAMPLE_RATE), dtype=np.float32
    )
    stream.accept_waveform(TARGET_SAMPLE_RATE, tail)
    stream.input_finished()

    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    result = recognizer.get_result(stream)
    return converter.convert(result)