Spaces:
Running
Running
File size: 2,084 Bytes
2318eae 221a9c5 231cd3a ab9b679 231cd3a b26d94c ab9b679 b26d94c ab9b679 b26d94c ab9b679 2318eae b26d94c 2318eae 7c3f2af 2318eae 2a8a9a5 7c3f2af 2a8a9a5 2318eae 231cd3a 2318eae 231cd3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC
from huggingface_hub import hf_hub_download
# Text converter built from OpenCC's 's2t' configuration; applied to every
# transcript before it is returned to the caller.
converter = OpenCC('s2t')

# Hugging Face repository that hosts the ASR model, and the files needed from it.
REPO_ID = "pfluo/k2fsa-zipformer-chinese-english-mixed"
FILES = {
    "tokens": "data/lang_char_bpe/tokens.txt",
    "encoder": "exp/encoder-epoch-99-avg-1.int8.onnx",
    "decoder": "exp/decoder-epoch-99-avg-1.onnx",
    "joiner": "exp/joiner-epoch-99-avg-1.int8.onnx",
}

# Resolve every model file to a local path through the Hugging Face Hub
# (hf_hub_download fetches the file and returns where it lives on disk).
LOCAL_PATHS = {
    key: hf_hub_download(repo_id=REPO_ID, filename=path)
    for key, path in FILES.items()
}
# Audio resampling utility
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample *audio* from ``orig_sr`` to ``target_sr``.

    Delegates to ``scipy.signal.resample_poly``, passing ``target_sr`` as the
    upsampling factor and ``orig_sr`` as the downsampling factor.

    Args:
        audio: Samples to resample.
        orig_sr: Sample rate of ``audio``.
        target_sr: Desired sample rate of the result.

    Returns:
        The resampled signal as produced by ``resample_poly``.
    """
    up_factor, down_factor = target_sr, orig_sr
    return scipy.signal.resample_poly(audio, up=up_factor, down=down_factor)
# Build the online recognizer (int8 encoder and joiner; the decoder file is not int8)
def create_recognizer():
    """Construct the streaming transducer recognizer from the downloaded files.

    The tokens, encoder, decoder and joiner paths come from ``LOCAL_PATHS``.
    The recognizer is configured for the CPU provider with a single thread,
    a 16000 Hz sample rate, 80-dimensional features and greedy-search decoding.

    Returns:
        The object returned by ``sherpa_onnx.OnlineRecognizer.from_transducer``.
    """
    model_files = {
        name: LOCAL_PATHS[name]
        for name in ("tokens", "encoder", "decoder", "joiner")
    }
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        **model_files,
        provider="cpu",
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method="greedy_search",
    )
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    """Feed one chunk of raw audio into the stream and return the transcript so far.

    Args:
        raw_pcm_bytes: Bytes-like buffer, interpreted as float32 samples.
        stream: Recognizer stream that receives the waveform.
        recognizer: Recognizer used to decode ``stream`` and read its result.
        orig_sr: Sample rate of the incoming samples; they are resampled
            to 16000 Hz before being handed to the stream.

    Returns:
        A ``(text, rms)`` tuple: the current result after conversion by
        ``converter``, and the root-mean-square level of the resampled chunk.
        An empty chunk yields ``("", 0.0)`` without touching the stream.

    Raises:
        ValueError: If the buffer length is not a multiple of the float32
            item size (raised by ``np.frombuffer``).
    """
    target_sr = 16000

    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0

    resampled = resample_audio(audio, orig_sr, target_sr)
    rms = float(np.sqrt(np.mean(resampled ** 2)))
    stream.accept_waveform(target_sr, resampled)

    # Drain everything that is ready, not just one decode step: a chunk may
    # hold more frames than a single step consumes, and leaving the rest
    # queued makes the transcript lag further behind with every call.
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    result = recognizer.get_result(stream)
    return converter.convert(result), rms
def finalize_stream(stream, recognizer):
    """Flush the stream and return the final converted transcript.

    Appends 0.66 seconds of silence (at 16000 Hz) to the stream, marks the
    input as finished, decodes until the recognizer reports nothing left to
    process, and then passes the result through ``converter``.

    Args:
        stream: Recognizer stream to finish.
        recognizer: Recognizer used to decode ``stream`` and read its result.

    Returns:
        The converted final result for ``stream``.
    """
    sample_rate = 16000
    # Trailing silence fed in before the end-of-input signal.
    silence = np.zeros(int(0.66 * sample_rate), dtype=np.float32)
    stream.accept_waveform(sample_rate, silence)
    stream.input_finished()

    while True:
        if not recognizer.is_ready(stream):
            break
        recognizer.decode_streams([stream])

    final_text = recognizer.get_result(stream)
    return converter.convert(final_text)
|