# app/asr_worker.py
import os
from pathlib import Path
import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC
from huggingface_hub import hf_hub_download
from typing import List, Optional
import tempfile
from sentencepiece import SentencePieceProcessor
# Ensure Hugging Face cache is in a user-writable directory
CACHE_DIR = Path(__file__).parent / "hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
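# OpenCC converters: recognition results are rendered in Traditional Chinese,
# while hotwords are normalized to Simplified Chinese before biasing.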
to_ZHTW = OpenCC('s2t')
to_ZHCN = OpenCC('t2s')
# Streaming Zipformer model registry.
# Keys are Hugging Face repo IDs; file paths are relative to each model repo's root.
# fp16 ONNX files are registered under the *_fp32 keys, which act as the
# non-int8 slot consumed by choose_file().
STREAMING_ZIPFORMER_MODELS = {
# csukuangfj/sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder.fp16.onnx",
"encoder_int8": None,
"decoder_fp32": "decoder.fp16.onnx",
"decoder_int8": None,
"joiner_fp32": "joiner.fp16.onnx",
"joiner_int8": None,
"modeling_unit":"cjkchar",
"bpe_model": None,
},
    # csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder.fp16.onnx",
"encoder_int8": None,
"decoder_fp32": "decoder.fp16.onnx",
"decoder_int8": None,
"joiner_fp32": "joiner.fp16.onnx",
"joiner_int8": None,
"modeling_unit":"cjkchar+bpe",
"bpe_model": "bpe.model",
},
# csukuangfj/sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30": {
"tokens": "tokens.txt",
"encoder_fp32": None,
"encoder_int8": "encoder.int8.onnx",
"decoder_fp32": "decoder.onnx",
"decoder_int8": None,
"joiner_fp32": None,
"joiner_int8": "joiner.int8.onnx",
"modeling_unit":"cjkchar",
"bpe_model": None,
},
# csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30": {
"tokens": "tokens.txt",
"encoder_fp32": None,
"encoder_int8": "encoder.int8.onnx",
"decoder_fp32": "decoder.onnx",
"decoder_int8": None,
"joiner_fp32": None,
"joiner_int8": "joiner.int8.onnx",
"modeling_unit":"cjkchar+bpe",
"bpe_model": "bpe.model",
},
# bilingual zh-en with char+BPE
"csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": {
"tokens": "data/lang_char_bpe/tokens.txt",
"encoder_fp32": "exp/96/encoder-epoch-99-avg-1.onnx",
"encoder_int8": "exp/96/encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "exp/96/decoder-epoch-99-avg-1.onnx",
"decoder_int8": "exp/96/decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "exp/96/joiner-epoch-99-avg-1.onnx",
"joiner_int8": "exp/96/joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"cjkchar+bpe",
"bpe_model": "data/lang_char_bpe/bpe.model",
},
# mixed Chinese+English (char+BPE)
"pfluo/k2fsa-zipformer-chinese-english-mixed": {
"tokens": "data/lang_char_bpe/tokens.txt",
"encoder_fp32": "exp/encoder-epoch-99-avg-1.onnx",
"encoder_int8": "exp/encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "exp/decoder-epoch-99-avg-1.onnx",
"decoder_int8": None,
"joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx",
"joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"cjkchar+bpe",
"bpe_model": "data/lang_char_bpe/bpe.model",
},
# Korean-only (CJK chars)
"k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
"encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1.onnx",
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"cjkchar",
"bpe_model": "bpe.model",
},
# multi Chinese (Hans) (CJK chars)
"k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-20-avg-1-chunk-16-left-128.onnx",
"encoder_int8": "encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
"decoder_fp32": "decoder-epoch-20-avg-1-chunk-16-left-128.onnx",
"decoder_int8": "decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
"joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx",
"joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
"modeling_unit":"cjkchar",
"bpe_model": "bpe.model",
},
# wenetspeech streaming (CJK chars)
"pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
"tokens": "data/lang_char/tokens.txt",
"encoder_fp32": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx",
"encoder_int8": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
"decoder_fp32": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx",
"decoder_int8": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
"joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
"joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
"modeling_unit":"cjkchar",
"bpe_model": None,
},
# English-only (BPE)
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1-chunk-16-left-128.onnx",
"encoder_int8": "encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1-chunk-16-left-128.onnx",
"decoder_int8": None,
"joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
"joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
"modeling_unit":"bpe",
"bpe_model": "bpe.model",
},
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
"encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1.onnx",
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"bpe",
"bpe_model": None,
},
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
"encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1.onnx",
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"bpe",
"bpe_model": None,
},
# older bilingual zh-en (cjkchar+BPE) – no bpe.vocab shipped
"csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
"encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1.onnx",
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"cjkchar+bpe",
"bpe_model": "bpe.model",
},
# French-only (BPE)
"shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-29-avg-9-with-averaged-model.onnx",
"encoder_int8": "encoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
"decoder_fp32": "decoder-epoch-29-avg-9-with-averaged-model.onnx",
"decoder_int8": "decoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
"joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx",
"joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx",
"modeling_unit":"bpe",
"bpe_model": None,
},
# Chinese-only small (CJK chars)
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
"encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1.onnx",
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"cjkchar",
"bpe_model": None,
},
# English-only 20M (BPE)
"csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
"encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
"decoder_fp32": "decoder-epoch-99-avg-1.onnx",
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
"modeling_unit":"bpe",
"bpe_model": None,
},
"csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10": {
"tokens": "tokens.txt",
"encoder_fp32": "encoder-epoch-75-avg-11-chunk-16-left-128.int8.onnx",
"encoder_int8": None,
"decoder_fp32": "decoder-epoch-75-avg-11-chunk-16-left-128.onnx",
"decoder_int8": None,
"joiner_fp32": "joiner-epoch-75-avg-11-chunk-16-left-128.int8.onnx",
"joiner_int8": None,
"modeling_unit":"cjkchar+bpe",
"bpe_model": "bpe.model",
},
}
# Audio resampling utility
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
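    """Resample a mono float32 waveform from orig_sr to target_sr using polyphase filtering."""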
return scipy.signal.resample_poly(audio, target_sr, orig_sr)
def choose_file(entry: dict, component: str, precision: str) -> str | None:
    """
    Pick the best file for the given component ('encoder', 'decoder', or 'joiner')
    without checking whether the path exists (e.g. before downloading).

    1) If exactly one candidate (int8 or fp32) is registered, return it.
    2) Otherwise return the int8 file when precision == 'int8' and it is
       available, else the fp32 file.
    """
    e8 = entry.get(f'{component}_int8')
    e32 = entry.get(f'{component}_fp32')
    # 1) If exactly one is present, pick it
    if (e8 is None) != (e32 is None):
        return e8 or e32
    # 2) Otherwise fall back to "int8 if requested & available, else fp32"
    return e8 if precision == 'int8' and e8 else e32
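# For example, for the int8-only "...-zh-int8-2025-06-30" entry above,
# choose_file(entry, "encoder", "fp32") still returns "encoder.int8.onnx"
# because it is the sole registered candidate, while the fp16 entries return
# their fp16 files (stored in the *_fp32 slot) for either requested precision.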
# Create an online recognizer for a given model and precision
# model_id: full HF repo ID
# precision: "int8" or "fp32"
def create_recognizer(
model_id: str,
precision: str,
    hotwords: Optional[List[str]] = None,
hotwords_score: float = 0.0,
ep_rule1: float = 2.4,
ep_rule2: float = 1.2,
ep_rule3: int = 300,
):
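    """
    Build a sherpa-onnx OnlineRecognizer for a registered streaming Zipformer model.

    When hotwords are supplied with a positive hotwords_score, modified beam
    search with hotword biasing is used; otherwise greedy search. ep_rule1 and
    ep_rule2 are the minimum trailing-silence durations (seconds) for endpoint
    rules 1 and 2, and ep_rule3 sets rule3_min_utterance_length.
    """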
if model_id not in STREAMING_ZIPFORMER_MODELS:
raise ValueError(f"Model '{model_id}' is not registered.")
entry = STREAMING_ZIPFORMER_MODELS[model_id]
tokens_file = entry['tokens']
encoder_file = choose_file(entry, 'encoder', precision)
decoder_file = choose_file(entry, 'decoder', precision)
joiner_file = choose_file(entry, 'joiner', precision)
tokens_path = hf_hub_download(repo_id=model_id, filename=tokens_file, cache_dir=str(CACHE_DIR))
encoder_path = hf_hub_download(repo_id=model_id, filename=encoder_file, cache_dir=str(CACHE_DIR))
decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR))
joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR))
# Prepare BPE vocab from .model if provided
modeling_unit = entry.get("modeling_unit")
bpe_model_rel = entry.get("bpe_model")
bpe_vocab_path = None
if bpe_model_rel:
try:
bpe_model_path = hf_hub_download(model_id, bpe_model_rel, cache_dir=str(CACHE_DIR))
print(f"[DEBUG] Downloaded bpe model: {bpe_model_path}")
# === export_bpe_vocab.py logic starts here ===
sp = SentencePieceProcessor()
sp.Load(str(bpe_model_path))
vocab_file = Path(CACHE_DIR) / f"{Path(bpe_model_rel).stem}.vocab"
with open(vocab_file, "w", encoding="utf-8") as vf:
for idx in range(sp.get_piece_size()):
piece = sp.id_to_piece(idx)
score = sp.get_score(idx)
vf.write(f"{piece}\t{score}\n")
bpe_vocab_path = str(vocab_file)
print(f"[DEBUG] Converted bpe model to vocab: {bpe_vocab_path}")
# === export_bpe_vocab.py logic ends here ===
except Exception as e:
print(f"[WARNING] Failed to build BPE vocab from '{bpe_model_rel}': {e}")
bpe_vocab_path = None
    # Use beam-search hotword biasing only when hotwords are provided with a
    # positive score and, for BPE-based models, only if a BPE vocab was built.
    has_hot = bool(hotwords and hotwords_score > 0.0)
    use_beam = has_hot and ("bpe" not in modeling_unit or bpe_vocab_path is not None)
if use_beam:
# Write hotword list to a temp file (one entry per line)
tf = tempfile.NamedTemporaryFile(
mode="w", delete=False, suffix=".txt", dir=str(CACHE_DIR)
)
for w in hotwords:
# Remove backslashes and angle-bracket tokens
clean = w.replace("\\", "").replace("<unk>", "").strip()
            clean = to_ZHCN.convert(clean)  # normalize hotwords to Simplified Chinese for Simplified-Chinese models
if clean: # only write non-empty lines
tf.write(f"{clean}\n")
tf.flush()
tf.close()
hotwords_file_path = tf.name
print(f"[DEBUG asr_worker] Written {len(hotwords)} hotwords to {hotwords_file_path} with score {hotwords_score}")
        # Create a beam-search recognizer with hotword biasing
return sherpa_onnx.OnlineRecognizer.from_transducer(
tokens=tokens_path,
encoder=encoder_path,
decoder=decoder_path,
joiner=joiner_path,
provider="cpu",
num_threads=1,
sample_rate=16000,
feature_dim=80,
decoding_method="modified_beam_search",
hotwords_file=hotwords_file_path,
hotwords_score=hotwords_score,
modeling_unit=modeling_unit,
bpe_vocab=bpe_vocab_path,
# endpoint detection parameters
enable_endpoint_detection=True,
rule1_min_trailing_silence=ep_rule1,
rule2_min_trailing_silence=ep_rule2,
rule3_min_utterance_length=ep_rule3,
)
    # --- Fallback to the default greedy search (no hotword biasing) ---
return sherpa_onnx.OnlineRecognizer.from_transducer(
tokens=tokens_path,
encoder=encoder_path,
decoder=decoder_path,
joiner=joiner_path,
provider="cpu",
num_threads=1,
sample_rate=16000,
feature_dim=80,
decoding_method="greedy_search",
# endpoint detection parameters
enable_endpoint_detection=True,
rule1_min_trailing_silence=ep_rule1,
rule2_min_trailing_silence=ep_rule2,
rule3_min_utterance_length=ep_rule3,
)
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
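    """
    Feed one chunk of raw float32 PCM into the streaming recognizer.

    The chunk is resampled from orig_sr to 16 kHz, pushed into the stream, and
    decoded when enough frames are available. Returns the current transcript
    (converted to Traditional Chinese; a no-op for non-Chinese text) and the
    RMS level of the resampled chunk.
    """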
audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
if audio.size == 0:
return "", 0.0
resampled = resample_audio(audio, orig_sr, 16000)
rms = float(np.sqrt(np.mean(resampled ** 2)))
stream.accept_waveform(16000, resampled)
if recognizer.is_ready(stream):
recognizer.decode_streams([stream])
result = recognizer.get_result(stream)
return to_ZHTW.convert(result), rms
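# Illustrative usage sketch (not part of the worker API): the model ID,
# precision, source sample rate, and raw-PCM file name are example assumptions.
if __name__ == "__main__":
    recognizer = create_recognizer(
        "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17",
        precision="int8",
    )
    stream = recognizer.create_stream()
    # "chunk.f32" is a hypothetical file of little-endian float32 mono samples
    with open("chunk.f32", "rb") as f:
        text, rms = stream_audio(f.read(), stream, recognizer, orig_sr=48000)
    print(f"partial: {text!r}  rms: {rms:.4f}")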