File size: 2,084 Bytes
2318eae
 
221a9c5
231cd3a
ab9b679
231cd3a
 
 
b26d94c
ab9b679
 
 
 
 
 
 
 
b26d94c
 
 
 
 
 
 
ab9b679
b26d94c
 
 
ab9b679
 
2318eae
 
b26d94c
 
 
 
2318eae
 
 
 
 
 
 
7c3f2af
2318eae
2a8a9a5
 
 
7c3f2af
2a8a9a5
 
 
2318eae
 
 
231cd3a
2318eae
 
 
 
 
 
 
231cd3a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC
from huggingface_hub import hf_hub_download

# Shared OpenCC instance loaded with the "s2t" conversion profile; the
# streaming helpers in this module pass every transcript through it.
converter = OpenCC("s2t")

# Hugging Face repository that hosts the transducer model, followed by the
# repo-relative path of each artifact the recognizer is built from.
REPO_ID = "pfluo/k2fsa-zipformer-chinese-english-mixed"
FILES = dict(
    tokens="data/lang_char_bpe/tokens.txt",
    encoder="exp/encoder-epoch-99-avg-1.int8.onnx",
    decoder="exp/decoder-epoch-99-avg-1.onnx",
    joiner="exp/joiner-epoch-99-avg-1.int8.onnx",
)

# Resolve every model artifact to a local filesystem path by asking the
# HuggingFace Hub client for it (one hf_hub_download call per entry of FILES).
LOCAL_PATHS = {
    name: hf_hub_download(repo_id=REPO_ID, filename=remote_path)
    for name, remote_path in FILES.items()
}

# Sample-rate conversion helper
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Convert *audio* from ``orig_sr`` to ``target_sr`` samples per second.

    The work is delegated to ``scipy.signal.resample_poly`` with
    ``up=target_sr`` and ``down=orig_sr``; nothing else is done to the signal.

    Args:
        audio: Samples to resample.
        orig_sr: Sample rate the input was captured at.
        target_sr: Sample rate the output should have.

    Returns:
        The resampled signal as produced by ``resample_poly``.
    """
    converted = scipy.signal.resample_poly(
        audio,
        up=target_sr,
        down=orig_sr,
    )
    return converted

# Build the online (streaming) recognizer from the downloaded model files
def create_recognizer():
    """Construct a transducer-based online recognizer.

    Model file locations are read from the module-level ``LOCAL_PATHS``
    mapping; the remaining settings (CPU provider, one thread, 16000 Hz
    input, 80-dimensional features, greedy search) are fixed here.

    Returns:
        The object returned by
        ``sherpa_onnx.OnlineRecognizer.from_transducer``.
    """
    model_files = {
        part: LOCAL_PATHS[part]
        for part in ("tokens", "encoder", "decoder", "joiner")
    }
    runtime_options = dict(
        provider="cpu",
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method="greedy_search",
    )
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        **model_files,
        **runtime_options,
    )

def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    """Feed one chunk of raw audio into *stream* and return the transcript so far.

    Args:
        raw_pcm_bytes: Bytes-like buffer interpreted as float32 samples.
            ``np.frombuffer`` raises ``ValueError`` if its length is not a
            multiple of the float32 item size.
        stream: Recognizer stream that receives the audio through
            ``accept_waveform``.
        recognizer: Recognizer that owns *stream*; used for ``is_ready``,
            ``decode_streams`` and ``get_result``.
        orig_sr: Sample rate of the incoming chunk; it is resampled to
            16000 Hz before being handed to the stream.

    Returns:
        A ``(text, rms)`` tuple: the current recognition result passed
        through the module-level ``converter``, and the root-mean-square
        level of the resampled chunk. An empty chunk yields ``("", 0.0)``
        without touching the stream.
    """
    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0

    resampled = resample_audio(audio, orig_sr, 16000)
    rms = float(np.sqrt(np.mean(resampled ** 2)))

    stream.accept_waveform(16000, resampled)
    # Drain everything that is ready, not just one step: a single chunk can
    # make several decoding steps available, and decoding only once would
    # leave a backlog so the returned text lags behind the audio. This
    # mirrors the loop used in finalize_stream.
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])
    result = recognizer.get_result(stream)
    return converter.convert(result), rms

def finalize_stream(stream, recognizer):
    """Flush *stream* and return its final converted transcript.

    A block of zero-valued float32 samples (0.66 s at 16000 Hz) is appended,
    the stream is marked as finished, and decoding is repeated until the
    recognizer reports that nothing more is ready.

    Args:
        stream: Recognizer stream to close out.
        recognizer: Recognizer that owns *stream*.

    Returns:
        The recognizer's result for the stream after it has been passed
        through the module-level ``converter``.
    """
    sample_rate = 16000
    padding = np.zeros(int(0.66 * sample_rate), dtype=np.float32)
    stream.accept_waveform(sample_rate, padding)
    stream.input_finished()

    while True:
        if not recognizer.is_ready(stream):
            break
        recognizer.decode_streams([stream])

    return converter.convert(recognizer.get_result(stream))