Spaces:
Running
Running
Add hotwords (i.e., speech context) demo
Browse files- app/asr_worker.py +91 -2
- app/main.py +15 -3
- app/static/index.html +62 -4
app/asr_worker.py
CHANGED
@@ -5,6 +5,8 @@ import sherpa_onnx
|
|
5 |
import scipy.signal
|
6 |
from opencc import OpenCC
|
7 |
from huggingface_hub import hf_hub_download
|
|
|
|
|
8 |
|
9 |
# Ensure Hugging Face cache is in a user-writable directory
|
10 |
CACHE_DIR = Path(__file__).parent / "hf_cache"
|
@@ -14,6 +16,7 @@ converter = OpenCC('s2t')
|
|
14 |
|
15 |
# Streaming Zipformer model registry: paths relative to repo root
|
16 |
STREAMING_ZIPFORMER_MODELS = {
|
|
|
17 |
"csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": {
|
18 |
"tokens": "data/lang_char_bpe/tokens.txt",
|
19 |
"encoder_fp32": "exp/96/encoder-epoch-99-avg-1.onnx",
|
@@ -22,7 +25,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
22 |
"decoder_int8": "exp/96/decoder-epoch-99-avg-1.int8.onnx",
|
23 |
"joiner_fp32": "exp/96/joiner-epoch-99-avg-1.onnx",
|
24 |
"joiner_int8": "exp/96/joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
25 |
},
|
|
|
26 |
"pfluo/k2fsa-zipformer-chinese-english-mixed": {
|
27 |
"tokens": "data/lang_char_bpe/tokens.txt",
|
28 |
"encoder_fp32": "exp/encoder-epoch-99-avg-1.onnx",
|
@@ -31,7 +37,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
31 |
"decoder_int8": None,
|
32 |
"joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx",
|
33 |
"joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
34 |
},
|
|
|
35 |
"k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
|
36 |
"tokens": "tokens.txt",
|
37 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
@@ -40,7 +49,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
40 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
41 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
42 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
43 |
},
|
|
|
44 |
"k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
|
45 |
"tokens": "tokens.txt",
|
46 |
"encoder_fp32": "encoder-epoch-20-avg-1-chunk-16-left-128.onnx",
|
@@ -49,7 +61,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
49 |
"decoder_int8": "decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
|
50 |
"joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx",
|
51 |
"joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
|
|
|
|
|
52 |
},
|
|
|
53 |
"pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
|
54 |
"tokens": "data/lang_char/tokens.txt",
|
55 |
"encoder_fp32": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx",
|
@@ -58,7 +73,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
58 |
"decoder_int8": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
|
59 |
"joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
|
60 |
"joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
|
|
|
|
|
61 |
},
|
|
|
62 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
|
63 |
"tokens": "tokens.txt",
|
64 |
"encoder_fp32": "encoder-epoch-99-avg-1-chunk-16-left-128.onnx",
|
@@ -67,6 +85,8 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
67 |
"decoder_int8": None,
|
68 |
"joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
|
69 |
"joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
|
|
|
|
|
70 |
},
|
71 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
|
72 |
"tokens": "tokens.txt",
|
@@ -76,6 +96,8 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
76 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
77 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
78 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
79 |
},
|
80 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
|
81 |
"tokens": "tokens.txt",
|
@@ -85,7 +107,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
85 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
86 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
87 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
88 |
},
|
|
|
89 |
"csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
|
90 |
"tokens": "tokens.txt",
|
91 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
@@ -94,7 +119,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
94 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
95 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
96 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
97 |
},
|
|
|
98 |
"shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
|
99 |
"tokens": "tokens.txt",
|
100 |
"encoder_fp32": "encoder-epoch-29-avg-9-with-averaged-model.onnx",
|
@@ -103,7 +131,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
103 |
"decoder_int8": "decoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
|
104 |
"joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx",
|
105 |
"joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx",
|
|
|
|
|
106 |
},
|
|
|
107 |
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
|
108 |
"tokens": "tokens.txt",
|
109 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
@@ -112,7 +143,10 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
112 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
113 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
114 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
115 |
},
|
|
|
116 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
|
117 |
"tokens": "tokens.txt",
|
118 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
@@ -121,6 +155,8 @@ STREAMING_ZIPFORMER_MODELS = {
|
|
121 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
122 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
123 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
|
|
|
|
124 |
},
|
125 |
}
|
126 |
|
@@ -131,7 +167,12 @@ def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarra
|
|
131 |
# Create an online recognizer for a given model and precision
|
132 |
# model_id: full HF repo ID
|
133 |
# precision: "int8" or "fp32"
|
134 |
-
def create_recognizer(
|
|
|
|
|
|
|
|
|
|
|
135 |
if model_id not in STREAMING_ZIPFORMER_MODELS:
|
136 |
raise ValueError(f"Model '{model_id}' is not registered.")
|
137 |
entry = STREAMING_ZIPFORMER_MODELS[model_id]
|
@@ -146,6 +187,54 @@ def create_recognizer(model_id: str, precision: str):
|
|
146 |
decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR))
|
147 |
joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR))
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
return sherpa_onnx.OnlineRecognizer.from_transducer(
|
150 |
tokens=tokens_path,
|
151 |
encoder=encoder_path,
|
@@ -155,7 +244,7 @@ def create_recognizer(model_id: str, precision: str):
|
|
155 |
num_threads=1,
|
156 |
sample_rate=16000,
|
157 |
feature_dim=80,
|
158 |
-
decoding_method="greedy_search"
|
159 |
)
|
160 |
|
161 |
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
|
|
|
5 |
import scipy.signal
|
6 |
from opencc import OpenCC
|
7 |
from huggingface_hub import hf_hub_download
|
8 |
+
from typing import List
|
9 |
+
import tempfile
|
10 |
|
11 |
# Ensure Hugging Face cache is in a user-writable directory
|
12 |
CACHE_DIR = Path(__file__).parent / "hf_cache"
|
|
|
16 |
|
17 |
# Streaming Zipformer model registry: paths relative to repo root
|
18 |
STREAMING_ZIPFORMER_MODELS = {
|
19 |
+
# bilingual zh-en with char+BPE
|
20 |
"csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": {
|
21 |
"tokens": "data/lang_char_bpe/tokens.txt",
|
22 |
"encoder_fp32": "exp/96/encoder-epoch-99-avg-1.onnx",
|
|
|
25 |
"decoder_int8": "exp/96/decoder-epoch-99-avg-1.int8.onnx",
|
26 |
"joiner_fp32": "exp/96/joiner-epoch-99-avg-1.onnx",
|
27 |
"joiner_int8": "exp/96/joiner-epoch-99-avg-1.int8.onnx",
|
28 |
+
"modeling_unit":"cjkchar+bpe",
|
29 |
+
"bpe_vocab": "data/lang_char_bpe/bpe.vocab",
|
30 |
},
|
31 |
+
# mixed Chinese+English (char+BPE)
|
32 |
"pfluo/k2fsa-zipformer-chinese-english-mixed": {
|
33 |
"tokens": "data/lang_char_bpe/tokens.txt",
|
34 |
"encoder_fp32": "exp/encoder-epoch-99-avg-1.onnx",
|
|
|
37 |
"decoder_int8": None,
|
38 |
"joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx",
|
39 |
"joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx",
|
40 |
+
"modeling_unit":"cjkchar+bpe",
|
41 |
+
"bpe_vocab": "data/lang_char_bpe/bpe.vocab",
|
42 |
},
|
43 |
+
# Korean-only (CJK chars)
|
44 |
"k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
|
45 |
"tokens": "tokens.txt",
|
46 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
|
|
49 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
50 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
51 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
52 |
+
"modeling_unit":"cjkchar",
|
53 |
+
"bpe_vocab": None,
|
54 |
},
|
55 |
+
# multi Chinese (Hans) (CJK chars)
|
56 |
"k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
|
57 |
"tokens": "tokens.txt",
|
58 |
"encoder_fp32": "encoder-epoch-20-avg-1-chunk-16-left-128.onnx",
|
|
|
61 |
"decoder_int8": "decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
|
62 |
"joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx",
|
63 |
"joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
|
64 |
+
"modeling_unit":"cjkchar",
|
65 |
+
"bpe_vocab": None,
|
66 |
},
|
67 |
+
# wenetspeech streaming (CJK chars)
|
68 |
"pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
|
69 |
"tokens": "data/lang_char/tokens.txt",
|
70 |
"encoder_fp32": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx",
|
|
|
73 |
"decoder_int8": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
|
74 |
"joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
|
75 |
"joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
|
76 |
+
"modeling_unit":"cjkchar",
|
77 |
+
"bpe_vocab": None,
|
78 |
},
|
79 |
+
# English-only (BPE)
|
80 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
|
81 |
"tokens": "tokens.txt",
|
82 |
"encoder_fp32": "encoder-epoch-99-avg-1-chunk-16-left-128.onnx",
|
|
|
85 |
"decoder_int8": None,
|
86 |
"joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
|
87 |
"joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
|
88 |
+
"modeling_unit":"bpe",
|
89 |
+
"bpe_vocab": None,
|
90 |
},
|
91 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
|
92 |
"tokens": "tokens.txt",
|
|
|
96 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
97 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
98 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
99 |
+
"modeling_unit":"bpe",
|
100 |
+
"bpe_vocab": None,
|
101 |
},
|
102 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
|
103 |
"tokens": "tokens.txt",
|
|
|
107 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
108 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
109 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
110 |
+
"modeling_unit":"bpe",
|
111 |
+
"bpe_vocab": None,
|
112 |
},
|
113 |
+
# older bilingual zh-en (cjkchar+BPE) - no bpe.vocab shipped
|
114 |
"csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
|
115 |
"tokens": "tokens.txt",
|
116 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
|
|
119 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
120 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
121 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
122 |
+
"modeling_unit":"cjkchar+bpe",
|
123 |
+
"bpe_vocab": None,
|
124 |
},
|
125 |
+
# French-only (BPE)
|
126 |
"shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
|
127 |
"tokens": "tokens.txt",
|
128 |
"encoder_fp32": "encoder-epoch-29-avg-9-with-averaged-model.onnx",
|
|
|
131 |
"decoder_int8": "decoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
|
132 |
"joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx",
|
133 |
"joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx",
|
134 |
+
"modeling_unit":"bpe",
|
135 |
+
"bpe_vocab": None,
|
136 |
},
|
137 |
+
# Chinese-only small (CJK chars)
|
138 |
"csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
|
139 |
"tokens": "tokens.txt",
|
140 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
|
|
143 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
144 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
145 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
146 |
+
"modeling_unit":"cjkchar",
|
147 |
+
"bpe_vocab": None,
|
148 |
},
|
149 |
+
# English-only 20M (BPE)
|
150 |
"csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
|
151 |
"tokens": "tokens.txt",
|
152 |
"encoder_fp32": "encoder-epoch-99-avg-1.onnx",
|
|
|
155 |
"decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
|
156 |
"joiner_fp32": "joiner-epoch-99-avg-1.onnx",
|
157 |
"joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
|
158 |
+
"modeling_unit":"bpe",
|
159 |
+
"bpe_vocab": None,
|
160 |
},
|
161 |
}
|
162 |
|
|
|
167 |
# Create an online recognizer for a given model and precision
|
168 |
# model_id: full HF repo ID
|
169 |
# precision: "int8" or "fp32"
|
170 |
+
def create_recognizer(
|
171 |
+
model_id: str,
|
172 |
+
precision: str,
|
173 |
+
hotwords: List[str] = None,
|
174 |
+
hotwords_score: float = 0.0,
|
175 |
+
):
|
176 |
if model_id not in STREAMING_ZIPFORMER_MODELS:
|
177 |
raise ValueError(f"Model '{model_id}' is not registered.")
|
178 |
entry = STREAMING_ZIPFORMER_MODELS[model_id]
|
|
|
187 |
decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR))
|
188 |
joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR))
|
189 |
|
190 |
+
# --- Download BPE vocab if this model has one ---
|
191 |
+
modeling_unit = entry.get("modeling_unit")
|
192 |
+
bpe_rel_path = entry.get("bpe_vocab")
|
193 |
+
bpe_vocab_path = None
|
194 |
+
if bpe_rel_path:
|
195 |
+
try:
|
196 |
+
bpe_vocab_path = hf_hub_download(
|
197 |
+
repo_id=model_id,
|
198 |
+
filename=bpe_rel_path,
|
199 |
+
cache_dir=str(CACHE_DIR),
|
200 |
+
)
|
201 |
+
print(f"[DEBUG asr_worker] Downloaded bpe_vocab: {bpe_vocab_path}")
|
202 |
+
except Exception as e:
|
203 |
+
print(f"[WARNING asr_worker] Could not download bpe_vocab '{bpe_rel_path}': {e}")
|
204 |
+
bpe_vocab_path = None
|
205 |
+
|
206 |
+
# --- Decide whether to use beam search with hotword biasing ---
|
207 |
+
use_beam = (hotwords and hotwords_score > 0.0) and bpe_vocab_path
|
208 |
+
if use_beam:
|
209 |
+
# Write hotword list to a temp file (one entry per line)
|
210 |
+
tf = tempfile.NamedTemporaryFile(
|
211 |
+
mode="w", delete=False, suffix=".txt", dir=str(CACHE_DIR)
|
212 |
+
)
|
213 |
+
for w in hotwords:
|
214 |
+
tf.write(f"{w}\\n")
|
215 |
+
tf.flush()
|
216 |
+
tf.close()
|
217 |
+
hotwords_file_path = tf.name
|
218 |
+
print(f"[DEBUG asr_worker] Written {len(hotwords)} hotwords to {hotwords_file_path} with score {hotwords_score}")
|
219 |
+
|
220 |
+
# Create beam-search recognizer with hotword biasing
|
221 |
+
return sherpa_onnx.OnlineRecognizer.from_transducer(
|
222 |
+
tokens=tokens_path,
|
223 |
+
encoder=encoder_path,
|
224 |
+
decoder=decoder_path,
|
225 |
+
joiner=joiner_path,
|
226 |
+
provider="cpu",
|
227 |
+
num_threads=1,
|
228 |
+
sample_rate=16000,
|
229 |
+
feature_dim=80,
|
230 |
+
decoding_method="modified_beam_search",
|
231 |
+
hotwords_file=hotwords_file_path,
|
232 |
+
hotwords_score=hotwords_score,
|
233 |
+
modeling_unit=modeling_unit,
|
234 |
+
bpe_vocab=bpe_vocab_path,
|
235 |
+
)
|
236 |
+
|
237 |
+
# --- Fallback to original greedy-search (no hotword biasing) ---
|
238 |
return sherpa_onnx.OnlineRecognizer.from_transducer(
|
239 |
tokens=tokens_path,
|
240 |
encoder=encoder_path,
|
|
|
244 |
num_threads=1,
|
245 |
sample_rate=16000,
|
246 |
feature_dim=80,
|
247 |
+
decoding_method="greedy_search",
|
248 |
)
|
249 |
|
250 |
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
|
app/main.py
CHANGED
@@ -42,15 +42,27 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
42 |
print(f"[ERROR main] JSON parse failed: {e}")
|
43 |
continue
|
44 |
if config_msg.get("type") == "config":
|
|
|
45 |
orig_sr = int(config_msg["sampleRate"])
|
46 |
print(f"[INFO main] Set original sample rate to {orig_sr}")
|
47 |
|
48 |
-
#
|
49 |
-
model_id
|
50 |
precision = config_msg.get("precision")
|
51 |
print(f"[INFO main] Selected model: {model_id}, precision: {precision}")
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
stream = recognizer.create_stream()
|
55 |
print("[INFO main] WebSocket connection accepted; created a streaming context.")
|
56 |
continue
|
|
|
42 |
print(f"[ERROR main] JSON parse failed: {e}")
|
43 |
continue
|
44 |
if config_msg.get("type") == "config":
|
45 |
+
# 1) sample rate
|
46 |
orig_sr = int(config_msg["sampleRate"])
|
47 |
print(f"[INFO main] Set original sample rate to {orig_sr}")
|
48 |
|
49 |
+
# 2) model & precision
|
50 |
+
model_id = config_msg.get("model")
|
51 |
precision = config_msg.get("precision")
|
52 |
print(f"[INFO main] Selected model: {model_id}, precision: {precision}")
|
53 |
|
54 |
+
# 3) hotwords & boost score
|
55 |
+
hotwords = config_msg.get("hotwords", [])
|
56 |
+
hotwords_score = float(config_msg.get("hotwordsScore", 0.0))
|
57 |
+
print(f"[INFO main] Hotwords: {hotwords}, score: {hotwords_score}")
|
58 |
+
|
59 |
+
# 4) create recognizer with biasing
|
60 |
+
recognizer = create_recognizer(
|
61 |
+
model_id,
|
62 |
+
precision,
|
63 |
+
hotwords=hotwords,
|
64 |
+
hotwords_score=hotwords_score
|
65 |
+
)
|
66 |
stream = recognizer.create_stream()
|
67 |
print("[INFO main] WebSocket connection accepted; created a streaming context.")
|
68 |
continue
|
app/static/index.html
CHANGED
@@ -21,6 +21,13 @@
|
|
21 |
margin-bottom: 1rem;
|
22 |
font-size: 2rem;
|
23 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
#vol {
|
26 |
width: 300px;
|
@@ -86,11 +93,19 @@
|
|
86 |
font-weight: bold;
|
87 |
color: #2f3640;
|
88 |
}
|
89 |
-
.controls select
|
|
|
|
|
90 |
padding: 0.3rem;
|
91 |
border-radius: 5px;
|
92 |
border: 1px solid #dcdde1;
|
93 |
background: white;
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
}
|
95 |
|
96 |
.model-info {
|
@@ -139,6 +154,22 @@
|
|
139 |
</select>
|
140 |
</div>
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
<div class="model-info" id="modelInfo">
|
143 |
Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
|
144 |
</div>
|
@@ -178,11 +209,24 @@
|
|
178 |
const transcript = document.getElementById("transcript");
|
179 |
const modelSelect = document.getElementById("modelSelect");
|
180 |
const precisionSelect = document.getElementById("precisionSelect");
|
|
|
|
|
|
|
|
|
181 |
const modelLangs = document.getElementById("modelLangs");
|
182 |
const modelSize = document.getElementById("modelSize");
|
183 |
const micNameElem = document.getElementById("micName");
|
184 |
const sampleRateElem = document.getElementById("sampleRate");
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
function updateModelInfo() {
|
187 |
const meta = MODEL_METADATA[modelSelect.value];
|
188 |
if (Array.isArray(meta.language)) {
|
@@ -199,7 +243,9 @@
|
|
199 |
type: "config",
|
200 |
sampleRate: orig_sample_rate,
|
201 |
model: modelSelect.value,
|
202 |
-
precision: precisionSelect.value
|
|
|
|
|
203 |
}));
|
204 |
} else {
|
205 |
console.warn("WebSocket not open yet. Cannot send config.");
|
@@ -218,7 +264,7 @@
|
|
218 |
updateModelInfo();
|
219 |
|
220 |
// Now that we know the sample rate, open the WS
|
221 |
-
ws = new WebSocket(`
|
222 |
ws.onopen = () => sendConfig();
|
223 |
ws.onerror = err => console.error("WebSocket error:", err);
|
224 |
ws.onclose = () => console.log("WebSocket closed");
|
@@ -238,8 +284,20 @@
|
|
238 |
modelSelect.addEventListener("change", () => {
|
239 |
updateModelInfo();
|
240 |
sendConfig();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
});
|
242 |
-
precisionSelect.addEventListener("change", sendConfig);
|
243 |
|
244 |
const source = context.createMediaStreamSource(stream);
|
245 |
const processor = context.createScriptProcessor(4096, 1, 1);
|
|
|
21 |
margin-bottom: 1rem;
|
22 |
font-size: 2rem;
|
23 |
}
|
24 |
+
|
25 |
+
/* Added for Hotword Bias status */
|
26 |
+
#hotwordStatus {
|
27 |
+
font-size: 0.9rem;
|
28 |
+
color: #e1b12c;
|
29 |
+
font-weight: bold;
|
30 |
+
}
|
31 |
|
32 |
#vol {
|
33 |
width: 300px;
|
|
|
93 |
font-weight: bold;
|
94 |
color: #2f3640;
|
95 |
}
|
96 |
+
.controls select,
|
97 |
+
.controls input[type="number"],
|
98 |
+
.controls textarea {
|
99 |
padding: 0.3rem;
|
100 |
border-radius: 5px;
|
101 |
border: 1px solid #dcdde1;
|
102 |
background: white;
|
103 |
+
font-size: 1rem;
|
104 |
+
}
|
105 |
+
.controls textarea {
|
106 |
+
flex: 1;
|
107 |
+
resize: vertical;
|
108 |
+
min-height: 4rem;
|
109 |
}
|
110 |
|
111 |
.model-info {
|
|
|
154 |
</select>
|
155 |
</div>
|
156 |
|
157 |
+
<div class="controls">
|
158 |
+
<!-- Hotwords List Input -->
|
159 |
+
<label for="hotwordsList">Hotwords:</label>
|
160 |
+
<textarea id="hotwordsList" placeholder="Enter one hotword per line"></textarea>
|
161 |
+
<!-- Global Boost Score Input -->
|
162 |
+
<label for="boostScore">Boost Score:</label>
|
163 |
+
<input type="number" id="boostScore" min="0" max="10" step="0.1" value="2.0" />
|
164 |
+
<!-- NEW: button to submit hotword changes -->
|
165 |
+
<button id="applyHotwords">Apply Hotwords</button>
|
166 |
+
</div>
|
167 |
+
|
168 |
+
<!-- NEW: indicator showing whether biasing is ON or OFF -->
|
169 |
+
<div class="controls">
|
170 |
+
<span id="hotwordStatus">Hotword Bias: Off</span>
|
171 |
+
</div>
|
172 |
+
|
173 |
<div class="model-info" id="modelInfo">
|
174 |
Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
|
175 |
</div>
|
|
|
209 |
const transcript = document.getElementById("transcript");
|
210 |
const modelSelect = document.getElementById("modelSelect");
|
211 |
const precisionSelect = document.getElementById("precisionSelect");
|
212 |
+
const hotwordsList = document.getElementById("hotwordsList");
|
213 |
+
const boostScore = document.getElementById("boostScore");
|
214 |
+
const applyBtn = document.getElementById("applyHotwords");
|
215 |
+
const hotwordStatus = document.getElementById("hotwordStatus");
|
216 |
const modelLangs = document.getElementById("modelLangs");
|
217 |
const modelSize = document.getElementById("modelSize");
|
218 |
const micNameElem = document.getElementById("micName");
|
219 |
const sampleRateElem = document.getElementById("sampleRate");
|
220 |
|
221 |
+
// Helper to toggle the status text
|
222 |
+
function updateHotwordStatus() {
|
223 |
+
const enabled = hotwordsList.value.split(/\r?\n/).filter(Boolean).length > 0
|
224 |
+
&& parseFloat(boostScore.value) > 0;
|
225 |
+
hotwordStatus.textContent = enabled
|
226 |
+
? "Hotword Bias: On"
|
227 |
+
: "Hotword Bias: Off";
|
228 |
+
}
|
229 |
+
|
230 |
function updateModelInfo() {
|
231 |
const meta = MODEL_METADATA[modelSelect.value];
|
232 |
if (Array.isArray(meta.language)) {
|
|
|
243 |
type: "config",
|
244 |
sampleRate: orig_sample_rate,
|
245 |
model: modelSelect.value,
|
246 |
+
precision: precisionSelect.value,
|
247 |
+
hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
|
248 |
+
hotwordsScore: parseFloat(boostScore.value)
|
249 |
}));
|
250 |
} else {
|
251 |
console.warn("WebSocket not open yet. Cannot send config.");
|
|
|
264 |
updateModelInfo();
|
265 |
|
266 |
// Now that we know the sample rate, open the WS
|
267 |
+
ws = new WebSocket(`ws://${location.host}/ws`);
|
268 |
ws.onopen = () => sendConfig();
|
269 |
ws.onerror = err => console.error("WebSocket error:", err);
|
270 |
ws.onclose = () => console.log("WebSocket closed");
|
|
|
284 |
modelSelect.addEventListener("change", () => {
|
285 |
updateModelInfo();
|
286 |
sendConfig();
|
287 |
+
updateHotwordStatus();
|
288 |
+
});
|
289 |
+
precisionSelect.addEventListener("change", () => {
|
290 |
+
sendConfig();
|
291 |
+
updateHotwordStatus();
|
292 |
+
});
|
293 |
+
// hotwordsList.addEventListener("input", sendConfig);
|
294 |
+
// boostScore.addEventListener("input", sendConfig);
|
295 |
+
|
296 |
+
// Re-send config & update indicator when the button is clicked
|
297 |
+
applyBtn.addEventListener("click", () => {
|
298 |
+
sendConfig();
|
299 |
+
updateHotwordStatus();
|
300 |
});
|
|
|
301 |
|
302 |
const source = context.createMediaStreamSource(stream);
|
303 |
const processor = context.createScriptProcessor(4096, 1, 1);
|