Spaces:
Running
Running
add endpoint detection
Browse files- app/asr_worker.py +13 -0
- app/main.py +29 -3
- app/static/index.html +124 -23
app/asr_worker.py
CHANGED
@@ -185,6 +185,9 @@ def create_recognizer(
|
|
185 |
precision: str,
|
186 |
hotwords: List[str] = None,
|
187 |
hotwords_score: float = 0.0,
|
|
|
|
|
|
|
188 |
):
|
189 |
if model_id not in STREAMING_ZIPFORMER_MODELS:
|
190 |
raise ValueError(f"Model '{model_id}' is not registered.")
|
@@ -262,6 +265,11 @@ def create_recognizer(
|
|
262 |
hotwords_score=hotwords_score,
|
263 |
modeling_unit=modeling_unit,
|
264 |
bpe_vocab=bpe_vocab_path,
|
|
|
|
|
|
|
|
|
|
|
265 |
)
|
266 |
|
267 |
# ——— Fallback to original greedy-search (no hotword biasing) ———
|
@@ -275,6 +283,11 @@ def create_recognizer(
|
|
275 |
sample_rate=16000,
|
276 |
feature_dim=80,
|
277 |
decoding_method="greedy_search",
|
|
|
|
|
|
|
|
|
|
|
278 |
)
|
279 |
|
280 |
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
|
|
|
185 |
precision: str,
|
186 |
hotwords: List[str] = None,
|
187 |
hotwords_score: float = 0.0,
|
188 |
+
ep_rule1: float = 2.4,
|
189 |
+
ep_rule2: float = 1.2,
|
190 |
+
ep_rule3: int = 300,
|
191 |
):
|
192 |
if model_id not in STREAMING_ZIPFORMER_MODELS:
|
193 |
raise ValueError(f"Model '{model_id}' is not registered.")
|
|
|
265 |
hotwords_score=hotwords_score,
|
266 |
modeling_unit=modeling_unit,
|
267 |
bpe_vocab=bpe_vocab_path,
|
268 |
+
# endpoint detection parameters
|
269 |
+
enable_endpoint_detection=True,
|
270 |
+
rule1_min_trailing_silence=ep_rule1,
|
271 |
+
rule2_min_trailing_silence=ep_rule2,
|
272 |
+
rule3_min_utterance_length=ep_rule3,
|
273 |
)
|
274 |
|
275 |
# ——— Fallback to original greedy-search (no hotword biasing) ———
|
|
|
283 |
sample_rate=16000,
|
284 |
feature_dim=80,
|
285 |
decoding_method="greedy_search",
|
286 |
+
# endpoint detection parameters
|
287 |
+
enable_endpoint_detection=True,
|
288 |
+
rule1_min_trailing_silence=ep_rule1,
|
289 |
+
rule2_min_trailing_silence=ep_rule2,
|
290 |
+
rule3_min_utterance_length=ep_rule3,
|
291 |
)
|
292 |
|
293 |
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
|
app/main.py
CHANGED
@@ -56,12 +56,21 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
56 |
hotwords_score = float(config_msg.get("hotwordsScore", 0.0))
|
57 |
print(f"[INFO main] Hotwords: {hotwords}, score: {hotwords_score}")
|
58 |
|
59 |
-
# 4)
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
recognizer = create_recognizer(
|
61 |
model_id,
|
62 |
precision,
|
63 |
hotwords=hotwords,
|
64 |
-
hotwords_score=hotwords_score
|
|
|
|
|
|
|
65 |
)
|
66 |
stream = recognizer.create_stream()
|
67 |
print("[INFO main] WebSocket connection accepted; created a streaming context.")
|
@@ -78,8 +87,20 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
78 |
result, rms = stream_audio(raw_audio, stream, recognizer, orig_sr)
|
79 |
vol_to_send = min(rms, 1.0)
|
80 |
# print(f"[INFO main] Sending → partial='{result[:30]}…', volume={vol_to_send:.4f}")
|
|
|
81 |
await websocket.send_json({"partial": result, "volume": vol_to_send})
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
elif kind == "websocket.receive_bytes":
|
85 |
raw_audio = data["bytes"]
|
@@ -95,6 +116,11 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
95 |
"partial": result,
|
96 |
"volume": min(rms, 1.0)
|
97 |
})
|
|
|
|
|
|
|
|
|
|
|
98 |
except Exception as e:
|
99 |
print(f"[ERROR main] Unexpected exception: {e}")
|
100 |
try:
|
|
|
56 |
hotwords_score = float(config_msg.get("hotwordsScore", 0.0))
|
57 |
print(f"[INFO main] Hotwords: {hotwords}, score: {hotwords_score}")
|
58 |
|
59 |
+
# 4) Parse endpoint detection rules
|
60 |
+
ep1 = float(config_msg.get("epRule1", 2.4))
|
61 |
+
ep2 = float(config_msg.get("epRule2", 1.2))
|
62 |
+
ep3 = int( config_msg.get("epRule3", 300))
|
63 |
+
print(f"[INFO main] Endpoint rules: rule1={ep1}s, rule2={ep2}s, rule3={ep3}ms")
|
64 |
+
|
65 |
+
# 5) create recognizer with endpoint settings & biasing
|
66 |
recognizer = create_recognizer(
|
67 |
model_id,
|
68 |
precision,
|
69 |
hotwords=hotwords,
|
70 |
+
hotwords_score=hotwords_score,
|
71 |
+
ep_rule1=ep1,
|
72 |
+
ep_rule2=ep2,
|
73 |
+
ep_rule3=ep3
|
74 |
)
|
75 |
stream = recognizer.create_stream()
|
76 |
print("[INFO main] WebSocket connection accepted; created a streaming context.")
|
|
|
87 |
result, rms = stream_audio(raw_audio, stream, recognizer, orig_sr)
|
88 |
vol_to_send = min(rms, 1.0)
|
89 |
# print(f"[INFO main] Sending → partial='{result[:30]}…', volume={vol_to_send:.4f}")
|
90 |
+
# 1) send the interim
|
91 |
await websocket.send_json({"partial": result, "volume": vol_to_send})
|
92 |
+
|
93 |
+
# 2) DEBUG: log when endpoint is seen
|
94 |
+
is_ep = recognizer.is_endpoint(stream)
|
95 |
+
# print(f"[DEBUG main] is_endpoint={is_ep}")
|
96 |
+
|
97 |
+
# 3) if endpoint, emit final and reset
|
98 |
+
if is_ep:
|
99 |
+
if result.strip():
|
100 |
+
print(f"[DEBUG main] Emitting final: {result!r}")
|
101 |
+
await websocket.send_json({"final": result})
|
102 |
+
recognizer.reset(stream)
|
103 |
+
continue
|
104 |
|
105 |
elif kind == "websocket.receive_bytes":
|
106 |
raw_audio = data["bytes"]
|
|
|
116 |
"partial": result,
|
117 |
"volume": min(rms, 1.0)
|
118 |
})
|
119 |
+
# -- INSERT: emit final on endpoint detection --
|
120 |
+
if recognizer.is_endpoint(stream):
|
121 |
+
if result.strip():
|
122 |
+
await websocket.send_json({"final": result})
|
123 |
+
recognizer.reset(stream)
|
124 |
except Exception as e:
|
125 |
print(f"[ERROR main] Unexpected exception: {e}")
|
126 |
try:
|
app/static/index.html
CHANGED
@@ -4,6 +4,24 @@
|
|
4 |
<meta charset="UTF-8" />
|
5 |
<title>🎤 Real-Time ASR Demo</title>
|
6 |
<style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
body {
|
8 |
font-family: "Segoe UI", sans-serif;
|
9 |
background-color: #f5f6fa;
|
@@ -157,6 +175,10 @@
|
|
157 |
</select>
|
158 |
</div>
|
159 |
|
|
|
|
|
|
|
|
|
160 |
<div class="controls">
|
161 |
<!-- Hotwords List Input -->
|
162 |
<label for="hotwordsList">Hotwords:</label>
|
@@ -173,8 +195,19 @@
|
|
173 |
<span id="hotwordStatus">Hotword Bias: Off</span>
|
174 |
</div>
|
175 |
|
176 |
-
<div class="
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
</div>
|
179 |
|
180 |
<div class="mic-info">
|
@@ -242,21 +275,6 @@
|
|
242 |
modelSize.textContent = meta.size;
|
243 |
}
|
244 |
|
245 |
-
function sendConfig() {
|
246 |
-
if (ws && ws.readyState === WebSocket.OPEN) {
|
247 |
-
ws.send(JSON.stringify({
|
248 |
-
type: "config",
|
249 |
-
sampleRate: orig_sample_rate,
|
250 |
-
model: modelSelect.value,
|
251 |
-
precision: precisionSelect.value,
|
252 |
-
hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
|
253 |
-
hotwordsScore: parseFloat(boostScore.value)
|
254 |
-
}));
|
255 |
-
} else {
|
256 |
-
console.warn("WebSocket not open yet. Cannot send config.");
|
257 |
-
}
|
258 |
-
}
|
259 |
-
|
260 |
navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
|
261 |
const context = new AudioContext();
|
262 |
orig_sample_rate = context.sampleRate;
|
@@ -270,20 +288,38 @@
|
|
270 |
|
271 |
// Now that we know the sample rate, open the WS
|
272 |
ws = new WebSocket(`wss://${location.host}/ws`);
|
273 |
-
ws.onopen
|
274 |
ws.onerror = err => console.error("WebSocket error:", err);
|
275 |
ws.onclose = () => console.log("WebSocket closed");
|
|
|
|
|
276 |
ws.onmessage = e => {
|
277 |
const msg = JSON.parse(e.data);
|
|
|
|
|
278 |
if (msg.volume !== undefined) {
|
279 |
vol.value = Math.min(msg.volume, 1.0);
|
280 |
}
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
|
|
|
|
286 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
};
|
288 |
|
289 |
modelSelect.addEventListener("change", () => {
|
@@ -315,6 +351,71 @@
|
|
315 |
ws.send(new Float32Array(input).buffer);
|
316 |
};
|
317 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
</script>
|
319 |
</body>
|
320 |
</html>
|
|
|
4 |
<meta charset="UTF-8" />
|
5 |
<title>🎤 Real-Time ASR Demo</title>
|
6 |
<style>
|
7 |
+
/* Ensure the transcript preserves spacing and scrolls */
|
8 |
+
#transcript {
|
9 |
+
white-space: pre-wrap;
|
10 |
+
overflow-y: auto;
|
11 |
+
}
|
12 |
+
|
13 |
+
/* Finalized utterances in green, with a bit of right-margin */
|
14 |
+
#transcript .final {
|
15 |
+
color: green;
|
16 |
+
display: inline;
|
17 |
+
margin-right: 0.5em;
|
18 |
+
}
|
19 |
+
|
20 |
+
/* Interim utterance in red */
|
21 |
+
#transcript .interim {
|
22 |
+
color: red;
|
23 |
+
display: inline;
|
24 |
+
}
|
25 |
body {
|
26 |
font-family: "Segoe UI", sans-serif;
|
27 |
background-color: #f5f6fa;
|
|
|
175 |
</select>
|
176 |
</div>
|
177 |
|
178 |
+
<div class="model-info" id="modelInfo">
|
179 |
+
Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
|
180 |
+
</div>
|
181 |
+
|
182 |
<div class="controls">
|
183 |
<!-- Hotwords List Input -->
|
184 |
<label for="hotwordsList">Hotwords:</label>
|
|
|
195 |
<span id="hotwordStatus">Hotword Bias: Off</span>
|
196 |
</div>
|
197 |
|
198 |
+
<div class="controls">
|
199 |
+
<!-- ⬇️ INSERT START: Endpoint Detection Controls ⬇️ -->
|
200 |
+
<label for="epRule1">Rule 1 (silence ≥ s):</label>
|
201 |
+
<input type="number" id="epRule1" step="0.1" value="2.4">
|
202 |
+
|
203 |
+
<label for="epRule2">Rule 2 (silence ≥ s):</label>
|
204 |
+
<input type="number" id="epRule2" step="0.1" value="1.2">
|
205 |
+
|
206 |
+
<label for="epRule3">Rule 3 (min utterance ms):</label>
|
207 |
+
<input type="number" id="epRule3" step="50" value="300">
|
208 |
+
|
209 |
+
<button id="applyEndpointConfig">Apply Endpoint Config</button>
|
210 |
+
<!-- ⬆️ INSERT END: Endpoint Detection Controls ⬆️ -->
|
211 |
</div>
|
212 |
|
213 |
<div class="mic-info">
|
|
|
275 |
modelSize.textContent = meta.size;
|
276 |
}
|
277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
|
279 |
const context = new AudioContext();
|
280 |
orig_sample_rate = context.sampleRate;
|
|
|
288 |
|
289 |
// Now that we know the sample rate, open the WS
|
290 |
ws = new WebSocket(`wss://${location.host}/ws`);
|
291 |
+
ws.onopen = () => sendConfig();
|
292 |
ws.onerror = err => console.error("WebSocket error:", err);
|
293 |
ws.onclose = () => console.log("WebSocket closed");
|
294 |
+
|
295 |
+
// Unified handler for partial + final messages
|
296 |
ws.onmessage = e => {
|
297 |
const msg = JSON.parse(e.data);
|
298 |
+
|
299 |
+
// 1) update volume bar
|
300 |
if (msg.volume !== undefined) {
|
301 |
vol.value = Math.min(msg.volume, 1.0);
|
302 |
}
|
303 |
+
|
304 |
+
// 2) distinguish “final” vs “partial”
|
305 |
+
if (msg.final !== undefined) {
|
306 |
+
finalUtterances.push(msg.final.trim());
|
307 |
+
currentInterim = "";
|
308 |
+
} else if (msg.partial !== undefined) {
|
309 |
+
currentInterim = msg.partial;
|
310 |
}
|
311 |
+
|
312 |
+
// 3) rebuild the full, colored transcript
|
313 |
+
transcript.innerHTML =
|
314 |
+
finalUtterances
|
315 |
+
.map(u => `<span class="final">${u}</span>`)
|
316 |
+
.join("") /* margin in CSS handles spacing */
|
317 |
+
+ (currentInterim
|
318 |
+
? ` <span class="interim">${currentInterim}</span>`
|
319 |
+
: "");
|
320 |
+
|
321 |
+
// 4) auto-scroll to newest text
|
322 |
+
transcript.scrollTop = transcript.scrollHeight;
|
323 |
};
|
324 |
|
325 |
modelSelect.addEventListener("change", () => {
|
|
|
351 |
ws.send(new Float32Array(input).buffer);
|
352 |
};
|
353 |
});
|
354 |
+
|
355 |
+
// 2) Declare state for final/interim rendering
|
356 |
+
const finalUtterances = [];
|
357 |
+
let currentInterim = "";
|
358 |
+
|
359 |
+
// 3) Grab your new inputs + button
|
360 |
+
const epRule1Input = document.getElementById("epRule1");
|
361 |
+
const epRule2Input = document.getElementById("epRule2");
|
362 |
+
const epRule3Input = document.getElementById("epRule3");
|
363 |
+
const applyEndpointBtn = document.getElementById("applyEndpointConfig");
|
364 |
+
|
365 |
+
// 4) Extend sendConfig() to include epRule1/2/3
|
366 |
+
function sendConfig() {
|
367 |
+
if (ws && ws.readyState === WebSocket.OPEN) {
|
368 |
+
ws.send(JSON.stringify({
|
369 |
+
type: "config",
|
370 |
+
sampleRate: orig_sample_rate,
|
371 |
+
model: modelSelect.value,
|
372 |
+
precision: precisionSelect.value,
|
373 |
+
hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
|
374 |
+
hotwordsScore: parseFloat(boostScore.value),
|
375 |
+
|
376 |
+
// ← new endpoint fields
|
377 |
+
epRule1: parseFloat(epRule1Input.value),
|
378 |
+
epRule2: parseFloat(epRule2Input.value),
|
379 |
+
epRule3: parseInt( epRule3Input.value, 10),
|
380 |
+
}));
|
381 |
+
}
|
382 |
+
}
|
383 |
+
|
384 |
+
// 5) Re-send config when user clicks “Apply Endpoint Config”
|
385 |
+
applyEndpointBtn.addEventListener("click", () => {
|
386 |
+
sendConfig();
|
387 |
+
});
|
388 |
+
|
389 |
+
// 6) Replace your existing ws.onmessage handler with this:
|
390 |
+
ws.onmessage = e => {
|
391 |
+
const msg = JSON.parse(e.data);
|
392 |
+
|
393 |
+
if (msg.volume !== undefined) {
|
394 |
+
vol.value = Math.min(msg.volume, 1.0);
|
395 |
+
}
|
396 |
+
|
397 |
+
if (msg.final !== undefined) {
|
398 |
+
// endpoint fired → lock in the final utterance
|
399 |
+
finalUtterances.push(msg.final.trim());
|
400 |
+
currentInterim = "";
|
401 |
+
} else if (msg.partial !== undefined) {
|
402 |
+
// update the rolling interim
|
403 |
+
currentInterim = msg.partial;
|
404 |
+
}
|
405 |
+
|
406 |
+
// rebuild the full transcript: green finals + red interim
|
407 |
+
transcript.innerHTML =
|
408 |
+
finalUtterances
|
409 |
+
.map(u => `<span class="final">${u}</span>`)
|
410 |
+
.join("") // no explicit space here, margin handles it
|
411 |
+
+ (currentInterim
|
412 |
+
? `<span class="interim">${currentInterim}</span>`
|
413 |
+
: "");
|
414 |
+
|
415 |
+
// always scroll to bottom
|
416 |
+
transcript.scrollTop = transcript.scrollHeight;
|
417 |
+
};
|
418 |
+
|
419 |
</script>
|
420 |
</body>
|
421 |
</html>
|