Luigi committed on
Commit
548b7ed
·
1 Parent(s): 30a3b5d

add endpoint detection

Browse files
Files changed (3) hide show
  1. app/asr_worker.py +13 -0
  2. app/main.py +29 -3
  3. app/static/index.html +124 -23
app/asr_worker.py CHANGED
@@ -185,6 +185,9 @@ def create_recognizer(
185
  precision: str,
186
  hotwords: List[str] = None,
187
  hotwords_score: float = 0.0,
 
 
 
188
  ):
189
  if model_id not in STREAMING_ZIPFORMER_MODELS:
190
  raise ValueError(f"Model '{model_id}' is not registered.")
@@ -262,6 +265,11 @@ def create_recognizer(
262
  hotwords_score=hotwords_score,
263
  modeling_unit=modeling_unit,
264
  bpe_vocab=bpe_vocab_path,
 
 
 
 
 
265
  )
266
 
267
  # ——— Fallback to original greedy-search (no hotword biasing) ———
@@ -275,6 +283,11 @@ def create_recognizer(
275
  sample_rate=16000,
276
  feature_dim=80,
277
  decoding_method="greedy_search",
 
 
 
 
 
278
  )
279
 
280
  def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
 
185
  precision: str,
186
  hotwords: List[str] = None,
187
  hotwords_score: float = 0.0,
188
+ ep_rule1: float = 2.4,
189
+ ep_rule2: float = 1.2,
190
+ ep_rule3: int = 300,
191
  ):
192
  if model_id not in STREAMING_ZIPFORMER_MODELS:
193
  raise ValueError(f"Model '{model_id}' is not registered.")
 
265
  hotwords_score=hotwords_score,
266
  modeling_unit=modeling_unit,
267
  bpe_vocab=bpe_vocab_path,
268
+ # endpoint detection parameters
269
+ enable_endpoint_detection=True,
270
+ rule1_min_trailing_silence=ep_rule1,
271
+ rule2_min_trailing_silence=ep_rule2,
272
+ rule3_min_utterance_length=ep_rule3,
273
  )
274
 
275
  # ——— Fallback to original greedy-search (no hotword biasing) ———
 
283
  sample_rate=16000,
284
  feature_dim=80,
285
  decoding_method="greedy_search",
286
+ # endpoint detection parameters
287
+ enable_endpoint_detection=True,
288
+ rule1_min_trailing_silence=ep_rule1,
289
+ rule2_min_trailing_silence=ep_rule2,
290
+ rule3_min_utterance_length=ep_rule3,
291
  )
292
 
293
  def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
app/main.py CHANGED
@@ -56,12 +56,21 @@ async def websocket_endpoint(websocket: WebSocket):
56
  hotwords_score = float(config_msg.get("hotwordsScore", 0.0))
57
  print(f"[INFO main] Hotwords: {hotwords}, score: {hotwords_score}")
58
 
59
- # 4) create recognizer with biasing
 
 
 
 
 
 
60
  recognizer = create_recognizer(
61
  model_id,
62
  precision,
63
  hotwords=hotwords,
64
- hotwords_score=hotwords_score
 
 
 
65
  )
66
  stream = recognizer.create_stream()
67
  print("[INFO main] WebSocket connection accepted; created a streaming context.")
@@ -78,8 +87,20 @@ async def websocket_endpoint(websocket: WebSocket):
78
  result, rms = stream_audio(raw_audio, stream, recognizer, orig_sr)
79
  vol_to_send = min(rms, 1.0)
80
  # print(f"[INFO main] Sending → partial='{result[:30]}…', volume={vol_to_send:.4f}")
 
81
  await websocket.send_json({"partial": result, "volume": vol_to_send})
82
- continue
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  elif kind == "websocket.receive_bytes":
85
  raw_audio = data["bytes"]
@@ -95,6 +116,11 @@ async def websocket_endpoint(websocket: WebSocket):
95
  "partial": result,
96
  "volume": min(rms, 1.0)
97
  })
 
 
 
 
 
98
  except Exception as e:
99
  print(f"[ERROR main] Unexpected exception: {e}")
100
  try:
 
56
  hotwords_score = float(config_msg.get("hotwordsScore", 0.0))
57
  print(f"[INFO main] Hotwords: {hotwords}, score: {hotwords_score}")
58
 
59
+ # 4) Parse endpoint detection rules
60
+ ep1 = float(config_msg.get("epRule1", 2.4))
61
+ ep2 = float(config_msg.get("epRule2", 1.2))
62
+ ep3 = int( config_msg.get("epRule3", 300))
63
+ print(f"[INFO main] Endpoint rules: rule1={ep1}s, rule2={ep2}s, rule3={ep3}ms")
64
+
65
+ # 5) create recognizer with endpoint settings & biasing
66
  recognizer = create_recognizer(
67
  model_id,
68
  precision,
69
  hotwords=hotwords,
70
+ hotwords_score=hotwords_score,
71
+ ep_rule1=ep1,
72
+ ep_rule2=ep2,
73
+ ep_rule3=ep3
74
  )
75
  stream = recognizer.create_stream()
76
  print("[INFO main] WebSocket connection accepted; created a streaming context.")
 
87
  result, rms = stream_audio(raw_audio, stream, recognizer, orig_sr)
88
  vol_to_send = min(rms, 1.0)
89
  # print(f"[INFO main] Sending → partial='{result[:30]}…', volume={vol_to_send:.4f}")
90
+ # 1) send the interim
91
  await websocket.send_json({"partial": result, "volume": vol_to_send})
92
+
93
+ # 2) DEBUG: log when endpoint is seen
94
+ is_ep = recognizer.is_endpoint(stream)
95
+ # print(f"[DEBUG main] is_endpoint={is_ep}")
96
+
97
+ # 3) if endpoint, emit final and reset
98
+ if is_ep:
99
+ if result.strip():
100
+ print(f"[DEBUG main] Emitting final: {result!r}")
101
+ await websocket.send_json({"final": result})
102
+ recognizer.reset(stream)
103
+ continue
104
 
105
  elif kind == "websocket.receive_bytes":
106
  raw_audio = data["bytes"]
 
116
  "partial": result,
117
  "volume": min(rms, 1.0)
118
  })
119
+ # -- INSERT: emit final on endpoint detection --
120
+ if recognizer.is_endpoint(stream):
121
+ if result.strip():
122
+ await websocket.send_json({"final": result})
123
+ recognizer.reset(stream)
124
  except Exception as e:
125
  print(f"[ERROR main] Unexpected exception: {e}")
126
  try:
app/static/index.html CHANGED
@@ -4,6 +4,24 @@
4
  <meta charset="UTF-8" />
5
  <title>🎤 Real-Time ASR Demo</title>
6
  <style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  body {
8
  font-family: "Segoe UI", sans-serif;
9
  background-color: #f5f6fa;
@@ -157,6 +175,10 @@
157
  </select>
158
  </div>
159
 
 
 
 
 
160
  <div class="controls">
161
  <!-- Hotwords List Input -->
162
  <label for="hotwordsList">Hotwords:</label>
@@ -173,8 +195,19 @@
173
  <span id="hotwordStatus">Hotword Bias: Off</span>
174
  </div>
175
 
176
- <div class="model-info" id="modelInfo">
177
- Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
 
 
 
 
 
 
 
 
 
 
 
178
  </div>
179
 
180
  <div class="mic-info">
@@ -242,21 +275,6 @@
242
  modelSize.textContent = meta.size;
243
  }
244
 
245
- function sendConfig() {
246
- if (ws && ws.readyState === WebSocket.OPEN) {
247
- ws.send(JSON.stringify({
248
- type: "config",
249
- sampleRate: orig_sample_rate,
250
- model: modelSelect.value,
251
- precision: precisionSelect.value,
252
- hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
253
- hotwordsScore: parseFloat(boostScore.value)
254
- }));
255
- } else {
256
- console.warn("WebSocket not open yet. Cannot send config.");
257
- }
258
- }
259
-
260
  navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
261
  const context = new AudioContext();
262
  orig_sample_rate = context.sampleRate;
@@ -270,20 +288,38 @@
270
 
271
  // Now that we know the sample rate, open the WS
272
  ws = new WebSocket(`wss://${location.host}/ws`);
273
- ws.onopen = () => sendConfig();
274
  ws.onerror = err => console.error("WebSocket error:", err);
275
  ws.onclose = () => console.log("WebSocket closed");
 
 
276
  ws.onmessage = e => {
277
  const msg = JSON.parse(e.data);
 
 
278
  if (msg.volume !== undefined) {
279
  vol.value = Math.min(msg.volume, 1.0);
280
  }
281
- if (msg.partial) {
282
- // replace content…
283
- transcript.textContent = msg.partial;
284
- // …then scroll to bottom
285
- transcript.scrollTop = transcript.scrollHeight;
 
 
286
  }
 
 
 
 
 
 
 
 
 
 
 
 
287
  };
288
 
289
  modelSelect.addEventListener("change", () => {
@@ -315,6 +351,71 @@
315
  ws.send(new Float32Array(input).buffer);
316
  };
317
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  </script>
319
  </body>
320
  </html>
 
4
  <meta charset="UTF-8" />
5
  <title>🎤 Real-Time ASR Demo</title>
6
  <style>
7
+ /* Ensure the transcript preserves spacing and scrolls */
8
+ #transcript {
9
+ white-space: pre-wrap;
10
+ overflow-y: auto;
11
+ }
12
+
13
+ /* Finalized utterances in green, with a bit of right-margin */
14
+ #transcript .final {
15
+ color: green;
16
+ display: inline;
17
+ margin-right: 0.5em;
18
+ }
19
+
20
+ /* Interim utterance in red */
21
+ #transcript .interim {
22
+ color: red;
23
+ display: inline;
24
+ }
25
  body {
26
  font-family: "Segoe UI", sans-serif;
27
  background-color: #f5f6fa;
 
175
  </select>
176
  </div>
177
 
178
+ <div class="model-info" id="modelInfo">
179
+ Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
180
+ </div>
181
+
182
  <div class="controls">
183
  <!-- Hotwords List Input -->
184
  <label for="hotwordsList">Hotwords:</label>
 
195
  <span id="hotwordStatus">Hotword Bias: Off</span>
196
  </div>
197
 
198
+ <div class="controls">
199
+ <!-- ⬇️ INSERT START: Endpoint Detection Controls ⬇️ -->
200
+ <label for="epRule1">Rule 1 (silence ≥ s):</label>
201
+ <input type="number" id="epRule1" step="0.1" value="2.4">
202
+
203
+ <label for="epRule2">Rule 2 (silence ≥ s):</label>
204
+ <input type="number" id="epRule2" step="0.1" value="1.2">
205
+
206
+ <label for="epRule3">Rule 3 (min utterance ms):</label>
207
+ <input type="number" id="epRule3" step="50" value="300">
208
+
209
+ <button id="applyEndpointConfig">Apply Endpoint Config</button>
210
+ <!-- ⬆️ INSERT END: Endpoint Detection Controls ⬆️ -->
211
  </div>
212
 
213
  <div class="mic-info">
 
275
  modelSize.textContent = meta.size;
276
  }
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
279
  const context = new AudioContext();
280
  orig_sample_rate = context.sampleRate;
 
288
 
289
  // Now that we know the sample rate, open the WS
290
  ws = new WebSocket(`wss://${location.host}/ws`);
291
+ ws.onopen = () => sendConfig();
292
  ws.onerror = err => console.error("WebSocket error:", err);
293
  ws.onclose = () => console.log("WebSocket closed");
294
+
295
+ // Unified handler for partial + final messages
296
  ws.onmessage = e => {
297
  const msg = JSON.parse(e.data);
298
+
299
+ // 1) update volume bar
300
  if (msg.volume !== undefined) {
301
  vol.value = Math.min(msg.volume, 1.0);
302
  }
303
+
304
+ // 2) distinguish “final” vs “partial”
305
+ if (msg.final !== undefined) {
306
+ finalUtterances.push(msg.final.trim());
307
+ currentInterim = "";
308
+ } else if (msg.partial !== undefined) {
309
+ currentInterim = msg.partial;
310
  }
311
+
312
+ // 3) rebuild the full, colored transcript
313
+ transcript.innerHTML =
314
+ finalUtterances
315
+ .map(u => `<span class="final">${u}</span>`)
316
+ .join("") /* margin in CSS handles spacing */
317
+ + (currentInterim
318
+ ? ` <span class="interim">${currentInterim}</span>`
319
+ : "");
320
+
321
+ // 4) auto-scroll to newest text
322
+ transcript.scrollTop = transcript.scrollHeight;
323
  };
324
 
325
  modelSelect.addEventListener("change", () => {
 
351
  ws.send(new Float32Array(input).buffer);
352
  };
353
  });
354
+
355
+ // 2) Declare state for final/interim rendering
356
+ const finalUtterances = [];
357
+ let currentInterim = "";
358
+
359
+ // 3) Grab your new inputs + button
360
+ const epRule1Input = document.getElementById("epRule1");
361
+ const epRule2Input = document.getElementById("epRule2");
362
+ const epRule3Input = document.getElementById("epRule3");
363
+ const applyEndpointBtn = document.getElementById("applyEndpointConfig");
364
+
365
+ // 4) Extend sendConfig() to include epRule1/2/3
366
+ function sendConfig() {
367
+ if (ws && ws.readyState === WebSocket.OPEN) {
368
+ ws.send(JSON.stringify({
369
+ type: "config",
370
+ sampleRate: orig_sample_rate,
371
+ model: modelSelect.value,
372
+ precision: precisionSelect.value,
373
+ hotwords: hotwordsList.value.split(/\r?\n/).filter(Boolean),
374
+ hotwordsScore: parseFloat(boostScore.value),
375
+
376
+ // ← new endpoint fields
377
+ epRule1: parseFloat(epRule1Input.value),
378
+ epRule2: parseFloat(epRule2Input.value),
379
+ epRule3: parseInt( epRule3Input.value, 10),
380
+ }));
381
+ }
382
+ }
383
+
384
+ // 5) Re-send config when user clicks “Apply Endpoint Config”
385
+ applyEndpointBtn.addEventListener("click", () => {
386
+ sendConfig();
387
+ });
388
+
389
+ // 6) Replace your existing ws.onmessage handler with this:
390
+ ws.onmessage = e => {
391
+ const msg = JSON.parse(e.data);
392
+
393
+ if (msg.volume !== undefined) {
394
+ vol.value = Math.min(msg.volume, 1.0);
395
+ }
396
+
397
+ if (msg.final !== undefined) {
398
+ // endpoint fired → lock in the final utterance
399
+ finalUtterances.push(msg.final.trim());
400
+ currentInterim = "";
401
+ } else if (msg.partial !== undefined) {
402
+ // update the rolling interim
403
+ currentInterim = msg.partial;
404
+ }
405
+
406
+ // rebuild the full transcript: green finals + red interim
407
+ transcript.innerHTML =
408
+ finalUtterances
409
+ .map(u => `<span class="final">${u}</span>`)
410
+ .join("") // no explicit space here, margin handles it
411
+ + (currentInterim
412
+ ? `<span class="interim">${currentInterim}</span>`
413
+ : "");
414
+
415
+ // always scroll to bottom
416
+ transcript.scrollTop = transcript.scrollHeight;
417
+ };
418
+
419
  </script>
420
  </body>
421
  </html>