Luigi committed on
Commit
454a10d
·
1 Parent(s): 0586d3c

add more models

Browse files
Files changed (3) hide show
  1. app/asr_worker.py +133 -23
  2. app/main.py +25 -13
  3. app/static/index.html +50 -1
app/asr_worker.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
 
2
  import numpy as np
3
  import sherpa_onnx
4
  import scipy.signal
5
  from opencc import OpenCC
6
  from huggingface_hub import hf_hub_download
7
- from pathlib import Path
8
 
9
  # Ensure Hugging Face cache is in a user-writable directory
10
  CACHE_DIR = Path(__file__).parent / "hf_cache"
@@ -12,35 +12,145 @@ os.makedirs(CACHE_DIR, exist_ok=True)
12
 
13
  converter = OpenCC('s2t')
14
 
15
- # ASR model repository and file paths
16
- REPO_ID = "pfluo/k2fsa-zipformer-chinese-english-mixed"
17
- FILES = {
18
- "tokens": "data/lang_char_bpe/tokens.txt",
19
- "encoder": "exp/encoder-epoch-99-avg-1.int8.onnx",
20
- "decoder": "exp/decoder-epoch-99-avg-1.onnx",
21
- "joiner": "exp/joiner-epoch-99-avg-1.int8.onnx",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  }
23
 
24
- # Download and cache each file via HuggingFace Hub
25
- LOCAL_PATHS = {}
26
- for key, path in FILES.items():
27
- LOCAL_PATHS[key] = hf_hub_download(
28
- repo_id=REPO_ID,
29
- filename=path,
30
- cache_dir=str(CACHE_DIR),
31
- )
32
-
33
  # Audio resampling utility
34
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
35
  return scipy.signal.resample_poly(audio, target_sr, orig_sr)
36
 
37
- # Build the online recognizer with int8 weights
38
- def create_recognizer():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  return sherpa_onnx.OnlineRecognizer.from_transducer(
40
- tokens=LOCAL_PATHS['tokens'],
41
- encoder=LOCAL_PATHS['encoder'],
42
- decoder=LOCAL_PATHS['decoder'],
43
- joiner=LOCAL_PATHS['joiner'],
44
  provider="cpu",
45
  num_threads=1,
46
  sample_rate=16000,
 
1
  import os
2
+ from pathlib import Path
3
  import numpy as np
4
  import sherpa_onnx
5
  import scipy.signal
6
  from opencc import OpenCC
7
  from huggingface_hub import hf_hub_download
 
8
 
9
  # Ensure Hugging Face cache is in a user-writable directory
10
  CACHE_DIR = Path(__file__).parent / "hf_cache"
 
12
 
13
  converter = OpenCC('s2t')
14
 
15
+ # Streaming Zipformer model registry: paths relative to repo root
16
+ STREAMING_ZIPFORMER_MODELS = {
17
+ "pfluo/k2fsa-zipformer-chinese-english-mixed": {
18
+ "tokens": "data/lang_char_bpe/tokens.txt",
19
+ "encoder_fp32": "exp/encoder-epoch-99-avg-1.onnx",
20
+ "encoder_int8": "exp/encoder-epoch-99-avg-1.int8.onnx",
21
+ "decoder_fp32": "exp/decoder-epoch-99-avg-1.onnx",
22
+ "decoder_int8": None,
23
+ "joiner_fp32": "exp/joiner-epoch-99-avg-1.onnx",
24
+ "joiner_int8": "exp/joiner-epoch-99-avg-1.int8.onnx",
25
+ },
26
+ "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": {
27
+ "tokens": "tokens.txt",
28
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
29
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
30
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
31
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
32
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
33
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
34
+ },
35
+ "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": {
36
+ "tokens": "tokens.txt",
37
+ "encoder_fp32": "encoder-epoch-20-avg-1-chunk-16-left-128.onnx",
38
+ "encoder_int8": "encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
39
+ "decoder_fp32": "decoder-epoch-20-avg-1-chunk-16-left-128.onnx",
40
+ "decoder_int8": "decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
41
+ "joiner_fp32": "joiner-epoch-20-avg-1-chunk-16-left-128.onnx",
42
+ "joiner_int8": "joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
43
+ },
44
+ "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": {
45
+ "tokens": "data/lang_char/tokens.txt",
46
+ "encoder_fp32": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx",
47
+ "encoder_int8": "exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
48
+ "decoder_fp32": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx",
49
+ "decoder_int8": "exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
50
+ "joiner_fp32": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
51
+ "joiner_int8": "exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
52
+ },
53
+ "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": {
54
+ "tokens": "tokens.txt",
55
+ "encoder_fp32": "encoder-epoch-99-avg-1-chunk-16-left-128.onnx",
56
+ "encoder_int8": "encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
57
+ "decoder_fp32": "decoder-epoch-99-avg-1-chunk-16-left-128.onnx",
58
+ "decoder_int8": None,
59
+ "joiner_fp32": "joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
60
+ "joiner_int8": "joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
61
+ },
62
+ "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": {
63
+ "tokens": "tokens.txt",
64
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
65
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
66
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
67
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
68
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
69
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
70
+ },
71
+ "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": {
72
+ "tokens": "tokens.txt",
73
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
74
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
75
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
76
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
77
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
78
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
79
+ },
80
+ "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": {
81
+ "tokens": "tokens.txt",
82
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
83
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
84
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
85
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
86
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
87
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
88
+ },
89
+ "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": {
90
+ "tokens": "tokens.txt",
91
+ "encoder_fp32": "encoder-epoch-29-avg-9-with-averaged-model.onnx",
92
+ "encoder_int8": "encoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
93
+ "decoder_fp32": "decoder-epoch-29-avg-9-with-averaged-model.onnx",
94
+ "decoder_int8": "decoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
95
+ "joiner_fp32": "joiner-epoch-29-avg-9-with-averaged-model.onnx",
96
+ "joiner_int8": "joiner-epoch-29-avg-9-with-averaged-model.int8.onnx",
97
+ },
98
+ "sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16": {
99
+ "tokens": "tokens.txt",
100
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
101
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
102
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
103
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
104
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
105
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
106
+ },
107
+ "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": {
108
+ "tokens": "tokens.txt",
109
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
110
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
111
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
112
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
113
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
114
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
115
+ },
116
+ "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": {
117
+ "tokens": "tokens.txt",
118
+ "encoder_fp32": "encoder-epoch-99-avg-1.onnx",
119
+ "encoder_int8": "encoder-epoch-99-avg-1.int8.onnx",
120
+ "decoder_fp32": "decoder-epoch-99-avg-1.onnx",
121
+ "decoder_int8": "decoder-epoch-99-avg-1.int8.onnx",
122
+ "joiner_fp32": "joiner-epoch-99-avg-1.onnx",
123
+ "joiner_int8": "joiner-epoch-99-avg-1.int8.onnx",
124
+ },
125
  }
126
 
 
 
 
 
 
 
 
 
 
127
  # Audio resampling utility
128
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
129
  return scipy.signal.resample_poly(audio, target_sr, orig_sr)
130
 
131
+ # Create an online recognizer for a given model and precision
132
+ # model_id: full HF repo ID
133
+ # precision: "int8" or "fp32"
134
+ def create_recognizer(model_id: str, precision: str):
135
+ if model_id not in STREAMING_ZIPFORMER_MODELS:
136
+ raise ValueError(f"Model '{model_id}' is not registered.")
137
+ entry = STREAMING_ZIPFORMER_MODELS[model_id]
138
+
139
+ tokens_file = entry['tokens']
140
+ encoder_file = entry['encoder_int8'] if precision == 'int8' else entry['encoder_fp32']
141
+ decoder_file = entry['decoder_fp32']
142
+ joiner_file = entry['joiner_int8'] if precision == 'int8' else entry['joiner_fp32']
143
+
144
+ tokens_path = hf_hub_download(repo_id=model_id, filename=tokens_file, cache_dir=str(CACHE_DIR))
145
+ encoder_path = hf_hub_download(repo_id=model_id, filename=encoder_file, cache_dir=str(CACHE_DIR))
146
+ decoder_path = hf_hub_download(repo_id=model_id, filename=decoder_file, cache_dir=str(CACHE_DIR))
147
+ joiner_path = hf_hub_download(repo_id=model_id, filename=joiner_file, cache_dir=str(CACHE_DIR))
148
+
149
  return sherpa_onnx.OnlineRecognizer.from_transducer(
150
+ tokens=tokens_path,
151
+ encoder=encoder_path,
152
+ decoder=decoder_path,
153
+ joiner=joiner_path,
154
  provider="cpu",
155
  num_threads=1,
156
  sample_rate=16000,
app/main.py CHANGED
@@ -8,8 +8,6 @@ app = FastAPI()
8
 
9
  app.mount("/static", StaticFiles(directory="app/static"), name="static")
10
 
11
- recognizer = create_recognizer()
12
-
13
  @app.get("/")
14
  async def root():
15
  with open("app/static/index.html") as f:
@@ -22,24 +20,24 @@ async def websocket_endpoint(websocket: WebSocket):
22
  await websocket.accept()
23
  print("[DEBUG main] ▶ WebSocket.accept() returned → client is connected!")
24
 
25
- # Immediately create a new stream per client
26
- stream = recognizer.create_stream()
27
  orig_sr = 48000 # default fallback
28
- print("[INFO main] WebSocket connection accepted; created a streaming context.")
29
 
30
  try:
31
  while True:
32
  data = await websocket.receive()
33
  kind = data.get("type")
34
 
35
- # Debug: log any event we don't handle explicitly
36
  if kind not in ("websocket.receive", "websocket.receive_bytes"):
37
  print(f"[DEBUG main] Received control/frame: {data}")
38
- # If client cleanly disconnected, finalize and break
39
  if kind == "websocket.disconnect":
40
- print(f"[INFO main] Client disconnected (code={data.get('code')}). Flushing final transcript...")
41
- final = finalize_stream(stream, recognizer)
42
- await websocket.send_json({"final": final})
 
 
43
  break
44
  continue
45
 
@@ -54,7 +52,20 @@ async def websocket_endpoint(websocket: WebSocket):
54
  if config_msg.get("type") == "config":
55
  orig_sr = int(config_msg["sampleRate"])
56
  print(f"[INFO main] Set original sample rate to {orig_sr}")
57
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # If it’s a text payload but with bytes (some FastAPI versions put audio under 'text'!)
60
  if kind == "websocket.receive" and "bytes" in data:
@@ -82,7 +93,8 @@ async def websocket_endpoint(websocket: WebSocket):
82
  })
83
  except Exception as e:
84
  print(f"[ERROR main] Unexpected exception: {e}")
85
- final = finalize_stream(stream, recognizer)
86
- await websocket.send_json({"final": final})
 
87
  await websocket.close()
88
  print("[INFO main] WebSocket closed, cleanup complete.")
 
8
 
9
  app.mount("/static", StaticFiles(directory="app/static"), name="static")
10
 
 
 
11
  @app.get("/")
12
  async def root():
13
  with open("app/static/index.html") as f:
 
20
  await websocket.accept()
21
  print("[DEBUG main] ▶ WebSocket.accept() returned → client is connected!")
22
 
23
+ recognizer = None
24
+ stream = None
25
  orig_sr = 48000 # default fallback
 
26
 
27
  try:
28
  while True:
29
  data = await websocket.receive()
30
  kind = data.get("type")
31
 
32
+ # Handle control frames
33
  if kind not in ("websocket.receive", "websocket.receive_bytes"):
34
  print(f"[DEBUG main] Received control/frame: {data}")
 
35
  if kind == "websocket.disconnect":
36
+ # On client disconnect, flush final transcript if possible
37
+ if stream and recognizer:
38
+ print(f"[INFO main] Client disconnected (code={data.get('code')}). Flushing final transcript...")
39
+ final = finalize_stream(stream, recognizer)
40
+ await websocket.send_json({"final": final})
41
  break
42
  continue
43
 
 
52
  if config_msg.get("type") == "config":
53
  orig_sr = int(config_msg["sampleRate"])
54
  print(f"[INFO main] Set original sample rate to {orig_sr}")
55
+
56
+ # New: dynamic model & precision
57
+ model_id = config_msg.get("model")
58
+ precision = config_msg.get("precision")
59
+ print(f"[INFO main] Selected model: {model_id}, precision: {precision}")
60
+
61
+ recognizer = create_recognizer(model_id, precision)
62
+ stream = recognizer.create_stream()
63
+ print("[INFO main] WebSocket connection accepted; created a streaming context.")
64
+ continue
65
+
66
+ # Don't process audio until after config
67
+ if recognizer is None or stream is None:
68
+ continue
69
 
70
  # If it’s a text payload but with bytes (some FastAPI versions put audio under 'text'!)
71
  if kind == "websocket.receive" and "bytes" in data:
 
93
  })
94
  except Exception as e:
95
  print(f"[ERROR main] Unexpected exception: {e}")
96
+ if stream and recognizer:
97
+ final = finalize_stream(stream, recognizer)
98
+ await websocket.send_json({"final": final})
99
  await websocket.close()
100
  print("[INFO main] WebSocket closed, cleanup complete.")
app/static/index.html CHANGED
@@ -70,10 +70,52 @@
70
  font-size: 1.4rem;
71
  color: #e84118;
72
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  </style>
74
  </head>
75
  <body>
76
  <h1>🎤 Speak into your microphone</h1>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  <progress id="vol" max="1" value="0"></progress>
78
 
79
  <div class="output">
@@ -89,6 +131,8 @@
89
  const vol = document.getElementById("vol");
90
  const partial = document.getElementById("partial");
91
  const finalText = document.getElementById("final");
 
 
92
 
93
  navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
94
  const context = new AudioContext();
@@ -96,7 +140,12 @@
96
 
97
  ws.onopen = () => {
98
  console.log("[DEBUG client] WebSocket.onopen fired!");
99
- ws.send(JSON.stringify({ type: "config", sampleRate: orig_sample_rate }));
 
 
 
 
 
100
  };
101
  ws.onerror = err => {
102
  console.error("[DEBUG client] WebSocket.onerror:", err);
 
70
  font-size: 1.4rem;
71
  color: #e84118;
72
  }
73
+
74
+ .controls {
75
+ display: flex;
76
+ gap: 1rem;
77
+ margin-bottom: 1rem;
78
+ align-items: center;
79
+ }
80
+ .controls label {
81
+ font-weight: bold;
82
+ color: #2f3640;
83
+ }
84
+ .controls select {
85
+ padding: 0.3rem;
86
+ border-radius: 5px;
87
+ border: 1px solid #dcdde1;
88
+ background: white;
89
+ }
90
  </style>
91
  </head>
92
  <body>
93
  <h1>🎤 Speak into your microphone</h1>
94
+
95
+ <div class="controls">
96
+ <label for="modelSelect">Model:</label>
97
+ <select id="modelSelect">
98
+ <option value="pfluo/k2fsa-zipformer-chinese-english-mixed">k2fsa-chinese-english-mixed</option>
99
+ <option value="k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16">sherpa-onnx-zipformer-korean</option>
100
+ <option value="k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12">zipformer-multi-zh-hans</option>
101
+ <option value="pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615">icefall-zipformer-wenetspeech</option>
102
+ <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26">zipformer-en-06-26</option>
103
+ <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21">zipformer-en-06-21</option>
104
+ <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21">zipformer-en-02-21</option>
105
+ <option value="csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20">zipformer-zh-en</option>
106
+ <option value="shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14">zipformer-fr</option>
107
+ <option value="sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16">zipformer-small-zh-en</option>
108
+ <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23">zipformer-zh-14M</option>
109
+ <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17">zipformer-en-20M</option>
110
+ </select>
111
+
112
+ <label for="precisionSelect">Precision:</label>
113
+ <select id="precisionSelect">
114
+ <option value="fp32">FP32</option>
115
+ <option value="int8">INT8</option>
116
+ </select>
117
+ </div>
118
+
119
  <progress id="vol" max="1" value="0"></progress>
120
 
121
  <div class="output">
 
131
  const vol = document.getElementById("vol");
132
  const partial = document.getElementById("partial");
133
  const finalText = document.getElementById("final");
134
+ const modelSelect = document.getElementById("modelSelect");
135
+ const precisionSelect = document.getElementById("precisionSelect");
136
 
137
  navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
138
  const context = new AudioContext();
 
140
 
141
  ws.onopen = () => {
142
  console.log("[DEBUG client] WebSocket.onopen fired!");
143
+ ws.send(JSON.stringify({
144
+ type: "config",
145
+ sampleRate: orig_sample_rate,
146
+ model: modelSelect.value,
147
+ precision: precisionSelect.value
148
+ }));
149
  };
150
  ws.onerror = err => {
151
  console.error("[DEBUG client] WebSocket.onerror:", err);