Luigi committed
Commit 2318eae · 1 Parent(s): f8643a1

commit app, Dockerfile and requirements.txt

Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # Hugging Face Spaces: FastAPI + ASR (CPU-only)
+ FROM python:3.10-slim
+
+ # Install system deps
+ RUN apt-get update && apt-get install -y \
+     ffmpeg libsndfile1 git curl build-essential && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /code
+
+ # Copy code
+ COPY ./app /code/app
+ COPY ./models /code/models
+ COPY requirements.txt ./
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose default port for HF Spaces
+ EXPOSE 7860
+
+ # Entrypoint for FastAPI app
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
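Because the image exposes port 7860 and launches uvicorn directly, it can be smoke-tested locally with a standard `docker build` followed by `docker run -p 7860:7860 <image>` before pushing to the Space.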
app/__pycache__/asr_worker.cpython-312.pyc ADDED
Binary file (2.33 kB)
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (2.16 kB)
 
app/asr_worker.py ADDED
@@ -0,0 +1,37 @@
+ import numpy as np
+ import sherpa_onnx
+ from pathlib import Path
+
+ MODEL_DIR = Path("models/zipformer_bilingual")
+
+ def create_recognizer():
+     return sherpa_onnx.OnlineRecognizer.from_transducer(
+         tokens=str(MODEL_DIR / "tokens.txt"),
+         encoder=str(MODEL_DIR / "encoder-epoch-99-avg-1.onnx"),
+         decoder=str(MODEL_DIR / "decoder-epoch-99-avg-1.onnx"),
+         joiner=str(MODEL_DIR / "joiner-epoch-99-avg-1.onnx"),
+         provider="cpu",
+         num_threads=1,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method="greedy_search",
+     )
+
+ def stream_audio(raw_pcm_bytes, stream, recognizer):
+     # Interpret the incoming bytes as 16 kHz float32 PCM from the client.
+     audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
+     rms = float(np.sqrt(np.mean(audio ** 2)))
+     stream.accept_waveform(16000, audio)
+     if recognizer.is_ready(stream):
+         recognizer.decode_streams([stream])
+     result = recognizer.get_result(stream)
+     return result, rms
+
+ def finalize_stream(stream, recognizer):
+     # Pad with ~0.66 s of silence so the model flushes its last frames.
+     tail = np.zeros(int(0.66 * 16000), dtype=np.float32)
+     stream.accept_waveform(16000, tail)
+     stream.input_finished()
+     while recognizer.is_ready(stream):
+         recognizer.decode_streams([stream])
+     return recognizer.get_result(stream)
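For a quick local check of these helpers outside the WebSocket path, one can replay a short recording through them. A minimal sketch, not part of the commit, assuming a 16 kHz mono 16-bit `test.wav` (hypothetical path) and the model files under `models/zipformer_bilingual`:

```python
# Offline smoke test for asr_worker (assumed paths; not part of the commit).
import wave

import numpy as np

from app.asr_worker import create_recognizer, finalize_stream, stream_audio

recognizer = create_recognizer()
stream = recognizer.create_stream()

# Load a 16 kHz mono 16-bit WAV and convert int16 PCM to float32 in [-1, 1],
# the same format the browser client sends over the WebSocket.
with wave.open("test.wav", "rb") as wf:
    assert wf.getframerate() == 16000 and wf.getnchannels() == 1
    pcm16 = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
audio = (pcm16.astype(np.float32) / 32768.0).tobytes()

# Feed ~256 ms chunks (4096 float32 samples), mirroring the browser buffers.
chunk_bytes = 4096 * 4
for start in range(0, len(audio), chunk_bytes):
    partial, rms = stream_audio(audio[start:start + chunk_bytes], stream, recognizer)
    print(f"partial={partial!r} rms={rms:.3f}")

print("final:", finalize_stream(stream, recognizer))
```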
app/main.py ADDED
@@ -0,0 +1,38 @@
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.responses import HTMLResponse
+ from app.asr_worker import create_recognizer, stream_audio, finalize_stream
+
+ app = FastAPI()
+
+ app.mount("/static", StaticFiles(directory="app/static"), name="static")
+
+ recognizer = create_recognizer()
+
+ @app.get("/")
+ async def root():
+     with open("app/static/index.html") as f:
+         return HTMLResponse(f.read())
+
+
+ @app.websocket("/ws")
+ async def websocket_endpoint(websocket: WebSocket):
+     await websocket.accept()
+     stream = recognizer.create_stream()
+
+     try:
+         while True:
+             data = await websocket.receive_bytes()
+             result, rms = stream_audio(data, stream, recognizer)
+             await websocket.send_json({
+                 "partial": result,
+                 "volume": min(rms * 5.0, 1.0)
+             })
+     except WebSocketDisconnect:
+         # The client closed the socket, so the final result cannot be sent.
+         finalize_stream(stream, recognizer)
+     except Exception:
+         # On any other error, flush the stream and report the final text.
+         final = finalize_stream(stream, recognizer)
+         await websocket.send_json({"final": final})
+         await websocket.close()
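The endpoint can also be exercised without a browser. A minimal client sketch, assuming the server is running locally on port 7860 and the third-party `websockets` package is installed; it streams one second of silence, so the transcript stays empty but the protocol round-trip is visible:

```python
# Hypothetical test client for /ws (requires: pip install websockets).
import asyncio
import json

import numpy as np
import websockets


async def main():
    async with websockets.connect("ws://localhost:7860/ws") as ws:
        # One second of 16 kHz float32 silence, sent in 4096-sample chunks
        # to mimic the browser's ScriptProcessor buffers.
        audio = np.zeros(16000, dtype=np.float32)
        for start in range(0, len(audio), 4096):
            await ws.send(audio[start:start + 4096].tobytes())
            print(json.loads(await ws.recv()))
    # Leaving the context closes the socket; the server then finalizes
    # the stream via its WebSocketDisconnect handler.


asyncio.run(main())
```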
app/static/index.html ADDED
@@ -0,0 +1,44 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>FastAPI Real-Time ASR</title>
+ </head>
+ <body>
+     <h1>🎤 Speak into your mic...</h1>
+     <div>Volume: <progress id="vol" max="1" value="0"></progress></div>
+     <p>Partial: <span id="partial"></span></p>
+     <p>Final: <b id="final"></b></p>
+     <script>
+         // Use wss:// when the page is served over HTTPS (as on HF Spaces).
+         const ws = new WebSocket((location.protocol === "https:" ? "wss:" : "ws:") + "//" + location.host + "/ws");
+         const vol = document.getElementById("vol");
+         const partial = document.getElementById("partial");
+         const finalText = document.getElementById("final");
+
+         navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
+             // Request a 16 kHz context (supported in current browsers) so the
+             // samples match the rate the server passes to accept_waveform.
+             const context = new AudioContext({ sampleRate: 16000 });
+             const source = context.createMediaStreamSource(stream);
+             const processor = context.createScriptProcessor(4096, 1, 1);
+             source.connect(processor);
+             processor.connect(context.destination);
+
+             processor.onaudioprocess = e => {
+                 const input = e.inputBuffer.getChannelData(0);
+                 ws.send(new Float32Array(input).buffer);
+             };
+         });
+
+         ws.onmessage = e => {
+             const msg = JSON.parse(e.data);
+             if (msg.partial) {
+                 partial.textContent = msg.partial;
+                 vol.value = msg.volume;
+             } else if (msg.final) {
+                 finalText.textContent = msg.final;
+             }
+         };
+     </script>
+ </body>
+ </html>
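A note on the client: ScriptProcessorNode is deprecated in favor of AudioWorklet, but it keeps this single-file demo dependency-free. Pinning the AudioContext to 16 kHz matters because the server hands the raw floats straight to accept_waveform(16000, ...) with no resampling step.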
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a7c3c10c8ec533e73405e503c3004146a36153ae701934132aecbe689e9e666
+ size 44
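Note: requirements.txt is tracked with Git LFS here, so the diff shows only the 44-byte pointer rather than the pinned dependencies themselves.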