Luigi committed
Commit 2318eae · 1 Parent(s): f8643a1

commit app, Dockerfile and requirements.txt

Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # Hugging Face Spaces: FastAPI + ASR (CPU-only)
+ FROM python:3.10-slim
+
+ # Install system deps
+ RUN apt-get update && apt-get install -y \
+     ffmpeg libsndfile1 git curl build-essential && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /code
+
+ # Copy code
+ COPY ./app /code/app
+ COPY ./models /code/models
+ COPY requirements.txt ./
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose default port for HF Spaces
+ EXPOSE 7860
+
+ # Entrypoint for FastAPI app
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
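Because the image exposes port 7860 and launches uvicorn directly, it can be smoke-tested locally with a standard `docker build` followed by `docker run -p 7860:7860 <image>` before pushing to the Space.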
app/__pycache__/asr_worker.cpython-312.pyc ADDED
Binary file (2.33 kB)
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (2.16 kB)
 
app/asr_worker.py ADDED
@@ -0,0 +1,37 @@
+ import numpy as np
+ import sherpa_onnx
+ from pathlib import Path
+
+ MODEL_DIR = Path("models/zipformer_bilingual")
+
+ def create_recognizer():
+     return sherpa_onnx.OnlineRecognizer.from_transducer(
+         tokens=str(MODEL_DIR / "tokens.txt"),
+         encoder=str(MODEL_DIR / "encoder-epoch-99-avg-1.onnx"),
+         decoder=str(MODEL_DIR / "decoder-epoch-99-avg-1.onnx"),
+         joiner=str(MODEL_DIR / "joiner-epoch-99-avg-1.onnx"),
+         provider="cpu",
+         num_threads=1,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method="greedy_search",
+     )
+
+ def stream_audio(raw_pcm_bytes, stream, recognizer):
+     # Interpret the incoming bytes as 16 kHz float32 PCM from the client.
+     audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
+     rms = float(np.sqrt(np.mean(audio ** 2)))
+     stream.accept_waveform(16000, audio)
+     if recognizer.is_ready(stream):
+         recognizer.decode_streams([stream])
+     result = recognizer.get_result(stream)
+     return result, rms
+
+ def finalize_stream(stream, recognizer):
+     # Pad with ~0.66 s of silence so the model flushes its last frames.
+     tail = np.zeros(int(0.66 * 16000), dtype=np.float32)
+     stream.accept_waveform(16000, tail)
+     stream.input_finished()
+     while recognizer.is_ready(stream):
+         recognizer.decode_streams([stream])
+     return recognizer.get_result(stream)
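For a quick local check of these helpers outside the WebSocket path, one can replay a short recording through them. A minimal sketch, not part of the commit, assuming a 16 kHz mono 16-bit `test.wav` (hypothetical path) and the model files under `models/zipformer_bilingual`:

```python
# Offline smoke test for asr_worker (assumed paths; not part of the commit).
import wave

import numpy as np

from app.asr_worker import create_recognizer, finalize_stream, stream_audio

recognizer = create_recognizer()
stream = recognizer.create_stream()

# Load a 16 kHz mono 16-bit WAV and convert int16 PCM to float32 in [-1, 1],
# the same format the browser client sends over the WebSocket.
with wave.open("test.wav", "rb") as wf:
    assert wf.getframerate() == 16000 and wf.getnchannels() == 1
    pcm16 = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
audio = (pcm16.astype(np.float32) / 32768.0).tobytes()

# Feed ~256 ms chunks (4096 float32 samples), mirroring the browser buffers.
chunk_bytes = 4096 * 4
for start in range(0, len(audio), chunk_bytes):
    partial, rms = stream_audio(audio[start:start + chunk_bytes], stream, recognizer)
    print(f"partial={partial!r} rms={rms:.3f}")

print("final:", finalize_stream(stream, recognizer))
```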
app/main.py ADDED
@@ -0,0 +1,38 @@
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.responses import HTMLResponse
+ from app.asr_worker import create_recognizer, stream_audio, finalize_stream
+
+ app = FastAPI()
+
+ app.mount("/static", StaticFiles(directory="app/static"), name="static")
+
+ recognizer = create_recognizer()
+
+ @app.get("/")
+ async def root():
+     with open("app/static/index.html") as f:
+         return HTMLResponse(f.read())
+
+
+ @app.websocket("/ws")
+ async def websocket_endpoint(websocket: WebSocket):
+     await websocket.accept()
+     stream = recognizer.create_stream()
+
+     try:
+         while True:
+             data = await websocket.receive_bytes()
+             result, rms = stream_audio(data, stream, recognizer)
+             await websocket.send_json({
+                 "partial": result,
+                 "volume": min(rms * 5.0, 1.0)
+             })
+     except WebSocketDisconnect:
+         # The client closed the socket, so the final result cannot be sent.
+         finalize_stream(stream, recognizer)
+     except Exception:
+         # On any other error, flush the stream and report the final text.
+         final = finalize_stream(stream, recognizer)
+         await websocket.send_json({"final": final})
+         await websocket.close()
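The endpoint can also be exercised without a browser. A minimal client sketch, assuming the server is running locally on port 7860 and the third-party `websockets` package is installed; it streams one second of silence, so the transcript stays empty but the protocol round-trip is visible:

```python
# Hypothetical test client for /ws (requires: pip install websockets).
import asyncio
import json

import numpy as np
import websockets


async def main():
    async with websockets.connect("ws://localhost:7860/ws") as ws:
        # One second of 16 kHz float32 silence, sent in 4096-sample chunks
        # to mimic the browser's ScriptProcessor buffers.
        audio = np.zeros(16000, dtype=np.float32)
        for start in range(0, len(audio), 4096):
            await ws.send(audio[start:start + 4096].tobytes())
            print(json.loads(await ws.recv()))
    # Leaving the context closes the socket; the server then finalizes
    # the stream via its WebSocketDisconnect handler.


asyncio.run(main())
```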
app/static/index.html ADDED
@@ -0,0 +1,44 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>FastAPI Real-Time ASR</title>
+ </head>
+ <body>
+     <h1>🎤 Speak into your mic...</h1>
+     <div>Volume: <progress id="vol" max="1" value="0"></progress></div>
+     <p>Partial: <span id="partial"></span></p>
+     <p>Final: <b id="final"></b></p>
+     <script>
+         // Use wss:// when the page is served over HTTPS (as on HF Spaces).
+         const ws = new WebSocket((location.protocol === "https:" ? "wss:" : "ws:") + "//" + location.host + "/ws");
+         const vol = document.getElementById("vol");
+         const partial = document.getElementById("partial");
+         const finalText = document.getElementById("final");
+
+         navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
+             // Request a 16 kHz context (supported in current browsers) so the
+             // samples match the rate the server passes to accept_waveform.
+             const context = new AudioContext({ sampleRate: 16000 });
+             const source = context.createMediaStreamSource(stream);
+             const processor = context.createScriptProcessor(4096, 1, 1);
+             source.connect(processor);
+             processor.connect(context.destination);
+
+             processor.onaudioprocess = e => {
+                 const input = e.inputBuffer.getChannelData(0);
+                 ws.send(new Float32Array(input).buffer);
+             };
+         });
+
+         ws.onmessage = e => {
+             const msg = JSON.parse(e.data);
+             if (msg.partial) {
+                 partial.textContent = msg.partial;
+                 vol.value = msg.volume;
+             } else if (msg.final) {
+                 finalText.textContent = msg.final;
+             }
+         };
+     </script>
+ </body>
+ </html>
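A note on the client: ScriptProcessorNode is deprecated in favor of AudioWorklet, but it keeps this single-file demo dependency-free. Pinning the AudioContext to 16 kHz matters because the server hands the raw floats straight to accept_waveform(16000, ...) with no resampling step.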
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a7c3c10c8ec533e73405e503c3004146a36153ae701934132aecbe689e9e666
+ size 44
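Note: requirements.txt is tracked with Git LFS here, so the diff shows only the 44-byte pointer rather than the pinned dependencies themselves.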