Spaces:
Sleeping
Sleeping
<!doctype html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <!-- Without a viewport declaration mobile browsers lay the page out at desktop width. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>🎤 Real-Time ASR Demo</title>
| <style> | |
/* ---------- Page layout ---------- */

/* The page is a single centered column of stacked cards. */
body {
  margin: 0;
  padding: 2rem;
  display: flex;
  flex-direction: column;
  align-items: center;
  font-family: "Segoe UI", sans-serif;
  color: #2f3640;
  background-color: #f5f6fa;
}

h1 {
  margin-bottom: 1rem;
  font-size: 2rem;
}

/* Every <section> is drawn as a white card, capped at 900px wide. */
section {
  width: 100%;
  max-width: 900px;
  margin-bottom: 1.5rem;
  padding: 1rem;
  border-radius: 8px;
  background: white;
  box-shadow: 0 0 8px rgba(0,0,0,0.1);
}

/* Card title with a thin rule underneath. */
section h2 {
  margin-top: 0;
  padding-bottom: 0.5rem;
  border-bottom: 1px solid #dcdde1;
  font-size: 1.2rem;
  color: #2f3640;
}
/* ---------- Controls ---------- */

/* Responsive grid: as many 200px-minimum columns as fit the card. */
.controls-grid {
  margin-top: 1rem;
  display: grid;
  gap: 1rem;
  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
}

/* One grid cell: label stacked above its control. */
.control-item {
  display: flex;
  flex-direction: column;
}

.control-item > label {
  margin-bottom: 0.3rem;
  font-weight: 600;
}

/* Shared look for all form fields. */
.control-item > select,
.control-item > input,
.control-item > textarea {
  padding: 0.5rem;
  font-size: 1rem;
  background: white;
  border: 1px solid #dcdde1;
  border-radius: 5px;
}

.control-item > textarea {
  min-height: 4rem;
  resize: vertical;
}

/* Green action buttons; the hover state is a lighter green. */
.control-item > button {
  margin-top: 0.5rem;
  padding: 0.5rem;
  font-size: 1rem;
  color: white;
  background-color: #44bd32;
  border: none;
  border-radius: 5px;
  cursor: pointer;
  transition: background-color 0.2s;
}

.control-item > button:hover {
  background-color: #4cd137;
}
/* ---------- Hotword status line ---------- */

/* Centered, bold, amber text under the hotword controls. */
#hotwordStatus {
  margin-top: 0.5rem;
  text-align: center;
  font-size: 0.9rem;
  font-weight: bold;
  color: #e1b12c;
}
/* ---------- Microphone info and volume meter ---------- */

.mic-info {
  margin-top: 1rem;
  font-size: 0.9rem;
  color: #353b48;
}

.mic-info .label {
  font-weight: bold;
}

/*
 * Volume meter (<progress id="vol">).
 * The native widget look is switched off so the pseudo-elements below can
 * style it. The prefixed properties cover engines that predate unprefixed
 * `appearance`.
 * Firefox has no separate track pseudo-element: the element itself is the
 * track, so it gets the same grey rounded look that ::-webkit-progress-bar
 * provides in WebKit/Blink.
 */
#vol {
  width: 100%;
  max-width: 500px;
  height: 20px;
  margin-top: 0.5rem;
  -webkit-appearance: none;
  -moz-appearance: none;
  appearance: none;
  border: none;
  border-radius: 8px;
  background-color: #dcdde1;
}

/* WebKit/Blink: track. */
#vol::-webkit-progress-bar {
  background-color: #dcdde1;
  border-radius: 8px;
}

/* WebKit/Blink: filled portion. */
#vol::-webkit-progress-value {
  background-color: #44bd32;
  border-radius: 8px;
  transition: width 0.2s;
}

/* Firefox: filled portion. */
#vol::-moz-progress-bar {
  background-color: #44bd32;
  border-radius: 8px;
  transition: width 0.2s;
}
/* ---------- Transcript ---------- */

/* Scrollable box; pre-wrap keeps whitespace in the recognized text. */
.transcript-container {
  margin-top: 0.5rem;
  padding: 0.5rem;
  max-height: 300px;
  overflow-y: auto;
  white-space: pre-wrap;
  font-size: 1.1rem;
  color: #353b48;
  background: #fff;
  border: 1px solid #dcdde1;
  border-radius: 8px;
}

/* Finalized utterances: green, separated by a right margin. */
.transcript-container .final {
  display: inline;
  margin-right: 0.5em;
  color: green;
}

/* Hypothesis still being recognized: red. */
.transcript-container .interim {
  display: inline;
  color: red;
}
| </style> | |
| </head> | |
| <body> | |
| <h1>🎤 Speak into Your Microphone</h1> | |
<!-- Recognition settings: model picker, precision picker, and a read-out filled in by script. -->
<section class="section--settings">
  <h2>Recognition Settings</h2>

  <div class="controls-grid">
    <div class="control-item">
      <label for="modelSelect">Model</label>
      <select id="modelSelect">
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30">zipformer-zh-int8-2025-06-30</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30">zipformer-zh-fp16-2025-06-30</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30">zipformer-zh-xlarge-fp16-2025-06-30</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30">zipformer-zh-xlarge-int8-2025-06-30</option>
        <option value="csukuangfj/k2fsa-zipformer-bilingual-zh-en-t">k2fsa-small-bilingual-zh-en</option>
        <option value="pfluo/k2fsa-zipformer-chinese-english-mixed">k2fsa-chinese-english-mixed</option>
        <option value="k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16">sherpa-onnx-zipformer-korean</option>
        <option value="k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12">zipformer-multi-zh-hans</option>
        <option value="pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615">icefall-zipformer-wenetspeech</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26">zipformer-en-06-26</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21">zipformer-en-06-21</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21">zipformer-en-02-21</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20">zipformer-zh-en</option>
        <option value="shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14">zipformer-fr</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23">zipformer-zh-14M</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17">zipformer-en-20M</option>
        <option value="csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10">zipformer-ar_en_id_ja_ru_th_vi_zh</option>
      </select>
    </div>

    <div class="control-item">
      <label for="precisionSelect">Precision</label>
      <select id="precisionSelect">
        <option value="fp32">FP32</option>
        <option value="int8">INT8</option>
      </select>
    </div>
  </div>

  <!-- The two spans are populated from the script's model metadata table. -->
  <div class="model-info">
    Languages: <span id="modelLangs"></span> | Size: <span id="modelSize"></span> MB
  </div>
</section>
<!-- Hotword biasing: word list, boost slider, apply button, and an On/Off status line. -->
<section class="section--hotwords">
  <h2>Hotword Settings</h2>

  <div class="controls-grid">
    <div class="control-item">
      <label for="hotwordsList">Hotwords</label>
      <textarea id="hotwordsList" placeholder="One per line"></textarea>
    </div>

    <div class="control-item">
      <!-- #boostValue mirrors the slider's current value; the script updates it on input. -->
      <label for="boostScore">Boost Score: <span id="boostValue">2.0</span></label>
      <input id="boostScore" type="range" min="0" max="10" step="0.1" value="2.0">
    </div>

    <div class="control-item">
      <button id="applyHotwords">Apply Hotwords</button>
    </div>
  </div>

  <div id="hotwordStatus">Hotword Bias: Off</div>
</section>
<!-- Endpoint-detection thresholds; the values are sent to the server as epRule1..3 in the config message. -->
<section class="section--endpoint">
  <h2>Endpoint Detection</h2>

  <div class="controls-grid">
    <div class="control-item">
      <label for="epRule1">Rule 1 (silence ≥ s)</label>
      <input id="epRule1" type="number" step="0.1" value="2.4">
    </div>

    <div class="control-item">
      <label for="epRule2">Rule 2 (silence ≥ s)</label>
      <input id="epRule2" type="number" step="0.1" value="1.2">
    </div>

    <div class="control-item">
      <label for="epRule3">Rule 3 (min utt ms)</label>
      <input id="epRule3" type="number" step="50" value="300">
    </div>

    <div class="control-item">
      <button id="applyEndpointConfig">Apply Endpoint Config</button>
    </div>
  </div>
</section>
<!-- Microphone read-out: device label, sample rate, and a live input-level meter. -->
<section class="section--mic">
  <h2>Microphone</h2>

  <div class="mic-info">
    <span class="label">Device:</span> <span id="micName">Detecting…</span><br>
    <span class="label">Sample Rate:</span> <span id="sampleRate">-</span> Hz
  </div>

  <!-- aria-label gives the meter an accessible name; it has no visible label of its own. -->
  <progress id="vol" max="1" value="0" aria-label="Microphone input level"></progress>
</section>
<!-- Transcript output. The container uses white-space: pre-wrap, so its content is kept free of stray whitespace. -->
<section class="section--transcript">
  <h2>Transcript</h2>
  <div id="transcript" class="transcript-container">…</div>
</section>
| <script> | |
/**
 * Display metadata for every model offered in #modelSelect, keyed by the
 * model id used as the <option> value. Entries are listed in the same order
 * as the options.
 *
 * Each entry has the same shape:
 *   language - array of language labels shown in the "Languages" read-out
 *   size     - number shown in the "Size ... MB" read-out
 *
 * `language` is always an array (single-language models use a one-element
 * array) so consumers do not have to branch on string vs. array.
 */
const MODEL_METADATA = {
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30": { language: ["zh"], size: 168 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30": { language: ["zh"], size: 336 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30": { language: ["zh"], size: 1570 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30": { language: ["zh"], size: 773 },
  "csukuangfj/k2fsa-zipformer-bilingual-zh-en-t": { language: ["zh", "en"], size: 115 },
  "pfluo/k2fsa-zipformer-chinese-english-mixed": { language: ["zh", "en"], size: 342 },
  "k2-fsa/sherpa-onnx-streaming-zipformer-korean-2024-06-16": { language: ["korean"], size: 300 },
  "k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12": { language: ["zh-Hans"], size: 258 },
  "pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615": { language: ["zh (WenetSpeech)"], size: 273 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26": { language: ["english"], size: 340 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21": { language: ["english"], size: 340 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21": { language: ["english"], size: 341 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20": { language: ["zh", "en"], size: 342 },
  "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": { language: ["french"], size: 282 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23": { language: ["zh"], size: 53 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17": { language: ["en"], size: 88 },
  "csukuangfj/sherpa-onnx-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10": { language: ["ar", "en", "id", "ja", "ru", "th", "vi", "zh"], size: 338 }
};
// Set once the microphone stream is available (see the getUserMedia callback).
let orig_sample_rate; // AudioContext sample rate; sent to the server in the config message
let ws;               // WebSocket used for config, audio, and transcript messages

// Cached references to the controls and read-outs used by the script.
const byId = (id) => document.getElementById(id);

const vol = byId("vol");
const transcript = byId("transcript");
const modelSelect = byId("modelSelect");
const precisionSelect = byId("precisionSelect");
const hotwordsList = byId("hotwordsList");
const boostScore = byId("boostScore");
const boostValue = byId("boostValue");
const applyBtn = byId("applyHotwords");
const hotwordStatus = byId("hotwordStatus");
const modelLangs = byId("modelLangs");
const modelSize = byId("modelSize");
const micNameElem = byId("micName");
const sampleRateElem = byId("sampleRate");
/**
 * Refresh the hotword status line.
 *
 * Shows "Hotword Bias: On" only when the hotword textarea contains at least
 * one non-empty line and the boost slider is above zero; otherwise shows
 * "Hotword Bias: Off".
 */
function updateHotwordStatus() {
  const hasHotwords = hotwordsList.value
    .split(/\r?\n/)
    .some((line) => line !== "");

  if (hasHotwords && parseFloat(boostScore.value) > 0) {
    hotwordStatus.textContent = "Hotword Bias: On";
  } else {
    hotwordStatus.textContent = "Hotword Bias: Off";
  }
}
/**
 * Show the selected model's languages and size in the settings panel.
 *
 * Looks the current #modelSelect value up in MODEL_METADATA. `language` may
 * be a single string or an array of strings; arrays are joined with ", ".
 * If the selected model has no metadata entry, placeholders are shown
 * instead of throwing, so a missing entry cannot break the caller.
 */
function updateModelInfo() {
  const meta = MODEL_METADATA[modelSelect.value];
  if (!meta) {
    modelLangs.textContent = "unknown";
    modelSize.textContent = "?";
    return;
  }

  const languages = Array.isArray(meta.language) ? meta.language : [meta.language];
  modelLangs.textContent = languages.join(", ");
  modelSize.textContent = meta.size;
}
/**
 * Redraw the transcript box: one "final" span per finalized utterance,
 * followed by an "interim" span for the hypothesis still in progress (if
 * any), then scroll to the newest text.
 *
 * Text is assigned through textContent instead of being concatenated into an
 * innerHTML string, so characters such as "<" or "&" in recognized text are
 * displayed literally and never parsed as markup. Spacing between utterances
 * comes from the .final margin in the stylesheet.
 */
function renderTranscript() {
  const fragment = document.createDocumentFragment();

  finalUtterances.forEach((text) => {
    const span = document.createElement("span");
    span.className = "final";
    span.textContent = text;
    fragment.appendChild(span);
  });

  if (currentInterim) {
    const span = document.createElement("span");
    span.className = "interim";
    span.textContent = currentInterim;
    fragment.appendChild(span);
  }

  transcript.textContent = "";
  transcript.appendChild(fragment);
  transcript.scrollTop = transcript.scrollHeight;
}

/**
 * Handle one JSON message from the server.
 *
 * Recognized fields:
 *   volume  - copied to the level meter, capped at 1.0
 *   final   - trimmed and appended to the finalized utterances; clears the interim text
 *   partial - replaces the interim text (only looked at when `final` is absent)
 *
 * Messages that are not valid JSON are logged and ignored.
 */
function handleServerMessage(event) {
  let msg;
  try {
    msg = JSON.parse(event.data);
  } catch (err) {
    console.error("Ignoring non-JSON server message:", err);
    return;
  }

  if (msg.volume !== undefined) {
    vol.value = Math.min(msg.volume, 1.0);
  }

  if (msg.final !== undefined) {
    finalUtterances.push(msg.final.trim());
    currentInterim = "";
  } else if (msg.partial !== undefined) {
    currentInterim = msg.partial;
  }

  renderTranscript();
}

/** Log why the microphone could not be opened and replace the "Detecting…" placeholder. */
function reportMicUnavailable(err) {
  console.error("Microphone unavailable:", err);
  micNameElem.textContent = "Unavailable";
}

/**
 * Start streaming once the user has granted microphone access: show the
 * device info, open the WebSocket, and forward raw audio frames to it.
 *
 * @param {MediaStream} stream - the stream resolved by getUserMedia.
 */
function startStreaming(stream) {
  const context = new AudioContext();
  orig_sample_rate = context.sampleRate;

  // Update mic info in UI
  const track = stream.getAudioTracks()[0];
  micNameElem.textContent = track.label || 'Unknown';
  sampleRateElem.textContent = orig_sample_rate;

  // Now that we know the sample rate, open the WS. The scheme follows the
  // page's own scheme so the demo also works when served over plain http.
  const scheme = location.protocol === "https:" ? "wss" : "ws";
  ws = new WebSocket(`${scheme}://${location.host}/ws`);
  ws.onopen = () => sendConfig();
  ws.onerror = err => console.error("WebSocket error:", err);
  ws.onclose = () => console.log("WebSocket closed");
  ws.onmessage = handleServerMessage;

  // NOTE(review): createScriptProcessor is deprecated in favour of
  // AudioWorklet; kept as-is here to avoid changing the capture path.
  const source = context.createMediaStreamSource(stream);
  const processor = context.createScriptProcessor(4096, 1, 1);
  source.connect(processor);
  processor.connect(context.destination);
  processor.onaudioprocess = e => {
    // send() throws while the socket is still connecting or already closed,
    // so frames produced outside the OPEN state are dropped.
    if (ws.readyState !== WebSocket.OPEN) {
      return;
    }
    const input = e.inputBuffer.getChannelData(0);
    ws.send(new Float32Array(input).buffer);
  };
}

// UI wiring does not depend on the microphone, so it is set up immediately.
// sendConfig() does nothing until the socket is open.
updateModelInfo();

modelSelect.addEventListener("change", () => {
  updateModelInfo();
  sendConfig();
  updateHotwordStatus();
});
precisionSelect.addEventListener("change", () => {
  sendConfig();
  updateHotwordStatus();
});
applyBtn.addEventListener("click", () => {
  sendConfig();
  updateHotwordStatus();
});
// Update boost display and status on slider input
boostScore.addEventListener("input", () => {
  boostValue.textContent = boostScore.value;
  updateHotwordStatus();
});

// navigator.mediaDevices is undefined in insecure contexts, and the promise
// rejects when permission is denied or no device exists; both cases are
// reported instead of leaving the page stuck on "Detecting…".
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
  navigator.mediaDevices
    .getUserMedia({ audio: true })
    .then(startStreaming, reportMicUnavailable);
} else {
  reportMicUnavailable(new Error("navigator.mediaDevices.getUserMedia is not available"));
}
// Transcript state used by the WebSocket message handler:
// finalized utterances in arrival order, plus the current in-progress text.
const finalUtterances = [];
let currentInterim = "";

// Endpoint-detection inputs and their apply button.
const [epRule1Input, epRule2Input, epRule3Input] =
  ["epRule1", "epRule2", "epRule3"].map((id) => document.getElementById(id));
const applyEndpointBtn = document.getElementById("applyEndpointConfig");
/**
 * Send the current UI settings to the server as one JSON "config" message.
 *
 * Does nothing unless the WebSocket exists and is open. The payload carries
 * the capture sample rate, the selected model and precision, the non-empty
 * hotword lines with their boost score, and the three endpoint-rule values
 * (rules 1 and 2 parsed as floats, rule 3 as a base-10 integer).
 */
function sendConfig() {
  const socketReady = ws && ws.readyState === WebSocket.OPEN;
  if (!socketReady) {
    return;
  }

  const hotwordLines = hotwordsList.value.split(/\r?\n/).filter(Boolean);

  const config = {
    type: "config",
    sampleRate: orig_sample_rate,
    model: modelSelect.value,
    precision: precisionSelect.value,
    hotwords: hotwordLines,
    hotwordsScore: parseFloat(boostScore.value),
    // endpoint-detection fields
    epRule1: parseFloat(epRule1Input.value),
    epRule2: parseFloat(epRule2Input.value),
    epRule3: parseInt(epRule3Input.value, 10),
  };

  ws.send(JSON.stringify(config));
}
// Clicking "Apply Endpoint Config" pushes the current settings to the server again.
applyEndpointBtn.addEventListener("click", () => sendConfig());
// The WebSocket message handler is attached where the socket is created,
// inside the getUserMedia callback. A second `ws.onmessage = ...` assignment
// used to live here, but it ran during page load while `ws` was still
// undefined and therefore threw a TypeError. It also duplicated the handler
// installed with the socket, so it has been removed.
| </script> | |
| </body> | |
| </html> | |