Refactor PCM conversion to a dedicated function; immediate chunk addition to the diarization queue
- README.md +2 -2
- whisper_fastapi_online_server.py +16 -7
README.md
CHANGED
@@ -30,8 +30,8 @@ This project is based on [Whisper Streaming](https://github.com/ufal/whisper_str
 1. **Clone the Repository**:

    ```bash
-   git clone https://github.com/QuentinFuxa/
-   cd
+   git clone https://github.com/QuentinFuxa/WhisperLiveKit
+   cd WhisperLiveKit
    ```


whisper_fastapi_online_server.py
CHANGED
@@ -190,6 +190,16 @@ app.add_middleware(
     with open("web/live_transcription.html", "r", encoding="utf-8") as f:
         html = f.read()

+def convert_pcm_to_float(pcm_buffer):
+    """
+    Converts a PCM buffer in s16le format to a normalized NumPy array.
+    Arg: pcm_buffer. PCM buffer containing raw audio data in s16le format
+    Returns: np.ndarray. NumPy array of float32 type normalized between -1.0 and 1.0
+    """
+    pcm_array = (np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32)
+                 / 32768.0)
+    return pcm_array
+
 async def start_ffmpeg_decoder():
     """
     Start an FFmpeg process in async streaming mode that reads WebM from stdin
@@ -444,24 +454,23 @@ async def websocket_endpoint(websocket: WebSocket):
                     logger.info("FFmpeg stdout closed.")
                     break
                 pcm_buffer.extend(chunk)
+
+                if args.diarization and diarization_queue:
+                    await diarization_queue.put(convert_pcm_to_float(pcm_buffer).copy())
+
                 if len(pcm_buffer) >= BYTES_PER_SEC:
                     if len(pcm_buffer) > MAX_BYTES_PER_SEC:
                         logger.warning(
                             f"""Audio buffer is too large: {len(pcm_buffer) / BYTES_PER_SEC:.2f} seconds.
                             The model probably struggles to keep up. Consider using a smaller model.
                             """)
-
-                    pcm_array = (
-                        np.frombuffer(pcm_buffer[:MAX_BYTES_PER_SEC], dtype=np.int16).astype(np.float32)
-                        / 32768.0
-                    )
+
+                    pcm_array = convert_pcm_to_float(pcm_buffer[:MAX_BYTES_PER_SEC])
                     pcm_buffer = pcm_buffer[MAX_BYTES_PER_SEC:]

                     if args.transcription and transcription_queue:
                         await transcription_queue.put(pcm_array.copy())

-                    if args.diarization and diarization_queue:
-                        await diarization_queue.put(pcm_array.copy())

                 if not args.transcription and not args.diarization:
                     await asyncio.sleep(0.1)