Batch unprocessed audio to reduce Whisper streaming calls
Browse files - whisper_fastapi_online_server.py (+13 -10)
whisper_fastapi_online_server.py
CHANGED
|
@@ -3,6 +3,7 @@ import argparse
|
|
| 3 |
import asyncio
|
| 4 |
import numpy as np
|
| 5 |
import ffmpeg
|
|
|
|
| 6 |
|
| 7 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 8 |
from fastapi.responses import HTMLResponse
|
|
@@ -69,22 +70,24 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 69 |
nonlocal pcm_buffer
|
| 70 |
loop = asyncio.get_event_loop()
|
| 71 |
full_transcription = ""
|
|
|
|
| 72 |
while True:
|
| 73 |
try:
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
pcm_buffer.extend(chunk)
|
| 80 |
|
| 81 |
-
|
| 82 |
-
three_sec_chunk = pcm_buffer[:BYTES_PER_SEC]
|
| 83 |
-
del pcm_buffer[:BYTES_PER_SEC]
|
| 84 |
-
|
| 85 |
# Convert int16 -> float32
|
| 86 |
-
pcm_array = np.frombuffer(
|
| 87 |
-
|
| 88 |
online.insert_audio_chunk(pcm_array)
|
| 89 |
transcription = online.process_iter()[2]
|
| 90 |
if args.vac:
|
|
|
|
| 3 |
import asyncio
|
| 4 |
import numpy as np
|
| 5 |
import ffmpeg
|
| 6 |
+
from time import time
|
| 7 |
|
| 8 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 9 |
from fastapi.responses import HTMLResponse
|
|
|
|
| 70 |
nonlocal pcm_buffer
|
| 71 |
loop = asyncio.get_event_loop()
|
| 72 |
full_transcription = ""
|
| 73 |
+
beg = time()
|
| 74 |
while True:
|
| 75 |
try:
|
| 76 |
+
elapsed_time = int(time() - beg)
|
| 77 |
+
beg = time()
|
| 78 |
+
chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 32000*elapsed_time)
|
| 79 |
+
if not chunk: # read(0) returns b"" whenever elapsed_time truncates to 0 (e.g. the first iteration, while FFmpeg is still starting up), so fall back to a small read
|
| 80 |
+
chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096)
|
| 81 |
+
if not chunk: # FFmpeg might have closed
|
| 82 |
+
print("FFmpeg stdout closed.")
|
| 83 |
+
break
|
| 84 |
|
| 85 |
pcm_buffer.extend(chunk)
|
| 86 |
|
| 87 |
+
if len(pcm_buffer) >= BYTES_PER_SEC:
|
|
|
|
|
|
|
|
|
|
| 88 |
# Convert int16 -> float32
|
| 89 |
+
pcm_array = np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
|
| 90 |
+
pcm_buffer = bytearray()
|
| 91 |
online.insert_audio_chunk(pcm_array)
|
| 92 |
transcription = online.process_iter()[2]
|
| 93 |
if args.vac:
|