qfuxa committed on
Commit
c960d15
·
1 Parent(s): 99019f1

Batch unprocessed audio to reduce Whisper streaming calls

Browse files
Files changed (1) hide show
  1. whisper_fastapi_online_server.py +13 -10
whisper_fastapi_online_server.py CHANGED
@@ -3,6 +3,7 @@ import argparse
3
  import asyncio
4
  import numpy as np
5
  import ffmpeg
 
6
 
7
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
8
  from fastapi.responses import HTMLResponse
@@ -69,22 +70,24 @@ async def websocket_endpoint(websocket: WebSocket):
69
  nonlocal pcm_buffer
70
  loop = asyncio.get_event_loop()
71
  full_transcription = ""
 
72
  while True:
73
  try:
74
- chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096)
75
- if not chunk: # FFmpeg might have closed
76
- print("FFmpeg stdout closed.")
77
- break
 
 
 
 
78
 
79
  pcm_buffer.extend(chunk)
80
 
81
- while len(pcm_buffer) >= BYTES_PER_SEC:
82
- three_sec_chunk = pcm_buffer[:BYTES_PER_SEC]
83
- del pcm_buffer[:BYTES_PER_SEC]
84
-
85
  # Convert int16 -> float32
86
- pcm_array = np.frombuffer(three_sec_chunk, dtype=np.int16).astype(np.float32) / 32768.0
87
-
88
  online.insert_audio_chunk(pcm_array)
89
  transcription = online.process_iter()[2]
90
  if args.vac:
 
3
  import asyncio
4
  import numpy as np
5
  import ffmpeg
6
+ from time import time
7
 
8
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
9
  from fastapi.responses import HTMLResponse
 
70
  nonlocal pcm_buffer
71
  loop = asyncio.get_event_loop()
72
  full_transcription = ""
73
+ beg = time()
74
  while True:
75
  try:
76
+ elapsed_time = int(time() - beg)
77
+ beg = time()
78
+ chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 32000*elapsed_time)
79
+ if not chunk: # The first chunk will be almost empty, FFmpeg is still starting up
80
+ chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096)
81
+ if not chunk: # FFmpeg might have closed
82
+ print("FFmpeg stdout closed.")
83
+ break
84
 
85
  pcm_buffer.extend(chunk)
86
 
87
+ if len(pcm_buffer) >= BYTES_PER_SEC:
 
 
 
88
  # Convert int16 -> float32
89
+ pcm_array = np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
90
+ pcm_buffer = bytearray()
91
  online.insert_audio_chunk(pcm_array)
92
  transcription = online.process_iter()[2]
93
  if args.vac: