qfuxa committed
Commit bee27c6 · Parent: aa4480b

better buffer management

Files changed (2):
  1. README.md +4 -2
  2. whisper_fastapi_online_server.py +14 -16
README.md CHANGED
@@ -68,7 +68,9 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str
   python whisper_fastapi_online_server.py --host 0.0.0.0 --port 8000
   ```

- - `--host` and `--port` let you specify the server’s IP/port.
+ - `--host` and `--port` let you specify the server’s IP/port.
+ - `--min-chunk-size` sets the minimum chunk size for audio processing. Make sure this value aligns with the chunk size selected in the frontend. If not aligned, the system will work but may unnecessarily over-process audio data.
+ - For a full list of configurable options, run `python whisper_fastapi_online_server.py -h`.

  4. **Open the Provided HTML**:

@@ -88,7 +90,7 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str

  If you want to **deploy** this setup:

- 1. **Host the FastAPI app** behind a production-grade HTTP(S) server (like **Uvicorn + Nginx** or Docker).
+ 1. **Host the FastAPI app** behind a production-grade HTTP(S) server (like **Uvicorn + Nginx** or Docker). If you use HTTPS, use `wss` instead of `ws` in the WebSocket URL.
  2. The **HTML/JS page** can be served by the same FastAPI app or a separate static host.
  3. Users open the page in **Chrome/Firefox** (any modern browser that supports MediaRecorder + WebSocket).

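The `--min-chunk-size` note added above asks that the server value match the chunk duration the frontend records. As a rough, hedged illustration of what that alignment means in bytes (the server constants `SAMPLE_RATE`, `CHANNELS`, and `SAMPLES_PER_SEC` appear in the diff below, but `BYTES_PER_SEC` itself is not defined there, so the 2-bytes-per-sample factor for int16 PCM is an assumption):

```python
# Back-of-the-envelope arithmetic for aligning --min-chunk-size with the frontend.
SAMPLE_RATE = 16000       # Hz, mono (matches the server constants)
BYTES_PER_SAMPLE = 2      # s16le PCM, i.e. int16 -> 2 bytes per sample (assumed)
min_chunk_size = 1        # seconds, i.e. --min-chunk-size 1

samples_per_chunk = SAMPLE_RATE * min_chunk_size
bytes_per_chunk = samples_per_chunk * BYTES_PER_SAMPLE
print(bytes_per_chunk)    # 32000 bytes accumulate before each processing pass
```

If the frontend emits, say, 1-second MediaRecorder slices, `--min-chunk-size 1` lets each incoming slice trigger roughly one processing pass instead of being split or re-buffered across mismatched boundaries.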
whisper_fastapi_online_server.py CHANGED
@@ -20,27 +20,24 @@ app.add_middleware(
  )


- # Argument parsing
- parser = argparse.ArgumentParser()
- parser.add_argument("--host", type=str, default='localhost')
- parser.add_argument("--port", type=int, default=8000)
+ parser = argparse.ArgumentParser(description="Whisper FastAPI Online Server")
+ parser.add_argument("--host", type=str, default='localhost', help="The host address to bind the server to.")
+ parser.add_argument("--port", type=int, default=8000, help="The port number to bind the server to.")
  parser.add_argument("--warmup-file", type=str, dest="warmup_file",
                      help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
  add_shared_args(parser)
  args = parser.parse_args()

- # Initialize Whisper
  asr, online = asr_factory(args)

  # Load demo HTML for the root endpoint
- with open("live_transcription.html", "r") as f:
+ with open("src/live_transcription.html", "r") as f:
      html = f.read()

  @app.get("/")
  async def get():
      return HTMLResponse(html)

- # Streaming constants
  SAMPLE_RATE = 16000
  CHANNELS = 1
  SAMPLES_PER_SEC = SAMPLE_RATE * int(args.min_chunk_size)
@@ -67,11 +64,11 @@ async def websocket_endpoint(websocket: WebSocket):

      ffmpeg_process = await start_ffmpeg_decoder()
      pcm_buffer = bytearray()
-
      # Continuously read decoded PCM from ffmpeg stdout in a background task
      async def ffmpeg_stdout_reader():
          nonlocal pcm_buffer
          loop = asyncio.get_event_loop()
+         full_transcription = ""
          while True:
              try:
                  chunk = await loop.run_in_executor(None, ffmpeg_process.stdout.read, 4096)
@@ -81,7 +78,6 @@ async def websocket_endpoint(websocket: WebSocket):

                  pcm_buffer.extend(chunk)

-                 # Process in 3-second batches
                  while len(pcm_buffer) >= BYTES_PER_SEC:
                      three_sec_chunk = pcm_buffer[:BYTES_PER_SEC]
                      del pcm_buffer[:BYTES_PER_SEC]
@@ -89,15 +85,17 @@
                      # Convert int16 -> float32
                      pcm_array = np.frombuffer(three_sec_chunk, dtype=np.int16).astype(np.float32) / 32768.0

-                     # Send PCM data to Whisper
                      online.insert_audio_chunk(pcm_array)
-                     transcription = online.process_iter()
-                     buffer = online.to_flush(online.transcript_buffer.buffer)
-
-                     # Return partial transcription results to the client
+                     transcription = online.process_iter()[2]
+                     if args.vac:
+                         buffer = online.online.to_flush(online.online.transcript_buffer.buffer)[2]  # We need to access the underlying online object to get the buffer
+                     else:
+                         buffer = online.to_flush(online.transcript_buffer.buffer)[2]
+                     if buffer in full_transcription:  # With VAC, the buffer is not updated until the next chunk is processed
+                         buffer = ""
                      await websocket.send_json({
-                         "transcription": transcription[2],
-                         "buffer": buffer[2]
+                         "transcription": transcription,
+                         "buffer": buffer
                      })
              except Exception as e:
                  print(f"Exception in ffmpeg_stdout_reader: {e}")
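The handler above awaits a `start_ffmpeg_decoder()` helper and then reads raw PCM from `ffmpeg_process.stdout`; that helper is not part of this diff. Purely as an illustrative sketch of the kind of pipeline the reader loop assumes (compressed browser audio in on stdin, 16 kHz mono s16le PCM out on stdout), with the flag choices and the blocking `subprocess.Popen` wrapper being assumptions rather than the project's actual implementation:

```python
# Hypothetical stand-in for start_ffmpeg_decoder(); not taken from this repository.
import subprocess

def start_ffmpeg_decoder_sketch() -> subprocess.Popen:
    """Launch ffmpeg: compressed audio on stdin -> raw 16 kHz mono s16le PCM on stdout."""
    return subprocess.Popen(
        [
            "ffmpeg",
            "-loglevel", "quiet",
            "-i", "pipe:0",        # e.g. WebM/Opus streamed from MediaRecorder
            "-f", "s16le",         # raw little-endian 16-bit PCM
            "-acodec", "pcm_s16le",
            "-ac", "1",            # CHANNELS = 1
            "-ar", "16000",        # SAMPLE_RATE = 16000
            "pipe:1",
        ],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
```

The real helper is awaited, so it is presumably async or wrapped for asyncio; either way, the `run_in_executor(None, ffmpeg_process.stdout.read, 4096)` call above is what keeps the blocking stdout read off the event loop.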
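Each loop iteration sends a JSON message with a confirmed `transcription` piece and a provisional `buffer` tail (set to `""` when it would only repeat text already covered by `full_transcription`; the variable is only initialized in the lines shown here, and how it is updated lies outside this diff). A minimal sketch of how a client might fold these messages into display text, where only the two field names are taken from the diff and everything else is illustrative:

```python
# Illustrative client-side handling of the {"transcription", "buffer"} messages.
import json

confirmed = ""  # text the server has already committed

def handle_message(raw: str) -> str:
    """Fold one server message into the string a UI would display."""
    global confirmed
    msg = json.loads(raw)
    confirmed += msg.get("transcription") or ""   # append the confirmed piece
    # "buffer" is the still-unstable tail; show it after the confirmed text.
    return confirmed + (msg.get("buffer") or "")

print(handle_message('{"transcription": "hello ", "buffer": "wor"}'))  # hello wor
print(handle_message('{"transcription": "world", "buffer": ""}'))      # hello world
```

The `[2]` indexing on `process_iter()` and `to_flush()` in the diff appears to select the text element of the `(start, end, text)` tuples used by whisper_streaming, which is why the payload fields arrive as plain strings.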