qfuxa committed
Commit 6cccf9e · 1 Parent(s): dc3273d

Refactor PCM conversion to a dedicated function; immediate chunk addition to the diarization queue

Files changed (2):
  1. README.md (+2 -2)
  2. whisper_fastapi_online_server.py (+16 -7)
README.md CHANGED
@@ -30,8 +30,8 @@ This project is based on [Whisper Streaming](https://github.com/ufal/whisper_str
 1. **Clone the Repository**:
 
    ```bash
-   git clone https://github.com/QuentinFuxa/whisper_streaming_web
-   cd whisper_streaming_web
+   git clone https://github.com/QuentinFuxa/WhisperLiveKit
+   cd WhisperLiveKit
    ```
 
 
whisper_fastapi_online_server.py CHANGED
@@ -190,6 +190,16 @@ app.add_middleware(
     with open("web/live_transcription.html", "r", encoding="utf-8") as f:
         html = f.read()
 
+def convert_pcm_to_float(pcm_buffer):
+    """
+    Converts a PCM buffer in s16le format to a normalized NumPy array.
+    Arg: pcm_buffer. PCM buffer containing raw audio data in s16le format
+    Returns: np.ndarray. NumPy array of float32 type normalized between -1.0 and 1.0
+    """
+    pcm_array = (np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32)
+                 / 32768.0)
+    return pcm_array
+
 async def start_ffmpeg_decoder():
     """
     Start an FFmpeg process in async streaming mode that reads WebM from stdin
@@ -444,24 +454,23 @@ async def websocket_endpoint(websocket: WebSocket):
                     logger.info("FFmpeg stdout closed.")
                     break
                 pcm_buffer.extend(chunk)
+
+                if args.diarization and diarization_queue:
+                    await diarization_queue.put(convert_pcm_to_float(pcm_buffer).copy())
+
                 if len(pcm_buffer) >= BYTES_PER_SEC:
                     if len(pcm_buffer) > MAX_BYTES_PER_SEC:
                         logger.warning(
                             f"""Audio buffer is too large: {len(pcm_buffer) / BYTES_PER_SEC:.2f} seconds.
                             The model probably struggles to keep up. Consider using a smaller model.
                             """)
-                    # Convert int16 -> float32
-                    pcm_array = (
-                        np.frombuffer(pcm_buffer[:MAX_BYTES_PER_SEC], dtype=np.int16).astype(np.float32)
-                        / 32768.0
-                    )
+
+                    pcm_array = convert_pcm_to_float(pcm_buffer[:MAX_BYTES_PER_SEC])
                     pcm_buffer = pcm_buffer[MAX_BYTES_PER_SEC:]
 
                     if args.transcription and transcription_queue:
                         await transcription_queue.put(pcm_array.copy())
 
-                    if args.diarization and diarization_queue:
-                        await diarization_queue.put(pcm_array.copy())
 
                 if not args.transcription and not args.diarization:
                     await asyncio.sleep(0.1)
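
As a quick illustration of what this refactor does, below is a minimal, runnable sketch (not part of the commit) of the `convert_pcm_to_float` helper and the new immediate hand-off to the diarization queue. The standalone `asyncio.Queue`, the 16 kHz sample rate, and the silent sample buffer are illustrative assumptions, not values taken from the server code.

```python
import asyncio
import numpy as np


def convert_pcm_to_float(pcm_buffer):
    """Convert raw s16le PCM bytes to a float32 NumPy array normalized to [-1.0, 1.0]."""
    return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0


async def main():
    # Stand-in for the server's diarization queue (illustrative, not the real wiring).
    diarization_queue = asyncio.Queue()

    # One second of fake 16 kHz mono s16le audio (silence) playing the role of pcm_buffer.
    pcm_buffer = bytearray(np.zeros(16000, dtype=np.int16).tobytes())

    # As in the diff: the chunk is converted and queued for diarization immediately,
    # before the BYTES_PER_SEC threshold check that gates transcription.
    await diarization_queue.put(convert_pcm_to_float(pcm_buffer).copy())

    chunk = await diarization_queue.get()
    print(chunk.dtype, chunk.shape, float(chunk.min()), float(chunk.max()))
    # -> float32 (16000,) 0.0 0.0


if __name__ == "__main__":
    asyncio.run(main())
```

The design point visible in the diff is that diarization now receives each decoded chunk as soon as it arrives from FFmpeg, while transcription still waits for the PCM buffer to reach the BYTES_PER_SEC threshold.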