Quentin Fuxa committed on
Commit
9e3b2a9
·
2 Parent(s): 26ff2a6 d8ecad0

Merge pull request #128 from QuentinFuxa/vac-update

Browse files
LICENSE CHANGED
@@ -1,10 +1,6 @@
1
  MIT License
2
 
3
  Copyright (c) 2025 Quentin Fuxa.
4
- Based on:
5
- - The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
6
- - The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
7
- - The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
8
 
9
  Permission is hereby granted, free of charge, to any person obtaining a copy
10
  of this software and associated documentation files (the "Software"), to deal
@@ -26,8 +22,7 @@ SOFTWARE.
26
 
27
  ---
28
 
29
- Third-party components included in this software:
30
-
31
- - **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming
32
- - **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad
33
- - **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart
 
1
  MIT License
2
 
3
  Copyright (c) 2025 Quentin Fuxa.
 
 
 
 
4
 
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
  of this software and associated documentation files (the "Software"), to deal
 
22
 
23
  ---
24
 
25
+ Based on:
26
+ - **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming. The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
27
+ - **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad. The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
28
+ - **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart. The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
 
README.md CHANGED
@@ -9,8 +9,8 @@
9
  <p align="center">
10
  <a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
11
  <a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
12
- <a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-dark_green"></a>
13
- <a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/QuentinFuxa/WhisperLiveKit?color=blue"></a>
14
  </p>
15
 
16
  ## 🚀 Overview
 
9
  <p align="center">
10
  <a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
11
  <a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
12
+ <a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9--3.13-dark_green"></a>
13
+ <a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT-dark_green"></a>
14
  </p>
15
 
16
  ## 🚀 Overview
whisperlivekit/audio_processor.py CHANGED
@@ -83,10 +83,33 @@ class AudioProcessor:
83
 
84
  def start_ffmpeg_decoder(self):
85
  """Start FFmpeg process for WebM to PCM conversion."""
86
- return (ffmpeg.input("pipe:0", format="webm")
87
- .output("pipe:1", format="s16le", acodec="pcm_s16le",
88
- ac=self.channels, ar=str(self.sample_rate))
89
- .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  async def restart_ffmpeg(self):
92
  """Restart the FFmpeg process after failure."""
 
83
 
84
  def start_ffmpeg_decoder(self):
85
  """Start FFmpeg process for WebM to PCM conversion."""
86
+ try:
87
+ return (ffmpeg.input("pipe:0", format="webm")
88
+ .output("pipe:1", format="s16le", acodec="pcm_s16le",
89
+ ac=self.channels, ar=str(self.sample_rate))
90
+ .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
91
+ except FileNotFoundError:
92
+ error = """
93
+ FFmpeg is not installed or not found in your system's PATH.
94
+ Please install FFmpeg to enable audio processing.
95
+
96
+ Installation instructions:
97
+
98
+ # Ubuntu/Debian:
99
+ sudo apt update && sudo apt install ffmpeg
100
+
101
+ # macOS (using Homebrew):
102
+ brew install ffmpeg
103
+
104
+ # Windows:
105
+ # 1. Download the latest static build from https://ffmpeg.org/download.html
106
+ # 2. Extract the archive (e.g., to C:\\FFmpeg).
107
+ # 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.
108
+
109
+ After installation, please restart the application.
110
+ """
111
+ logger.error(error)
112
+ raise FileNotFoundError(error)
113
 
114
  async def restart_ffmpeg(self):
115
  """Restart the FFmpeg process after failure."""
whisperlivekit/whisper_streaming_custom/online_asr.py CHANGED
@@ -343,15 +343,15 @@ class OnlineASRProcessor:
343
  )
344
  sentences.append(sentence)
345
  return sentences
346
- def finish(self) -> Transcript:
 
347
  """
348
  Flush the remaining transcript when processing ends.
349
  """
350
  remaining_tokens = self.transcript_buffer.buffer
351
- final_transcript = self.concatenate_tokens(remaining_tokens)
352
- logger.debug(f"Final non-committed transcript: {final_transcript}")
353
  self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
354
- return final_transcript
355
 
356
  def concatenate_tokens(
357
  self,
@@ -384,7 +384,8 @@ class VACOnlineASRProcessor:
384
  def __init__(self, online_chunk_size: float, *args, **kwargs):
385
  self.online_chunk_size = online_chunk_size
386
  self.online = OnlineASRProcessor(*args, **kwargs)
387
-
 
388
  # Load a VAD model (e.g. Silero VAD)
389
  import torch
390
  model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
@@ -455,7 +456,7 @@ class VACOnlineASRProcessor:
455
  self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
456
  self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
457
 
458
- def process_iter(self) -> Transcript:
459
  """
460
  Depending on the VAD status and the amount of accumulated audio,
461
  process the current audio chunk.
@@ -467,9 +468,9 @@ class VACOnlineASRProcessor:
467
  return self.online.process_iter()
468
  else:
469
  logger.debug("No online update, only VAD")
470
- return Transcript(None, None, "")
471
 
472
- def finish(self) -> Transcript:
473
  """Finish processing by flushing any remaining text."""
474
  result = self.online.finish()
475
  self.current_online_chunk_buffer_size = 0
@@ -480,4 +481,4 @@ class VACOnlineASRProcessor:
480
  """
481
  Get the unvalidated buffer in string format.
482
  """
483
- return self.online.concatenate_tokens(self.online.transcript_buffer.buffer).text
 
343
  )
344
  sentences.append(sentence)
345
  return sentences
346
+
347
+ def finish(self) -> List[ASRToken]:
348
  """
349
  Flush the remaining transcript when processing ends.
350
  """
351
  remaining_tokens = self.transcript_buffer.buffer
352
+ logger.debug(f"Final non-committed tokens: {remaining_tokens}")
 
353
  self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
354
+ return remaining_tokens
355
 
356
  def concatenate_tokens(
357
  self,
 
384
  def __init__(self, online_chunk_size: float, *args, **kwargs):
385
  self.online_chunk_size = online_chunk_size
386
  self.online = OnlineASRProcessor(*args, **kwargs)
387
+ self.asr = self.online.asr
388
+
389
  # Load a VAD model (e.g. Silero VAD)
390
  import torch
391
  model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
 
456
  self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
457
  self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
458
 
459
+ def process_iter(self) -> List[ASRToken]:
460
  """
461
  Depending on the VAD status and the amount of accumulated audio,
462
  process the current audio chunk.
 
468
  return self.online.process_iter()
469
  else:
470
  logger.debug("No online update, only VAD")
471
+ return []
472
 
473
+ def finish(self) -> List[ASRToken]:
474
  """Finish processing by flushing any remaining text."""
475
  result = self.online.finish()
476
  self.current_online_chunk_buffer_size = 0
 
481
  """
482
  Get the unvalidated buffer in string format.
483
  """
484
+ return self.online.concatenate_tokens(self.online.transcript_buffer.buffer)