Merge pull request #128 from QuentinFuxa/vac-update
Browse files
- LICENSE +4 -9
- README.md +2 -2
- whisperlivekit/audio_processor.py +27 -4
- whisperlivekit/whisper_streaming_custom/online_asr.py +10 -9
LICENSE
CHANGED
@@ -1,10 +1,6 @@
|
|
1 |
MIT License
|
2 |
|
3 |
Copyright (c) 2025 Quentin Fuxa.
|
4 |
-
Based on:
|
5 |
-
- The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
|
6 |
-
- The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
|
7 |
-
- The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
|
8 |
|
9 |
Permission is hereby granted, free of charge, to any person obtaining a copy
|
10 |
of this software and associated documentation files (the "Software"), to deal
|
@@ -26,8 +22,7 @@ SOFTWARE.
|
|
26 |
|
27 |
---
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
- **
|
32 |
-
- **
|
33 |
-
- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart
|
|
|
1 |
MIT License
|
2 |
|
3 |
Copyright (c) 2025 Quentin Fuxa.
|
|
|
|
|
|
|
|
|
4 |
|
5 |
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
of this software and associated documentation files (the "Software"), to deal
|
|
|
22 |
|
23 |
---
|
24 |
|
25 |
+
Based on:
|
26 |
+
- **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming. The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
|
27 |
+
- **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad. The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
|
28 |
+
- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart. The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
|
|
README.md
CHANGED
@@ -9,8 +9,8 @@
|
|
9 |
<p align="center">
|
10 |
<a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
|
11 |
<a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
|
12 |
-
<a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9
|
13 |
-
<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/
|
14 |
</p>
|
15 |
|
16 |
## 🚀 Overview
|
|
|
9 |
<p align="center">
|
10 |
<a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
|
11 |
<a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
|
12 |
+
<a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9--3.13-dark_green"></a>
|
13 |
+
<a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT-dark_green"></a>
|
14 |
</p>
|
15 |
|
16 |
## 🚀 Overview
|
whisperlivekit/audio_processor.py
CHANGED
@@ -83,10 +83,33 @@ class AudioProcessor:
|
|
83 |
|
84 |
def start_ffmpeg_decoder(self):
|
85 |
"""Start FFmpeg process for WebM to PCM conversion."""
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
async def restart_ffmpeg(self):
|
92 |
"""Restart the FFmpeg process after failure."""
|
|
|
83 |
|
84 |
def start_ffmpeg_decoder(self):
    """Spawn an asynchronous FFmpeg process that decodes WebM into raw PCM.

    The process is configured to read WebM from ``pipe:0`` and to write
    ``s16le`` / ``pcm_s16le`` audio to ``pipe:1``, with ``self.channels``
    channels and ``self.sample_rate`` (passed as a string) as the rate.
    stdin, stdout and stderr are all piped.

    Returns:
        The value returned by ``run_async`` (presumably the process
        handle -- confirm against the ffmpeg-python documentation).

    Raises:
        FileNotFoundError: re-raised with installation instructions as its
            message when launching FFmpeg raises ``FileNotFoundError``.
    """
    try:
        # Build the pipeline step by step; everything stays inside the try
        # so the same exception handling covers every stage.
        webm_in = ffmpeg.input("pipe:0", format="webm")
        pcm_out = webm_in.output(
            "pipe:1",
            format="s16le",
            acodec="pcm_s16le",
            ac=self.channels,
            ar=str(self.sample_rate),
        )
        return pcm_out.run_async(
            pipe_stdin=True,
            pipe_stdout=True,
            pipe_stderr=True,
        )
    except FileNotFoundError:
        # The leading and trailing empty entries make the joined message
        # begin and end with a newline.
        message = "\n".join((
            "",
            "FFmpeg is not installed or not found in your system's PATH.",
            "Please install FFmpeg to enable audio processing.",
            "",
            "Installation instructions:",
            "",
            "# Ubuntu/Debian:",
            "sudo apt update && sudo apt install ffmpeg",
            "",
            "# macOS (using Homebrew):",
            "brew install ffmpeg",
            "",
            "# Windows:",
            "# 1. Download the latest static build from https://ffmpeg.org/download.html",
            "# 2. Extract the archive (e.g., to C:\\FFmpeg).",
            "# 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.",
            "",
            "After installation, please restart the application.",
            "",
        ))
        # Log first so the instructions are recorded even if a caller
        # swallows the exception.
        logger.error(message)
        raise FileNotFoundError(message)
|
113 |
|
114 |
async def restart_ffmpeg(self):
|
115 |
"""Restart the FFmpeg process after failure."""
|
whisperlivekit/whisper_streaming_custom/online_asr.py
CHANGED
@@ -343,15 +343,15 @@ class OnlineASRProcessor:
|
|
343 |
)
|
344 |
sentences.append(sentence)
|
345 |
return sentences
|
346 |
-
|
|
|
347 |
"""
|
348 |
Flush the remaining transcript when processing ends.
|
349 |
"""
|
350 |
remaining_tokens = self.transcript_buffer.buffer
|
351 |
-
|
352 |
-
logger.debug(f"Final non-committed transcript: {final_transcript}")
|
353 |
self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
|
354 |
-
return
|
355 |
|
356 |
def concatenate_tokens(
|
357 |
self,
|
@@ -384,7 +384,8 @@ class VACOnlineASRProcessor:
|
|
384 |
def __init__(self, online_chunk_size: float, *args, **kwargs):
|
385 |
self.online_chunk_size = online_chunk_size
|
386 |
self.online = OnlineASRProcessor(*args, **kwargs)
|
387 |
-
|
|
|
388 |
# Load a VAD model (e.g. Silero VAD)
|
389 |
import torch
|
390 |
model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
|
@@ -455,7 +456,7 @@ class VACOnlineASRProcessor:
|
|
455 |
self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
|
456 |
self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
|
457 |
|
458 |
-
def process_iter(self) ->
|
459 |
"""
|
460 |
Depending on the VAD status and the amount of accumulated audio,
|
461 |
process the current audio chunk.
|
@@ -467,9 +468,9 @@ class VACOnlineASRProcessor:
|
|
467 |
return self.online.process_iter()
|
468 |
else:
|
469 |
logger.debug("No online update, only VAD")
|
470 |
-
return
|
471 |
|
472 |
-
def finish(self) ->
|
473 |
"""Finish processing by flushing any remaining text."""
|
474 |
result = self.online.finish()
|
475 |
self.current_online_chunk_buffer_size = 0
|
@@ -480,4 +481,4 @@ class VACOnlineASRProcessor:
|
|
480 |
"""
|
481 |
Get the unvalidated buffer in string format.
|
482 |
"""
|
483 |
-
return self.online.concatenate_tokens(self.online.transcript_buffer.buffer)
|
|
|
343 |
)
|
344 |
sentences.append(sentence)
|
345 |
return sentences
|
346 |
+
|
347 |
+
def finish(self) -> List[ASRToken]:
    """Return the tokens still held in the transcript buffer at end of processing.

    Besides returning ``self.transcript_buffer.buffer``, this advances
    ``self.buffer_time_offset`` by the length of ``self.audio_buffer``
    divided by ``self.SAMPLING_RATE`` (presumably the buffered duration in
    seconds -- confirm against how ``audio_buffer`` is filled).
    """
    pending = self.transcript_buffer.buffer
    logger.debug(f"Final non-committed tokens: {pending}")

    # Account for the audio that is still buffered when processing stops.
    buffered_span = len(self.audio_buffer) / self.SAMPLING_RATE
    self.buffer_time_offset += buffered_span

    return pending
|
355 |
|
356 |
def concatenate_tokens(
|
357 |
self,
|
|
|
384 |
def __init__(self, online_chunk_size: float, *args, **kwargs):
|
385 |
self.online_chunk_size = online_chunk_size
|
386 |
self.online = OnlineASRProcessor(*args, **kwargs)
|
387 |
+
self.asr = self.online.asr
|
388 |
+
|
389 |
# Load a VAD model (e.g. Silero VAD)
|
390 |
import torch
|
391 |
model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
|
|
|
456 |
self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
|
457 |
self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
|
458 |
|
459 |
+
def process_iter(self) -> List[ASRToken]:
|
460 |
"""
|
461 |
Depending on the VAD status and the amount of accumulated audio,
|
462 |
process the current audio chunk.
|
|
|
468 |
return self.online.process_iter()
|
469 |
else:
|
470 |
logger.debug("No online update, only VAD")
|
471 |
+
return []
|
472 |
|
473 |
+
def finish(self) -> List[ASRToken]:
|
474 |
"""Finish processing by flushing any remaining text."""
|
475 |
result = self.online.finish()
|
476 |
self.current_online_chunk_buffer_size = 0
|
|
|
481 |
"""
|
482 |
Get the unvalidated buffer in string format.
|
483 |
"""
|
484 |
+
return self.online.concatenate_tokens(self.online.transcript_buffer.buffer)
|