Spaces:

langtech-innovation
/

WhisperLiveKitDiarization

Paused

App Files Files Community

Quentin Fuxa committed on May 28

Commit

9e3b2a9

2 Parent(s): 26ff2a6 d8ecad0

Merge pull request #128 from QuentinFuxa/vac-update

Browse files

Files changed (4) hide show

LICENSE +4 -9
README.md +2 -2
whisperlivekit/audio_processor.py +27 -4
whisperlivekit/whisper_streaming_custom/online_asr.py +10 -9

LICENSE CHANGED Viewed

@@ -1,10 +1,6 @@
 MIT License
 Copyright (c) 2025 Quentin Fuxa.
-Based on:
-- The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
-- The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
-- The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,8 +22,7 @@ SOFTWARE.
 ---
-Third-party components included in this software:
-- **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming
-- **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad
-- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart

 MIT License
 Copyright (c) 2025 Quentin Fuxa.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 ---
+Based on:
+- **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming. The original work by ÚFAL. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
+- **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad. The work by Snakers4 (silero-vad). License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
+- **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart. The work in Diart by juanmc2005. License: https://github.com/juanmc2005/diart/blob/main/LICENSE

README.md CHANGED Viewed

@@ -9,8 +9,8 @@
 <p align="center">
   <a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
   <a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
-  <a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-dark_green"></a>
-  <a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/QuentinFuxa/WhisperLiveKit?color=blue"></a>
 </p>
 ## 🚀 Overview

 <p align="center">
   <a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
   <a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
+  <a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9--3.13-dark_green"></a>
+  <a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT-dark_green"></a>
 </p>
 ## 🚀 Overview

whisperlivekit/audio_processor.py CHANGED Viewed

@@ -83,10 +83,33 @@ class AudioProcessor:
     def start_ffmpeg_decoder(self):
         """Start FFmpeg process for WebM to PCM conversion."""
-        return (ffmpeg.input("pipe:0", format="webm")
-                .output("pipe:1", format="s16le", acodec="pcm_s16le",
-                        ac=self.channels, ar=str(self.sample_rate))
-                .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
     async def restart_ffmpeg(self):
         """Restart the FFmpeg process after failure."""

     def start_ffmpeg_decoder(self):
         """Start FFmpeg process for WebM to PCM conversion."""
+        try:
+            return (ffmpeg.input("pipe:0", format="webm")
+                    .output("pipe:1", format="s16le", acodec="pcm_s16le",
+                            ac=self.channels, ar=str(self.sample_rate))
+                    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
+        except FileNotFoundError:
+            error = """
+            FFmpeg is not installed or not found in your system's PATH.
+            Please install FFmpeg to enable audio processing.
+            Installation instructions:
+            # Ubuntu/Debian:
+            sudo apt update && sudo apt install ffmpeg
+            # macOS (using Homebrew):
+            brew install ffmpeg
+            # Windows:
+            # 1. Download the latest static build from https://ffmpeg.org/download.html
+            # 2. Extract the archive (e.g., to C:\\FFmpeg).
+            # 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.
+            After installation, please restart the application.
+            """
+            logger.error(error)
+            raise FileNotFoundError(error)
     async def restart_ffmpeg(self):
         """Restart the FFmpeg process after failure."""

whisperlivekit/whisper_streaming_custom/online_asr.py CHANGED Viewed

@@ -343,15 +343,15 @@ class OnlineASRProcessor:
                 )
                 sentences.append(sentence)
         return sentences
-    def finish(self) -> Transcript:
         """
         Flush the remaining transcript when processing ends.
         """
         remaining_tokens = self.transcript_buffer.buffer
-        final_transcript = self.concatenate_tokens(remaining_tokens)
-        logger.debug(f"Final non-committed transcript: {final_transcript}")
         self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
-        return final_transcript
     def concatenate_tokens(
         self,
@@ -384,7 +384,8 @@ class VACOnlineASRProcessor:
     def __init__(self, online_chunk_size: float, *args, **kwargs):
         self.online_chunk_size = online_chunk_size
         self.online = OnlineASRProcessor(*args, **kwargs)
         # Load a VAD model (e.g. Silero VAD)
         import torch
         model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
@@ -455,7 +456,7 @@ class VACOnlineASRProcessor:
                 self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
                 self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
-    def process_iter(self) -> Transcript:
         """
         Depending on the VAD status and the amount of accumulated audio,
         process the current audio chunk.
@@ -467,9 +468,9 @@ class VACOnlineASRProcessor:
             return self.online.process_iter()
         else:
             logger.debug("No online update, only VAD")
-            return Transcript(None, None, "")
-    def finish(self) -> Transcript:
         """Finish processing by flushing any remaining text."""
         result = self.online.finish()
         self.current_online_chunk_buffer_size = 0
@@ -480,4 +481,4 @@ class VACOnlineASRProcessor:
         """
         Get the unvalidated buffer in string format.
         """
-        return self.online.concatenate_tokens(self.online.transcript_buffer.buffer).text

                 )
                 sentences.append(sentence)
         return sentences
+    def finish(self) -> List[ASRToken]:
         """
         Flush the remaining transcript when processing ends.
         """
         remaining_tokens = self.transcript_buffer.buffer
+        logger.debug(f"Final non-committed tokens: {remaining_tokens}")
         self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
+        return remaining_tokens
     def concatenate_tokens(
         self,
     def __init__(self, online_chunk_size: float, *args, **kwargs):
         self.online_chunk_size = online_chunk_size
         self.online = OnlineASRProcessor(*args, **kwargs)
+        self.asr = self.online.asr
         # Load a VAD model (e.g. Silero VAD)
         import torch
         model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
                 self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
                 self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
+    def process_iter(self) -> List[ASRToken]:
         """
         Depending on the VAD status and the amount of accumulated audio,
         process the current audio chunk.
             return self.online.process_iter()
         else:
             logger.debug("No online update, only VAD")
+            return []
+    def finish(self) -> List[ASRToken]:
         """Finish processing by flushing any remaining text."""
         result = self.online.finish()
         self.current_online_chunk_buffer_size = 0
         """
         Get the unvalidated buffer in string format.
         """
+        return self.online.concatenate_tokens(self.online.transcript_buffer.buffer)