script to lib
Browse files
- README.md +12 -5
- parse_args.py +0 -52
- setup.py +44 -0
- whisper_fastapi_online_server.py +17 -27
- whisperlivekit/__init__.py +4 -0
- audio_processor.py → whisperlivekit/audio_processor.py +18 -15
- whisperlivekit/core.py +174 -0
- {diarization → whisperlivekit/diarization}/diarization_online.py +1 -1
- silero_vad_iterator.py → whisperlivekit/silero_vad_iterator.py +0 -0
- timed_objects.py → whisperlivekit/timed_objects.py +0 -0
- {web → whisperlivekit/web}/live_transcription.html +0 -0
- {whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/backends.py +1 -1
- {whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/online_asr.py +1 -1
- {whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/whisper_online.py +0 -89
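In short, this commit turns the standalone server script into an installable `whisperlivekit` library. A minimal sketch of the resulting embedding pattern, assembled from the diffs below (not a drop-in program; the full wiring lives in `whisper_fastapi_online_server.py`):

```python
# Sketch of the new library usage introduced by this commit.
# WhisperLiveKit is a singleton that parses the CLI defaults and loads the
# models; AudioProcessor picks up its configuration from that singleton.
from fastapi import FastAPI, WebSocket
from fastapi.responses import HTMLResponse
from whisperlivekit import WhisperLiveKit
from whisperlivekit.audio_processor import AudioProcessor

kit = WhisperLiveKit(model="tiny", diarization=False)  # kwargs override CLI defaults
app = FastAPI()

@app.get("/")
async def root():
    return HTMLResponse(kit.web_interface())  # bundled live_transcription.html

@app.websocket("/asr")
async def asr(websocket: WebSocket):
    processor = AudioProcessor()              # reuses the singleton's models
    await websocket.accept()
    results = await processor.create_tasks()  # transcription/diarization tasks
    # forward incoming audio bytes to the processor and stream `results`
    # back to the client -- see whisper_fastapi_online_server.py below
```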
README.md
CHANGED
@@ -5,7 +5,7 @@
 This project is based on [Whisper Streaming](https://github.com/ufal/whisper_streaming) and lets you transcribe audio directly from your browser. Simply launch the local server and grant microphone access. Everything runs locally on your machine ✨
 
 <p align="center">
-  <img src="
+  <img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/demo.png" alt="Demo Screenshot" width="730">
 </p>
 
 ### Differences from [Whisper Streaming](https://github.com/ufal/whisper_streaming)
@@ -24,20 +24,27 @@ This project is based on [Whisper Streaming](https://github.com/ufal/whisper_str
 - **FastAPI WebSocket Server** – Real-time speech-to-text processing with async FFmpeg streaming.
 - **JavaScript Client** – Ready-to-use MediaRecorder implementation for seamless client-side integration.
 
-
 ## Installation
 
+### Via pip
+
+```bash
+pip install whisperlivekit
+```
+
+### From source
+
 1. **Clone the Repository**:
 
    ```bash
   git clone https://github.com/QuentinFuxa/WhisperLiveKit
   cd WhisperLiveKit
+  pip install -e .
   ```
 
-
-
-1. **Dependencies**:
+### System Dependencies
+
+You need to install FFmpeg on your system:
 
 - Install system dependencies:
   ```bash
parse_args.py
DELETED
@@ -1,52 +0,0 @@
-
-import argparse
-from whisper_streaming_custom.whisper_online import add_shared_args
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Whisper FastAPI Online Server")
-    parser.add_argument(
-        "--host",
-        type=str,
-        default="localhost",
-        help="The host address to bind the server to.",
-    )
-    parser.add_argument(
-        "--port", type=int, default=8000, help="The port number to bind the server to."
-    )
-    parser.add_argument(
-        "--warmup-file",
-        type=str,
-        default=None,
-        dest="warmup_file",
-        help="""
-        The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
-        If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
-        If False, no warmup is performed.
-        """,
-    )
-
-    parser.add_argument(
-        "--confidence-validation",
-        type=bool,
-        default=False,
-        help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
-    )
-
-    parser.add_argument(
-        "--diarization",
-        type=bool,
-        default=True,
-        help="Whether to enable speaker diarization.",
-    )
-
-    parser.add_argument(
-        "--transcription",
-        type=bool,
-        default=True,
-        help="To disable to only see live diarization results.",
-    )
-
-    add_shared_args(parser)
-    args = parser.parse_args()
-    return args
setup.py
ADDED
@@ -0,0 +1,44 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="whisperlivekit",
+    version="0.1.0",
+    description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
+    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    author="Quentin Fuxa",
+    url="https://github.com/QuentinFuxa/WhisperLiveKit",
+    packages=find_packages(),
+    install_requires=[
+        "fastapi",
+        "ffmpeg-python",
+        "librosa",
+        "soundfile",
+        "faster-whisper",
+        "uvicorn",
+        "websockets",
+    ],
+    extras_require={
+        "diarization": ["diart"],
+        "vac": ["torch"],
+        "sentence": ["mosestokenizer", "wtpsplit"],
+    },
+    package_data={
+        'whisperlivekit': ['web/*.html'],
+    },
+    entry_points={
+        'console_scripts': [
+            'whisperlivekit-server=whisperlivekit.server:run_server',
+        ],
+    },
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Multimedia :: Sound/Audio :: Speech",
+    ],
+    python_requires=">=3.9",
+)
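The `diarization`, `vac`, and `sentence` extras keep the heavy optional dependencies out of a plain install. The package defers those imports until the matching feature is enabled, as `whisperlivekit/core.py` below does for diart; a minimal sketch of that pattern (the `load_diarization` helper here is hypothetical):

```python
# Sketch of the lazy-import pattern used for optional extras: diart is only
# imported when diarization is actually requested, so a plain install of
# whisperlivekit never pays for it.
def load_diarization(enabled: bool):
    if not enabled:
        return None
    # Requires the optional extra: pip install "whisperlivekit[diarization]"
    from whisperlivekit.diarization.diarization_online import DiartDiarization
    return DiartDiarization()
```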
whisper_fastapi_online_server.py
CHANGED
@@ -1,37 +1,26 @@
 from contextlib import asynccontextmanager
-
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 
-from
+from whisperlivekit import WhisperLiveKit
+from whisperlivekit.audio_processor import AudioProcessor
+
 import asyncio
 import logging
-
-from audio_processor import AudioProcessor
+import os
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logging.getLogger().setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
-
-
+kit = None
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    global
-
-        asr, tokenizer = backend_factory(args)
-        warmup_asr(asr, args.warmup_file)
-    else:
-        asr, tokenizer = None, None
-
-    if args.diarization:
-        from diarization.diarization_online import DiartDiarization
-        diarization = DiartDiarization()
-    else :
-        diarization = None
+    global kit
+    kit = WhisperLiveKit()
     yield
 
 app = FastAPI(lifespan=lifespan)
@@ -44,13 +33,9 @@ app.add_middleware(
 )
 
 
-# Load demo HTML for the root endpoint
-with open("web/live_transcription.html", "r", encoding="utf-8") as f:
-    html = f.read()
-
 @app.get("/")
 async def get():
-    return HTMLResponse(
+    return HTMLResponse(kit.web_interface())
 
 
 async def handle_websocket_results(websocket, results_generator):
@@ -64,12 +49,12 @@ async def handle_websocket_results(websocket, results_generator):
 
 @app.websocket("/asr")
 async def websocket_endpoint(websocket: WebSocket):
-    audio_processor = AudioProcessor(
+    audio_processor = AudioProcessor()
 
     await websocket.accept()
     logger.info("WebSocket connection opened.")
 
-    results_generator = await audio_processor.create_tasks(
+    results_generator = await audio_processor.create_tasks()
     websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
 
     try:
@@ -85,8 +70,13 @@ async def websocket_endpoint(websocket: WebSocket):
 
 if __name__ == "__main__":
     import uvicorn
-
+
+    temp_kit = WhisperLiveKit(transcription=False, diarization=False)
+
     uvicorn.run(
-        "whisper_fastapi_online_server:app",
+        "whisper_fastapi_online_server:app",
+        host=temp_kit.args.host,
+        port=temp_kit.args.port,
+        reload=True,
         log_level="info"
     )
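For reference, a hypothetical smoke-test client for the `/asr` endpoint. Beyond what the diff shows, everything here is an assumption: the address, the chunked `sample.webm` input, and the pacing. The diff only establishes that the endpoint accepts binary audio (decoded via FFmpeg) and streams results back over the same socket.

```python
# Hypothetical client sketch for the /asr WebSocket (not part of this commit).
# Assumes a locally running server and any FFmpeg-decodable audio file; the
# result payload is whatever handle_websocket_results() forwards.
import asyncio
import websockets  # already in install_requires

async def main():
    async with websockets.connect("ws://localhost:8000/asr") as ws:
        async def send_audio():
            with open("sample.webm", "rb") as f:  # any FFmpeg-decodable audio
                while chunk := f.read(4096):
                    await ws.send(chunk)
                    await asyncio.sleep(0.1)  # pace roughly like a live stream

        sender = asyncio.create_task(send_audio())
        try:
            async for message in ws:  # incremental transcription updates
                print(message)
        finally:
            sender.cancel()

asyncio.run(main())  # stop with Ctrl+C
```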
whisperlivekit/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .core import WhisperLiveKit, parse_args
+from .audio_processor import AudioProcessor
+
+__all__ = ['WhisperLiveKit', 'AudioProcessor', 'parse_args']
audio_processor.py → whisperlivekit/audio_processor.py
RENAMED
@@ -7,8 +7,9 @@ import logging
 import traceback
 from datetime import timedelta
 from typing import List, Dict, Any
-from timed_objects import ASRToken
-from whisper_streaming_custom.whisper_online import online_factory
+from whisperlivekit.timed_objects import ASRToken
+from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
+from whisperlivekit.core import WhisperLiveKit
 
 # Set up logging once
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -22,16 +23,19 @@ def format_time(seconds: float) -> str:
 class AudioProcessor:
     """
     Processes audio streams for transcription and diarization.
-    Handles audio processing, state management, and result formatting
+    Handles audio processing, state management, and result formatting.
     """
 
-    def __init__(self
+    def __init__(self):
         """Initialize the audio processor with configuration, models, and state."""
+
+        models = WhisperLiveKit()
+
         # Audio processing settings
-        self.args = args
+        self.args = models.args
         self.sample_rate = 16000
         self.channels = 1
-        self.samples_per_sec = int(self.sample_rate * args.min_chunk_size)
+        self.samples_per_sec = int(self.sample_rate * self.args.min_chunk_size)
         self.bytes_per_sample = 2
         self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
         self.max_bytes_per_sec = 32000 * 5  # 5 seconds of audio at 32 kHz
@@ -49,16 +53,17 @@
         self.last_response_content = ""
 
         # Models and processing
-        self.asr = asr
-        self.tokenizer = tokenizer
+        self.asr = models.asr
+        self.tokenizer = models.tokenizer
+        self.diarization = models.diarization
         self.ffmpeg_process = self.start_ffmpeg_decoder()
-        self.transcription_queue = asyncio.Queue() if args.transcription else None
-        self.diarization_queue = asyncio.Queue() if args.diarization else None
+        self.transcription_queue = asyncio.Queue() if self.args.transcription else None
+        self.diarization_queue = asyncio.Queue() if self.args.diarization else None
         self.pcm_buffer = bytearray()
 
         # Initialize transcription engine if enabled
-        if args.transcription:
-            self.online = online_factory(args, asr, tokenizer)
+        if self.args.transcription:
+            self.online = online_factory(self.args, models.asr, models.tokenizer)
 
     def convert_pcm_to_float(self, pcm_buffer):
         """Convert PCM buffer in s16le format to normalized NumPy array."""
@@ -362,10 +367,8 @@
         logger.warning(f"Traceback: {traceback.format_exc()}")
         await asyncio.sleep(0.5)  # Back off on error
 
-    async def create_tasks(self
+    async def create_tasks(self):
         """Create and start processing tasks."""
-        if diarization:
-            self.diarization = diarization
 
         tasks = []
         if self.args.transcription and self.online:
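A worked example of the chunk sizing in `__init__` above, using the default `min_chunk_size` of 0.5 s from `parse_args()`; the numbers follow directly from the constants in the diff:

```python
# Chunk sizing with the defaults: 16 kHz mono s16le audio, 0.5 s chunks.
sample_rate = 16000
min_chunk_size = 0.5                                 # seconds (CLI default)
samples_per_sec = int(sample_rate * min_chunk_size)  # 8000 samples per chunk
bytes_per_sec = samples_per_sec * 2                  # 16000 bytes per chunk (2 bytes/sample)
max_bytes_per_sec = 32000 * 5                        # 160000-byte cap on buffered audio
```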
whisperlivekit/core.py
ADDED
@@ -0,0 +1,174 @@
+from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
+from argparse import Namespace, ArgumentParser
+
+def parse_args():
+    parser = ArgumentParser(description="Whisper FastAPI Online Server")
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="The host address to bind the server to.",
+    )
+    parser.add_argument(
+        "--port", type=int, default=8000, help="The port number to bind the server to."
+    )
+    parser.add_argument(
+        "--warmup-file",
+        type=str,
+        default=None,
+        dest="warmup_file",
+        help="""
+        The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
+        If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
+        If False, no warmup is performed.
+        """,
+    )
+
+    parser.add_argument(
+        "--confidence-validation",
+        type=bool,
+        default=False,
+        help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
+    )
+
+    parser.add_argument(
+        "--diarization",
+        type=bool,
+        default=True,
+        help="Whether to enable speaker diarization.",
+    )
+
+    parser.add_argument(
+        "--transcription",
+        type=bool,
+        default=True,
+        help="To disable to only see live diarization results.",
+    )
+
+    parser.add_argument(
+        "--min-chunk-size",
+        type=float,
+        default=0.5,
+        help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="tiny",
+        choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
+            ","
+        ),
+        help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.",
+    )
+    parser.add_argument(
+        "--model_cache_dir",
+        type=str,
+        default=None,
+        help="Overriding the default model cache dir where models downloaded from the hub are saved",
+    )
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        default=None,
+        help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.",
+    )
+    parser.add_argument(
+        "--lan",
+        "--language",
+        type=str,
+        default="auto",
+        help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
+    )
+    parser.add_argument(
+        "--task",
+        type=str,
+        default="transcribe",
+        choices=["transcribe", "translate"],
+        help="Transcribe or translate.",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="faster-whisper",
+        choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
+        help="Load only this backend for Whisper processing.",
+    )
+    parser.add_argument(
+        "--vac",
+        action="store_true",
+        default=False,
+        help="Use VAC = voice activity controller. Recommended. Requires torch.",
+    )
+    parser.add_argument(
+        "--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
+    )
+    parser.add_argument(
+        "--vad",
+        action="store_true",
+        default=True,
+        help="Use VAD = voice activity detection, with the default parameters.",
+    )
+    parser.add_argument(
+        "--buffer_trimming",
+        type=str,
+        default="segment",
+        choices=["sentence", "segment"],
+        help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.',
+    )
+    parser.add_argument(
+        "--buffer_trimming_sec",
+        type=float,
+        default=15,
+        help="Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.",
+    )
+    parser.add_argument(
+        "-l",
+        "--log-level",
+        dest="log_level",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        help="Set the log level",
+        default="DEBUG",
+    )
+
+    args = parser.parse_args()
+    return args
+
+class WhisperLiveKit:
+    _instance = None
+    _initialized = False
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, **kwargs):
+        if WhisperLiveKit._initialized:
+            return
+
+        default_args = vars(parse_args())
+
+        merged_args = {**default_args, **kwargs}
+
+        self.args = Namespace(**merged_args)
+
+        self.asr = None
+        self.tokenizer = None
+        self.diarization = None
+
+        if self.args.transcription:
+            self.asr, self.tokenizer = backend_factory(self.args)
+            warmup_asr(self.asr, self.args.warmup_file)
+
+        if self.args.diarization:
+            from whisperlivekit.diarization.diarization_online import DiartDiarization
+            self.diarization = DiartDiarization()
+
+        WhisperLiveKit._initialized = True
+
+    def web_interface(self):
+        import pkg_resources
+        html_path = pkg_resources.resource_filename('whisperlivekit', 'web/live_transcription.html')
+        with open(html_path, "r", encoding="utf-8") as f:
+            html = f.read()
+        return html
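A minimal sketch of the singleton contract defined above: the first call parses defaults, applies keyword overrides, and loads the models; every later call, including the one inside `AudioProcessor.__init__`, returns the same initialized instance. The `model` and `diarization` values below are just example overrides:

```python
# Sketch of the singleton contract: construct once with overrides, then
# every subsequent WhisperLiveKit() returns the same initialized object.
from whisperlivekit import WhisperLiveKit

kit = WhisperLiveKit(model="tiny", diarization=False)  # parses defaults, loads ASR once
again = WhisperLiveKit(model="large-v3")               # too late: __init__ returns early
assert again is kit
assert kit.args.model == "tiny"
```

One consequence visible in the code: keyword overrides passed after the first initialization are silently ignored.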
{diarization → whisperlivekit/diarization}/diarization_online.py
RENAMED
@@ -8,7 +8,7 @@ import logging
 from diart import SpeakerDiarization, SpeakerDiarizationConfig
 from diart.inference import StreamingInference
 from diart.sources import AudioSource
-from timed_objects import SpeakerSegment
+from whisperlivekit.timed_objects import SpeakerSegment
 from diart.sources import MicrophoneAudioSource
 from rx.core import Observer
 from typing import Tuple, Any, List
silero_vad_iterator.py → whisperlivekit/silero_vad_iterator.py
RENAMED
File without changes

timed_objects.py → whisperlivekit/timed_objects.py
RENAMED
File without changes

{web → whisperlivekit/web}/live_transcription.html
RENAMED
File without changes
{whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/backends.py
RENAMED
@@ -6,7 +6,7 @@ import math
 import torch
 from typing import List
 import numpy as np
-from timed_objects import ASRToken
+from whisperlivekit.timed_objects import ASRToken
 
 logger = logging.getLogger(__name__)
 
{whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/online_asr.py
RENAMED
@@ -2,7 +2,7 @@ import sys
 import numpy as np
 import logging
 from typing import List, Tuple, Optional
-from timed_objects import ASRToken, Sentence, Transcript
+from whisperlivekit.timed_objects import ASRToken, Sentence, Transcript
 
 logger = logging.getLogger(__name__)
 
{whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/whisper_online.py
RENAMED
@@ -64,95 +64,6 @@ def create_tokenizer(lan):
     return WtPtok()
 
 
-def add_shared_args(parser):
-    """shared args for simulation (this entry point) and server
-    parser: argparse.ArgumentParser object
-    """
-    parser.add_argument(
-        "--min-chunk-size",
-        type=float,
-        default=0.5,
-        help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="tiny",
-        choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
-            ","
-        ),
-        help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.",
-    )
-    parser.add_argument(
-        "--model_cache_dir",
-        type=str,
-        default=None,
-        help="Overriding the default model cache dir where models downloaded from the hub are saved",
-    )
-    parser.add_argument(
-        "--model_dir",
-        type=str,
-        default=None,
-        help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.",
-    )
-    parser.add_argument(
-        "--lan",
-        "--language",
-        type=str,
-        default="auto",
-        help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
-    )
-    parser.add_argument(
-        "--task",
-        type=str,
-        default="transcribe",
-        choices=["transcribe", "translate"],
-        help="Transcribe or translate.",
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="faster-whisper",
-        choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
-        help="Load only this backend for Whisper processing.",
-    )
-    parser.add_argument(
-        "--vac",
-        action="store_true",
-        default=False,
-        help="Use VAC = voice activity controller. Recommended. Requires torch.",
-    )
-    parser.add_argument(
-        "--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
-    )
-    parser.add_argument(
-        "--vad",
-        action="store_true",
-        default=True,
-        help="Use VAD = voice activity detection, with the default parameters.",
-    )
-    parser.add_argument(
-        "--buffer_trimming",
-        type=str,
-        default="segment",
-        choices=["sentence", "segment"],
-        help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.',
-    )
-    parser.add_argument(
-        "--buffer_trimming_sec",
-        type=float,
-        default=15,
-        help="Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.",
-    )
-    parser.add_argument(
-        "-l",
-        "--log-level",
-        dest="log_level",
-        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
-        help="Set the log level",
-        default="DEBUG",
-    )
-
 def backend_factory(args):
     backend = args.backend
     if backend == "openai-api":