qfuxa committed
Commit 72d0416 · 1 Parent(s): 69c754e

script to lib
README.md CHANGED
@@ -5,7 +5,7 @@
 This project is based on [Whisper Streaming](https://github.com/ufal/whisper_streaming) and lets you transcribe audio directly from your browser. Simply launch the local server and grant microphone access. Everything runs locally on your machine ✨
 
 <p align="center">
-<img src="web/demo.png" alt="Demo Screenshot" width="730">
+<img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/demo.png" alt="Demo Screenshot" width="730">
 </p>
 
 ### Differences from [Whisper Streaming](https://github.com/ufal/whisper_streaming)
@@ -24,20 +24,27 @@ This project is based on [Whisper Streaming](https://github.com/ufal/whisper_str
 - **FastAPI WebSocket Server** – Real-time speech-to-text processing with async FFmpeg streaming.
 - **JavaScript Client** – Ready-to-use MediaRecorder implementation for seamless client-side integration.
 
-
 ## Installation
 
+### Via pip
+
+```bash
+pip install whisperlivekit
+```
+
+### From source
+
 1. **Clone the Repository**:
 
    ```bash
   git clone https://github.com/QuentinFuxa/WhisperLiveKit
   cd WhisperLiveKit
+   pip install -e .
   ```
 
-### How to Launch the Server
-
-1. **Dependencies**:
+### System Dependencies
+
+You need to install FFmpeg on your system:
 
 - Install system dependencies:
   ```bash
parse_args.py DELETED
@@ -1,52 +0,0 @@
-
-import argparse
-from whisper_streaming_custom.whisper_online import add_shared_args
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Whisper FastAPI Online Server")
-    parser.add_argument(
-        "--host",
-        type=str,
-        default="localhost",
-        help="The host address to bind the server to.",
-    )
-    parser.add_argument(
-        "--port", type=int, default=8000, help="The port number to bind the server to."
-    )
-    parser.add_argument(
-        "--warmup-file",
-        type=str,
-        default=None,
-        dest="warmup_file",
-        help="""
-        The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
-        If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
-        If False, no warmup is performed.
-        """,
-    )
-
-    parser.add_argument(
-        "--confidence-validation",
-        type=bool,
-        default=False,
-        help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
-    )
-
-    parser.add_argument(
-        "--diarization",
-        type=bool,
-        default=True,
-        help="Whether to enable speaker diarization.",
-    )
-
-    parser.add_argument(
-        "--transcription",
-        type=bool,
-        default=True,
-        help="To disable to only see live diarization results.",
-    )
-
-    add_shared_args(parser)
-    args = parser.parse_args()
-    return args
setup.py ADDED
@@ -0,0 +1,44 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="whisperlivekit",
+    version="0.1.0",
+    description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
+    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    author="Quentin Fuxa",
+    url="https://github.com/QuentinFuxa/WhisperLiveKit",
+    packages=find_packages(),
+    install_requires=[
+        "fastapi",
+        "ffmpeg-python",
+        "librosa",
+        "soundfile",
+        "faster-whisper",
+        "uvicorn",
+        "websockets",
+    ],
+    extras_require={
+        "diarization": ["diart"],
+        "vac": ["torch"],
+        "sentence": ["mosestokenizer", "wtpsplit"],
+    },
+    package_data={
+        'whisperlivekit': ['web/*.html'],
+    },
+    entry_points={
+        'console_scripts': [
+            'whisperlivekit-server=whisperlivekit.server:run_server',
+        ],
+    },
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Multimedia :: Sound/Audio :: Speech",
+    ],
+    python_requires=">=3.9",
+)
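
The new packaging declares the optional features as extras, so diarization support is installed with e.g. `pip install "whisperlivekit[diarization]"`. A minimal sketch for verifying an install, using only the standard library; note that the `whisperlivekit.server` module the console script points at is not part of this commit, so its presence is an assumption here:

```python
# Sanity-check an installed whisperlivekit package and its console script.
from importlib.metadata import version, entry_points

print(version("whisperlivekit"))  # expected: 0.1.0

eps = entry_points()
# Python 3.10+ exposes .select(); on 3.9, entry_points() returns a dict by group.
scripts = eps.select(group="console_scripts") if hasattr(eps, "select") else eps["console_scripts"]
print([ep.value for ep in scripts if ep.name == "whisperlivekit-server"])
# expected: ['whisperlivekit.server:run_server']
```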
whisper_fastapi_online_server.py CHANGED
@@ -1,37 +1,26 @@
 from contextlib import asynccontextmanager
-
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 
-from whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
+from whisperlivekit import WhisperLiveKit
+from whisperlivekit.audio_processor import AudioProcessor
+
 import asyncio
 import logging
-from parse_args import parse_args
-from audio_processor import AudioProcessor
+import os
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logging.getLogger().setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
-args = parse_args()
-
+kit = None
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    global asr, tokenizer, diarization
-    if args.transcription:
-        asr, tokenizer = backend_factory(args)
-        warmup_asr(asr, args.warmup_file)
-    else:
-        asr, tokenizer = None, None
-
-    if args.diarization:
-        from diarization.diarization_online import DiartDiarization
-        diarization = DiartDiarization()
-    else :
-        diarization = None
+    global kit
+    kit = WhisperLiveKit()
     yield
 
 app = FastAPI(lifespan=lifespan)
@@ -44,13 +33,9 @@ app.add_middleware(
 )
 
 
-# Load demo HTML for the root endpoint
-with open("web/live_transcription.html", "r", encoding="utf-8") as f:
-    html = f.read()
-
 @app.get("/")
 async def get():
-    return HTMLResponse(html)
+    return HTMLResponse(kit.web_interface())
 
 
 async def handle_websocket_results(websocket, results_generator):
@@ -64,12 +49,12 @@ async def handle_websocket_results(websocket, results_generator):
 
 @app.websocket("/asr")
 async def websocket_endpoint(websocket: WebSocket):
-    audio_processor = AudioProcessor(args, asr, tokenizer)
+    audio_processor = AudioProcessor()
 
     await websocket.accept()
     logger.info("WebSocket connection opened.")
 
-    results_generator = await audio_processor.create_tasks(diarization)
+    results_generator = await audio_processor.create_tasks()
    websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
 
    try:
@@ -85,8 +70,13 @@ async def websocket_endpoint(websocket: WebSocket):
 
 if __name__ == "__main__":
     import uvicorn
 
+    temp_kit = WhisperLiveKit(transcription=False, diarization=False)
+
     uvicorn.run(
-        "whisper_fastapi_online_server:app", host=args.host, port=args.port, reload=True,
+        "whisper_fastapi_online_server:app",
+        host=temp_kit.args.host,
+        port=temp_kit.args.port,
+        reload=True,
         log_level="info"
     )
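
The server still exposes the same `/asr` WebSocket endpoint on the default host/port. A minimal client sketch, not part of this commit: it assumes the endpoint accepts the same container format the browser's MediaRecorder produces (e.g. WebM/Opus), since the server pipes the raw bytes into FFmpeg for decoding, and that `sample.webm` is a hypothetical local recording:

```python
import asyncio
import websockets  # declared in install_requires

async def stream(path: str):
    async with websockets.connect("ws://localhost:8000/asr") as ws:
        async def send_audio():
            with open(path, "rb") as f:
                while chunk := f.read(4096):
                    await ws.send(chunk)
                    await asyncio.sleep(0.25)  # roughly pace the file like a live mic

        async def print_results():
            async for message in ws:  # JSON transcription/diarization updates
                print(message)

        await asyncio.gather(send_audio(), print_results())

asyncio.run(stream("sample.webm"))  # hypothetical input file
```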
whisperlivekit/__init__.py ADDED
@@ -0,0 +1,4 @@
+from .core import WhisperLiveKit, parse_args
+from .audio_processor import AudioProcessor
+
+__all__ = ['WhisperLiveKit', 'AudioProcessor', 'parse_args']
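
The package root re-exports the public names, so a downstream application needs only one import line. A minimal sketch of the intended entry point (constructing `WhisperLiveKit` parses CLI defaults and loads models, per `core.py` below):

```python
from whisperlivekit import WhisperLiveKit, AudioProcessor

kit = WhisperLiveKit()        # parses args and loads the configured models once
processor = AudioProcessor()  # picks models up from the same singleton
```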
audio_processor.py → whisperlivekit/audio_processor.py RENAMED
@@ -7,8 +7,9 @@ import logging
 import traceback
 from datetime import timedelta
 from typing import List, Dict, Any
-from timed_objects import ASRToken
-from whisper_streaming_custom.whisper_online import online_factory
+from whisperlivekit.timed_objects import ASRToken
+from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
+from whisperlivekit.core import WhisperLiveKit
 
 # Set up logging once
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -22,16 +23,19 @@ def format_time(seconds: float) -> str:
 class AudioProcessor:
     """
     Processes audio streams for transcription and diarization.
-    Handles audio processing, state management, and result formatting in a single class.
+    Handles audio processing, state management, and result formatting.
     """
 
-    def __init__(self, args, asr, tokenizer):
+    def __init__(self):
         """Initialize the audio processor with configuration, models, and state."""
+
+        models = WhisperLiveKit()
+
         # Audio processing settings
-        self.args = args
+        self.args = models.args
         self.sample_rate = 16000
         self.channels = 1
-        self.samples_per_sec = int(self.sample_rate * args.min_chunk_size)
+        self.samples_per_sec = int(self.sample_rate * self.args.min_chunk_size)
         self.bytes_per_sample = 2
         self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
         self.max_bytes_per_sec = 32000 * 5  # 5 seconds of audio at 32 kHz
@@ -49,16 +53,17 @@ class AudioProcessor:
         self.last_response_content = ""
 
         # Models and processing
-        self.asr = asr
-        self.tokenizer = tokenizer
+        self.asr = models.asr
+        self.tokenizer = models.tokenizer
+        self.diarization = models.diarization
         self.ffmpeg_process = self.start_ffmpeg_decoder()
-        self.transcription_queue = asyncio.Queue() if args.transcription else None
-        self.diarization_queue = asyncio.Queue() if args.diarization else None
+        self.transcription_queue = asyncio.Queue() if self.args.transcription else None
+        self.diarization_queue = asyncio.Queue() if self.args.diarization else None
         self.pcm_buffer = bytearray()
 
         # Initialize transcription engine if enabled
-        if args.transcription:
-            self.online = online_factory(args, asr, tokenizer)
+        if self.args.transcription:
+            self.online = online_factory(self.args, models.asr, models.tokenizer)
 
     def convert_pcm_to_float(self, pcm_buffer):
         """Convert PCM buffer in s16le format to normalized NumPy array."""
@@ -362,10 +367,8 @@ class AudioProcessor:
             logger.warning(f"Traceback: {traceback.format_exc()}")
             await asyncio.sleep(0.5)  # Back off on error
 
-    async def create_tasks(self, diarization=None):
+    async def create_tasks(self):
         """Create and start processing tasks."""
-        if diarization:
-            self.diarization = diarization
 
         tasks = []
         if self.args.transcription and self.online:
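
Because `AudioProcessor` now reads its configuration and models off the `WhisperLiveKit` singleton, per-connection setup shrinks to a zero-argument constructor. A sketch of the pattern the server follows, under the assumption that the singleton has been configured first (the server does this in its FastAPI lifespan hook) and that audio is being fed into the processor's FFmpeg pipeline for results to appear:

```python
import asyncio
from whisperlivekit import WhisperLiveKit, AudioProcessor

async def handle_connection():
    WhisperLiveKit(diarization=False)  # configure and load models once, up front
    processor = AudioProcessor()       # no constructor arguments needed anymore
    results_generator = await processor.create_tasks()
    async for state in results_generator:  # formatted transcription updates
        print(state)

asyncio.run(handle_connection())
```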
whisperlivekit/core.py ADDED
@@ -0,0 +1,174 @@
+from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory, warmup_asr
+from argparse import Namespace, ArgumentParser
+
+def parse_args():
+    parser = ArgumentParser(description="Whisper FastAPI Online Server")
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="The host address to bind the server to.",
+    )
+    parser.add_argument(
+        "--port", type=int, default=8000, help="The port number to bind the server to."
+    )
+    parser.add_argument(
+        "--warmup-file",
+        type=str,
+        default=None,
+        dest="warmup_file",
+        help="""
+        The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
+        If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
+        If False, no warmup is performed.
+        """,
+    )
+
+    parser.add_argument(
+        "--confidence-validation",
+        type=bool,
+        default=False,
+        help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
+    )
+
+    parser.add_argument(
+        "--diarization",
+        type=bool,
+        default=True,
+        help="Whether to enable speaker diarization.",
+    )
+
+    parser.add_argument(
+        "--transcription",
+        type=bool,
+        default=True,
+        help="To disable to only see live diarization results.",
+    )
+
+    parser.add_argument(
+        "--min-chunk-size",
+        type=float,
+        default=0.5,
+        help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="tiny",
+        choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
+            ","
+        ),
+        help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.",
+    )
+    parser.add_argument(
+        "--model_cache_dir",
+        type=str,
+        default=None,
+        help="Overriding the default model cache dir where models downloaded from the hub are saved",
+    )
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        default=None,
+        help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.",
+    )
+    parser.add_argument(
+        "--lan",
+        "--language",
+        type=str,
+        default="auto",
+        help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
+    )
+    parser.add_argument(
+        "--task",
+        type=str,
+        default="transcribe",
+        choices=["transcribe", "translate"],
+        help="Transcribe or translate.",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="faster-whisper",
+        choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
+        help="Load only this backend for Whisper processing.",
+    )
+    parser.add_argument(
+        "--vac",
+        action="store_true",
+        default=False,
+        help="Use VAC = voice activity controller. Recommended. Requires torch.",
+    )
+    parser.add_argument(
+        "--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
+    )
+    parser.add_argument(
+        "--vad",
+        action="store_true",
+        default=True,
+        help="Use VAD = voice activity detection, with the default parameters.",
+    )
+    parser.add_argument(
+        "--buffer_trimming",
+        type=str,
+        default="segment",
+        choices=["sentence", "segment"],
+        help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.',
+    )
+    parser.add_argument(
+        "--buffer_trimming_sec",
+        type=float,
+        default=15,
+        help="Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.",
+    )
+    parser.add_argument(
+        "-l",
+        "--log-level",
+        dest="log_level",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        help="Set the log level",
+        default="DEBUG",
+    )
+
+    args = parser.parse_args()
+    return args
+
+class WhisperLiveKit:
+    _instance = None
+    _initialized = False
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, **kwargs):
+        if WhisperLiveKit._initialized:
+            return
+
+        default_args = vars(parse_args())
+
+        merged_args = {**default_args, **kwargs}
+
+        self.args = Namespace(**merged_args)
+
+        self.asr = None
+        self.tokenizer = None
+        self.diarization = None
+
+        if self.args.transcription:
+            self.asr, self.tokenizer = backend_factory(self.args)
+            warmup_asr(self.asr, self.args.warmup_file)
+
+        if self.args.diarization:
+            from whisperlivekit.diarization.diarization_online import DiartDiarization
+            self.diarization = DiartDiarization()
+
+        WhisperLiveKit._initialized = True
+
+    def web_interface(self):
+        import pkg_resources
+        html_path = pkg_resources.resource_filename('whisperlivekit', 'web/live_transcription.html')
+        with open(html_path, "r", encoding="utf-8") as f:
+            html = f.read()
+        return html
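
Worth noting in `core.py`: `WhisperLiveKit` is a process-wide singleton. The first construction parses CLI defaults, overlays any keyword overrides, and loads models; every later call returns the same instance and silently ignores its arguments. A small sketch of that behavior (the keyword names are those defined in `parse_args`; since `parse_args` reads `sys.argv`, this only runs cleanly in a script invoked without unrecognized flags):

```python
from whisperlivekit import WhisperLiveKit

first = WhisperLiveKit(diarization=False)  # parses args, loads models once
second = WhisperLiveKit(diarization=True)  # __init__ returns early: same object
assert first is second
assert first.args.diarization is False     # the later kwargs were ignored
```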
{diarization → whisperlivekit/diarization}/diarization_online.py RENAMED
@@ -8,7 +8,7 @@ import logging
 from diart import SpeakerDiarization, SpeakerDiarizationConfig
 from diart.inference import StreamingInference
 from diart.sources import AudioSource
-from timed_objects import SpeakerSegment
+from whisperlivekit.timed_objects import SpeakerSegment
 from diart.sources import MicrophoneAudioSource
 from rx.core import Observer
 from typing import Tuple, Any, List
silero_vad_iterator.py → whisperlivekit/silero_vad_iterator.py RENAMED
File without changes
timed_objects.py → whisperlivekit/timed_objects.py RENAMED
File without changes
{web → whisperlivekit/web}/live_transcription.html RENAMED
File without changes
{whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/backends.py RENAMED
@@ -6,7 +6,7 @@ import math
 import torch
 from typing import List
 import numpy as np
-from timed_objects import ASRToken
+from whisperlivekit.timed_objects import ASRToken
 
 logger = logging.getLogger(__name__)
{whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/online_asr.py RENAMED
@@ -2,7 +2,7 @@ import sys
 import numpy as np
 import logging
 from typing import List, Tuple, Optional
-from timed_objects import ASRToken, Sentence, Transcript
+from whisperlivekit.timed_objects import ASRToken, Sentence, Transcript
 
 logger = logging.getLogger(__name__)
{whisper_streaming_custom → whisperlivekit/whisper_streaming_custom}/whisper_online.py RENAMED
@@ -64,95 +64,6 @@ def create_tokenizer(lan):
     return WtPtok()
 
 
-def add_shared_args(parser):
-    """shared args for simulation (this entry point) and server
-    parser: argparse.ArgumentParser object
-    """
-    parser.add_argument(
-        "--min-chunk-size",
-        type=float,
-        default=0.5,
-        help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="tiny",
-        choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
-            ","
-        ),
-        help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.",
-    )
-    parser.add_argument(
-        "--model_cache_dir",
-        type=str,
-        default=None,
-        help="Overriding the default model cache dir where models downloaded from the hub are saved",
-    )
-    parser.add_argument(
-        "--model_dir",
-        type=str,
-        default=None,
-        help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.",
-    )
-    parser.add_argument(
-        "--lan",
-        "--language",
-        type=str,
-        default="auto",
-        help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
-    )
-    parser.add_argument(
-        "--task",
-        type=str,
-        default="transcribe",
-        choices=["transcribe", "translate"],
-        help="Transcribe or translate.",
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="faster-whisper",
-        choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
-        help="Load only this backend for Whisper processing.",
-    )
-    parser.add_argument(
-        "--vac",
-        action="store_true",
-        default=False,
-        help="Use VAC = voice activity controller. Recommended. Requires torch.",
-    )
-    parser.add_argument(
-        "--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
-    )
-    parser.add_argument(
-        "--vad",
-        action="store_true",
-        default=True,
-        help="Use VAD = voice activity detection, with the default parameters.",
-    )
-    parser.add_argument(
-        "--buffer_trimming",
-        type=str,
-        default="segment",
-        choices=["sentence", "segment"],
-        help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.',
-    )
-    parser.add_argument(
-        "--buffer_trimming_sec",
-        type=float,
-        default=15,
-        help="Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.",
-    )
-    parser.add_argument(
-        "-l",
-        "--log-level",
-        dest="log_level",
-        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
-        help="Set the log level",
-        default="DEBUG",
-    )
-
 def backend_factory(args):
     backend = args.backend
     if backend == "openai-api":