Support parallel execution of Silero VAD
- app.py +28 -13
- cli.py +3 -1
- src/modelCache.py +17 -0
- src/vad.py +60 -28
- src/vadParallel.py +93 -23
- src/whisperContainer.py +12 -26
- tests/vad_test.py +2 -2
app.py
CHANGED
@@ -6,10 +6,9 @@ from io import StringIO
 import os
 import pathlib
 import tempfile
+from src.modelCache import ModelCache
 from src.vadParallel import ParallelContext, ParallelTranscription
 
-from src.whisperContainer import WhisperContainer, WhisperModelCache
-
 # External programs
 import ffmpeg
 
@@ -19,6 +18,7 @@ import gradio as gr
 from src.download import ExceededMaximumDuration, download_url
 from src.utils import slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
+from src.whisperContainer import WhisperContainer
 
 # Limitations (set to -1 to disable)
 DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
@@ -50,11 +50,13 @@ LANGUAGES = [
 ]
 
 class WhisperTranscriber:
-    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, delete_uploaded_files: bool = DELETE_UPLOADED_FILES):
-        self.model_cache =
+    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES):
+        self.model_cache = ModelCache()
         self.parallel_device_list = None
-        self.
+        self.gpu_parallel_context = None
+        self.cpu_parallel_context = None
         self.vad_process_timeout = vad_process_timeout
+        self.vad_cpu_cores = vad_cpu_cores
 
         self.vad_model = None
         self.inputAudioMaxDuration = input_audio_max_duration
@@ -142,17 +144,27 @@ class WhisperTranscriber:
             # No parallel devices, so just run the VAD and Whisper in sequence
             return vadModel.transcribe(audio_path, whisperCallable, vadConfig)
 
+        gpu_devices = self.parallel_device_list
+
+        if (gpu_devices is None or len(gpu_devices) == 0):
+            # No GPU devices specified, pass the current environment variable to the first GPU process. This may be NULL.
+            gpu_devices = [os.environ.get("CUDA_VISIBLE_DEVICES", None)]
+
         # Create parallel context if needed
-        if (self.
+        if (self.gpu_parallel_context is None):
            # Create a context wih processes and automatically clear the pool after 1 hour of inactivity
-            self.
+            self.gpu_parallel_context = ParallelContext(num_processes=len(gpu_devices), auto_cleanup_timeout_seconds=self.vad_process_timeout)
+        # We also need a CPU context for the VAD
+        if (self.cpu_parallel_context is None):
+            self.cpu_parallel_context = ParallelContext(num_processes=self.vad_cpu_cores, auto_cleanup_timeout_seconds=self.vad_process_timeout)
 
         parallel_vad = ParallelTranscription()
         return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable,
-                                                config=vadConfig,
+                                                config=vadConfig, cpu_device_count=self.vad_cpu_cores, gpu_devices=gpu_devices,
+                                                cpu_parallel_context=self.cpu_parallel_context, gpu_parallel_context=self.gpu_parallel_context)
 
     def _has_parallel_devices(self):
-        return self.parallel_device_list is not None and len(self.parallel_device_list) > 0
+        return (self.parallel_device_list is not None and len(self.parallel_device_list) > 0) or self.vad_cpu_cores > 1
 
     def _concat_prompt(self, prompt1, prompt2):
         if (prompt1 is None):
@@ -249,13 +261,15 @@ class WhisperTranscriber:
     def close(self):
         self.clear_cache()
 
-        if (self.
-            self.
+        if (self.gpu_parallel_context is not None):
+            self.gpu_parallel_context.close()
+        if (self.cpu_parallel_context is not None):
+            self.cpu_parallel_context.close()
 
 
 def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
-              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, vad_process_timeout: float = None):
-    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout)
+              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, vad_process_timeout: float = None, vad_cpu_cores: int = 1):
+    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores)
 
     # Specify a list of devices to use for parallel processing
     ui.set_parallel_devices(vad_parallel_devices)
@@ -303,6 +317,7 @@ if __name__ == '__main__':
     parser.add_argument("--default_model_name", type=str, default="medium", help="The default model name.")
     parser.add_argument("--default_vad", type=str, default="silero-vad", help="The default VAD.")
     parser.add_argument("--vad_parallel_devices", type=str, default="", help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
+    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
     parser.add_argument("--vad_process_timeout", type=float, default="1800", help="The number of seconds before inactivate processes are terminated. Use 0 to close processes immediately, or None for no timeout.")
 
     args = parser.parse_args().__dict__
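In practice the new arguments wire up as follows: the GPU pool gets one process per entry in the parallel device list, the CPU pool gets vad_cpu_cores processes for the Silero pass, and both pools are cleaned up after vad_process_timeout seconds of inactivity. A rough usage sketch (the argument values are illustrative and not part of the commit):

    from app import WhisperTranscriber

    # Two Whisper processes (one per GPU) and four CPU processes for the Silero VAD pass;
    # idle pools are torn down after 30 minutes.
    transcriber = WhisperTranscriber(vad_process_timeout=1800, vad_cpu_cores=4)
    transcriber.set_parallel_devices("0,1")  # comma-delimited CUDA device ids

    # ... run transcriptions through the Gradio UI or call the transcriber directly ...

    transcriber.close()  # closes both the GPU and the CPU ParallelContext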
cli.py
CHANGED
@@ -32,6 +32,7 @@ def cli():
     parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
     parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
     parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
+    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
     parser.add_argument("--vad_parallel_devices", type=str, default="", help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
 
     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
@@ -73,8 +74,9 @@ def cli():
     vad_max_merge_size = args.pop("vad_max_merge_size")
     vad_padding = args.pop("vad_padding")
     vad_prompt_window = args.pop("vad_prompt_window")
+    vad_cpu_cores = args.pop("vad_cpu_cores")
 
-    model = WhisperContainer(model_name, device=device, download_root=model_dir)
+    model = WhisperContainer(model_name, device=device, download_root=model_dir, vad_cpu_cores=vad_cpu_cores)
     transcriber = WhisperTranscriber(delete_uploaded_files=False)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
 
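The two flags split the work: --vad_cpu_cores sets how many CPU processes run the Silero pre-pass, while --vad_parallel_devices lists the CUDA devices that transcribe the resulting segments. A minimal argparse sketch of the same options (the device-string split below is only an illustration; the actual parsing happens in set_parallel_devices, which this diff does not touch):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--vad_cpu_cores", type=int, default=1,
                        help="The number of CPU cores to use for VAD pre-processing.")
    parser.add_argument("--vad_parallel_devices", type=str, default="",
                        help="A comma delimited list of CUDA devices to use for parallel processing.")
    args = parser.parse_args(["--vad_cpu_cores", "4", "--vad_parallel_devices", "0,1"])

    # e.g. "0,1" -> ["0", "1"]: one Whisper worker per listed device
    devices = [d.strip() for d in args.vad_parallel_devices.split(",") if d.strip()]
    print(args.vad_cpu_cores, devices)  # 4 ['0', '1']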
src/modelCache.py
ADDED
@@ -0,0 +1,17 @@
+class ModelCache:
+    def __init__(self):
+        self._cache = dict()
+
+    def get(self, model_key: str, model_factory):
+        result = self._cache.get(model_key)
+
+        if result is None:
+            result = model_factory()
+            self._cache[model_key] = result
+        return result
+
+    def clear(self):
+        self._cache.clear()
+
+# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
+GLOBAL_MODEL_CACHE = ModelCache()
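A minimal sketch of how the cache is used: get takes a key and a zero-argument factory, so the expensive load only happens on a miss and later callers with the same key receive the same object (the load_silero factory below is a stand-in, not part of the commit):

    from src.modelCache import ModelCache

    def load_silero():
        print("loading model...")          # stand-in for torch.hub.load(...)
        return {"name": "silero-vad"}

    cache = ModelCache()
    first = cache.get("VadSileroTranscription", load_silero)   # factory runs once
    second = cache.get("VadSileroTranscription", load_silero)  # served from the cache
    assert first is second

    cache.clear()  # the next get() will invoke the factory again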
src/vad.py
CHANGED
@@ -1,9 +1,11 @@
 from abc import ABC, abstractmethod
 from collections import Counter, deque
+import time
 
 from typing import Any, Deque, Iterator, List, Dict
 
 from pprint import pprint
+from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
 
 from src.segments import merge_timestamps
 from src.whisperContainer import WhisperCallback
@@ -76,7 +78,7 @@ class AbstractTranscription(ABC):
         return load_audio(str, self.sampling_rate, start_time, duration)
 
     @abstractmethod
-    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
+    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
         """
         Get the start and end timestamps of the sections that should be transcribed by this VAD method.
 
@@ -93,10 +95,10 @@ class AbstractTranscription(ABC):
         """
         return
 
-    def get_merged_timestamps(self,
+    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: TranscriptionConfig, total_duration: float):
         """
         Get the start and end timestamps of the sections that should be transcribed by this VAD method,
-        after merging the segments using the specified configuration.
+        after merging the given segments using the specified configuration.
 
         Parameters
         ----------
@@ -109,21 +111,17 @@ class AbstractTranscription(ABC):
         -------
         A list of start and end timestamps, in fractional seconds.
         """
-
-
-        merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size,
+        merged = merge_timestamps(timestamps, config.max_silent_period, config.max_merge_size,
                                   config.segment_padding_left, config.segment_padding_right)
 
         if config.non_speech_strategy != NonSpeechStrategy.SKIP:
-            max_audio_duration = get_audio_duration(audio)
-
             # Expand segments to include the gaps between them
             if (config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
                 # When we have a prompt window, we create speech segments betwen each segment if we exceed the merge size
-                merged = self.fill_gaps(merged, total_duration=
+                merged = self.fill_gaps(merged, total_duration=total_duration, max_expand_size=config.max_merge_size)
            elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
                 # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
-                merged = self.expand_gaps(merged, total_duration=
+                merged = self.expand_gaps(merged, total_duration=total_duration)
             else:
                 raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))
 
@@ -147,8 +145,11 @@ class AbstractTranscription(ABC):
         A list of start and end timestamps, in fractional seconds.
         """
 
+        max_audio_duration = get_audio_duration(audio)
+        timestamp_segments = self.get_transcribe_timestamps(audio, config, 0, max_audio_duration)
+
         # Get speech timestamps from full audio file
-        merged = self.get_merged_timestamps(
+        merged = self.get_merged_timestamps(timestamp_segments, config, max_audio_duration)
 
         # A deque of transcribed segments that is passed to the next segment as a prompt
         prompt_window = deque()
@@ -392,22 +393,41 @@ class AbstractTranscription(ABC):
 
 
 class VadSileroTranscription(AbstractTranscription):
-    def __init__(self, sampling_rate: int = 16000):
+    def __init__(self, sampling_rate: int = 16000, cache: ModelCache = None):
         super().__init__(sampling_rate=sampling_rate)
-
-        self.
-
-
-
-
-
+        self.model = None
+        self.cache = cache
+        self._initialize_model()
+
+    def _initialize_model(self):
+        if (self.cache is not None):
+            model_key = "VadSileroTranscription"
+            self.model, self.get_speech_timestamps = self.cache.get(model_key, self._create_model)
+            print("Loaded Silerio model from cache.")
+        else:
+            self.model, self.get_speech_timestamps = self._create_model()
+            print("Created Silerio model")
+
+    def _create_model(self):
+        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+
+        # Silero does not benefit from multi-threading
+        torch.set_num_threads(1) # JIT
+        (get_speech_timestamps, _, _, _, _) = utils
+
+        return model, get_speech_timestamps
+
+    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
         result = []
 
+        print("Getting timestamps from audio file: {}, start: {}, duration: {}".format(audio, start_time, end_time))
+        perf_start_time = time.perf_counter()
+
         # Divide procesisng of audio into chunks
-        chunk_start =
+        chunk_start = start_time
 
-        while (chunk_start <
-            chunk_duration = min(
+        while (chunk_start < end_time):
+            chunk_duration = min(end_time - chunk_start, VAD_MAX_PROCESSING_CHUNK)
 
             print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
             wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
@@ -421,23 +441,35 @@ class VadSileroTranscription(AbstractTranscription):
             result.extend(adjusted)
             chunk_start += chunk_duration
 
+        perf_end_time = time.perf_counter()
+        print("VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+
         return result
 
+    def __getstate__(self):
+        # We only need the sampling rate
+        return { 'sampling_rate': self.sampling_rate }
+
+    def __setstate__(self, state):
+        self.sampling_rate = state['sampling_rate']
+        self.model = None
+        # Use the global cache
+        self.cache = GLOBAL_MODEL_CACHE
+        self._initialize_model()
+
 # A very simple VAD that just marks every N seconds as speech
 class VadPeriodicTranscription(AbstractTranscription):
     def __init__(self, sampling_rate: int = 16000):
         super().__init__(sampling_rate=sampling_rate)
 
-    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig):
-        # Get duration in seconds
-        audio_duration = get_audio_duration(audio)
+    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig, start_time: float, end_time: float):
         result = []
 
         # Generate a timestamp every N seconds
-        start_timestamp =
+        start_timestamp = start_time
 
-        while (start_timestamp <
-            end_timestamp = min(start_timestamp + config.periodic_duration,
+        while (start_timestamp < end_time):
+            end_timestamp = min(start_timestamp + config.periodic_duration, end_time)
             segment_duration = end_timestamp - start_timestamp
 
             # Minimum duration is 1 second
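The __getstate__/__setstate__ pair is what makes VadSileroTranscription safe to hand to Pool.starmap: only the sampling rate crosses the process boundary, and each worker rebuilds (or reuses) the Silero model through its process-local GLOBAL_MODEL_CACHE. A self-contained sketch of the same pattern, with a dummy model standing in for the torch.hub load:

    import pickle

    _WORKER_CACHE = {}  # stands in for GLOBAL_MODEL_CACHE in each process

    class PicklableVad:
        def __init__(self, sampling_rate=16000):
            self.sampling_rate = sampling_rate
            self.model = self._load_model()

        def _load_model(self):
            # Real code: torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
            return _WORKER_CACHE.setdefault("silero", object())

        def __getstate__(self):
            # The loaded model is heavy to ship between processes, so keep only the config
            return {"sampling_rate": self.sampling_rate}

        def __setstate__(self, state):
            self.sampling_rate = state["sampling_rate"]
            self.model = self._load_model()  # re-created lazily in the worker process

    vad = PicklableVad()
    clone = pickle.loads(pickle.dumps(vad))  # what Pool.starmap does under the hood
    assert clone.sampling_rate == 16000 and clone.model is vad.model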
src/vadParallel.py
CHANGED
@@ -1,12 +1,12 @@
 import multiprocessing
 import threading
 import time
-from src.vad import AbstractTranscription, TranscriptionConfig
+from src.vad import AbstractTranscription, TranscriptionConfig, get_audio_duration
 from src.whisperContainer import WhisperCallback
 
 from multiprocessing import Pool
 
-from typing import List
+from typing import Any, Dict, List
 import os
 
 
@@ -76,19 +76,28 @@ class ParallelTranscriptionConfig(TranscriptionConfig):
         super().__init__(copy.non_speech_strategy, copy.segment_padding_left, copy.segment_padding_right, copy.max_silent_period, copy.max_merge_size, copy.max_prompt_window, initial_segment_index)
         self.device_id = device_id
         self.override_timestamps = override_timestamps
-
+
 class ParallelTranscription(AbstractTranscription):
+    # Silero VAD typically takes about 3 seconds per minute, so there's no need to split the chunks
+    # into smaller segments than 2 minute (min 6 seconds per CPU core)
+    MIN_CPU_CHUNK_SIZE_SECONDS = 2 * 60
+
     def __init__(self, sampling_rate: int = 16000):
         super().__init__(sampling_rate=sampling_rate)
 
-
-
+    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
+                            cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None):
+        total_duration = get_audio_duration(audio)
+
         # First, get the timestamps for the original audio
-
+        if (cpu_device_count > 1):
+            merged = self._get_merged_timestamps_parallel(transcription, audio, config, total_duration, cpu_device_count, cpu_parallel_context)
+        else:
+            merged = transcription.get_merged_timestamps(audio, config, total_duration)
 
         # Split into a list for each device
         # TODO: Split by time instead of by number of chunks
-        merged_split = list(self._split(merged, len(
+        merged_split = list(self._split(merged, len(gpu_devices)))
 
         # Parameters that will be passed to the transcribe function
         parameters = []
@@ -96,15 +105,15 @@ class ParallelTranscription(AbstractTranscription):
 
         for i in range(len(merged_split)):
             device_segment_list = list(merged_split[i])
-            device_id =
+            device_id = gpu_devices[i]
 
             if (len(device_segment_list) <= 0):
                 continue
 
-            print("Device " + device_id + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")
+            print("Device " + str(device_id) + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")
 
             # Create a new config with the given device ID
-            device_config = ParallelTranscriptionConfig(
+            device_config = ParallelTranscriptionConfig(device_id, device_segment_list, segment_index, config)
             segment_index += len(device_segment_list)
 
             parameters.append([audio, whisperCallable, device_config]);
@@ -119,12 +128,12 @@ class ParallelTranscription(AbstractTranscription):
 
         # Spawn a separate process for each device
         try:
-            if (
-
+            if (gpu_parallel_context is None):
+                gpu_parallel_context = ParallelContext(len(gpu_devices))
                 created_context = True
 
             # Get a pool of processes
-            pool =
+            pool = gpu_parallel_context.get_pool()
 
             # Run the transcription in parallel
             results = pool.starmap(self.transcribe, parameters)
@@ -140,29 +149,90 @@ class ParallelTranscription(AbstractTranscription):
 
         finally:
             # Return the pool to the context
-            if (
-
+            if (gpu_parallel_context is not None):
+                gpu_parallel_context.return_pool(pool)
             # Always close the context if we created it
             if (created_context):
-
+                gpu_parallel_context.close()
 
         return merged
 
-    def
+    def _get_merged_timestamps_parallel(self, transcription: AbstractTranscription, audio: str, config: TranscriptionConfig, total_duration: float,
+                                        cpu_device_count: int, cpu_parallel_context: ParallelContext = None):
+        parameters = []
+
+        chunk_size = max(total_duration / cpu_device_count, self.MIN_CPU_CHUNK_SIZE_SECONDS)
+        chunk_start = 0
+        cpu_device_id = 0
+
+        perf_start_time = time.perf_counter()
+
+        # Create chunks that will be processed on the CPU
+        while (chunk_start < total_duration):
+            chunk_end = min(chunk_start + chunk_size, total_duration)
+
+            print("Parallel VAD: Executing chunk from " + str(chunk_start) + " to " +
+                  str(chunk_end) + " on CPU device " + str(cpu_device_id))
+            parameters.append([audio, config, chunk_start, chunk_end]);
+
+            cpu_device_id += 1
+            chunk_start = chunk_end
+
+        created_context = False
+
+        # Spawn a separate process for each device
+        try:
+            if (cpu_parallel_context is None):
+                cpu_parallel_context = ParallelContext(cpu_device_count)
+                created_context = True
+
+            # Get a pool of processes
+            pool = cpu_parallel_context.get_pool()
+
+            # Run the transcription in parallel. Note that transcription must be picklable.
+            results = pool.starmap(transcription.get_transcribe_timestamps, parameters)
+
+            timestamps = []
+
+            # Flatten the results
+            for result in results:
+                timestamps.extend(result)
+
+            merged = transcription.get_merged_timestamps(timestamps, config, total_duration)
+
+            perf_end_time = time.perf_counter()
+            print("Parallel VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+            return merged
+
+        finally:
+            # Return the pool to the context
+            if (cpu_parallel_context is not None):
+                cpu_parallel_context.return_pool(pool)
+            # Always close the context if we created it
+            if (created_context):
+                cpu_parallel_context.close()
+
+    def get_transcribe_timestamps(self, audio: str, config: ParallelTranscriptionConfig, start_time: float, duration: float):
         return []
 
-    def get_merged_timestamps(self,
+    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
         # Override timestamps that will be processed
         if (config.override_timestamps is not None):
             print("Using override timestamps of size " + str(len(config.override_timestamps)))
             return config.override_timestamps
-        return super().get_merged_timestamps(
+        return super().get_merged_timestamps(timestamps, config, total_duration)
 
     def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig):
-        # Override device ID
-        if (
-
-
+        # Override device ID the first time
+        if (os.environ.get("INITIALIZED", None) is None):
+            os.environ["INITIALIZED"] = "1"
+
+            # Note that this may be None if the user didn't specify a device. In that case, Whisper will
+            # just use the default GPU device.
+            if (config.device_id is not None):
+                print("Using device " + config.device_id)
+                os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id
+
         return super().transcribe(audio, whisperCallable, config)
 
     def _split(self, a, n):
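To make the CPU fan-out concrete, here is a small standalone sketch of the chunking rule in _get_merged_timestamps_parallel: the audio is cut into at most cpu_device_count pieces, but never into pieces shorter than MIN_CPU_CHUNK_SIZE_SECONDS (two minutes), so short files do not spawn needless workers:

    MIN_CPU_CHUNK_SIZE_SECONDS = 2 * 60

    def vad_chunks(total_duration: float, cpu_device_count: int):
        # Return the (start, end) ranges handed to each VAD worker
        chunk_size = max(total_duration / cpu_device_count, MIN_CPU_CHUNK_SIZE_SECONDS)
        chunks = []
        chunk_start = 0.0
        while chunk_start < total_duration:
            chunk_end = min(chunk_start + chunk_size, total_duration)
            chunks.append((chunk_start, chunk_end))
            chunk_start = chunk_end
        return chunks

    # A 30 minute file on 8 cores splits into 8 chunks of 225 seconds each,
    # while a 5 minute file yields only 3 chunks (120 s, 120 s, 60 s) despite the 8 cores.
    print(vad_chunks(30 * 60, 8))
    print(vad_chunks(5 * 60, 8))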
src/whisperContainer.py
CHANGED
@@ -1,29 +1,10 @@
 # External programs
 import whisper
 
-class WhisperModelCache:
-    def __init__(self):
-        self._cache = dict()
-
-    def get(self, model_name, device: str = None):
-        key = model_name + ":" + (device if device else '')
-
-        result = self._cache.get(key)
-
-        if result is None:
-            print("Loading whisper model " + model_name)
-            result = whisper.load_model(name=model_name, device=device)
-            self._cache[key] = result
-        return result
-
-    def clear(self):
-        self._cache.clear()
-
-# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
-GLOBAL_WHISPER_MODEL_CACHE = WhisperModelCache()
+from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
 
 class WhisperContainer:
-    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache:
+    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache: ModelCache = None):
         self.model_name = model_name
         self.device = device
         self.download_root = download_root
@@ -36,12 +17,16 @@ class WhisperContainer:
         if self.model is None:
 
             if (self.cache is None):
-
-                self.model = whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)
+                self.model = self._create_model()
             else:
-
+                model_key = "WhisperContainer." + self.model_name + ":" + (self.device if self.device else '')
+                self.model = self.cache.get(model_key, self._create_model)
         return self.model
 
+    def _create_model(self):
+        print("Loading whisper model " + self.model_name)
+        return whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)
+
     def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
         """
         Create a WhisperCallback object that can be used to transcript audio files.
@@ -65,14 +50,15 @@ class WhisperContainer:
 
     # This is required for multiprocessing
     def __getstate__(self):
-        return { "model_name": self.model_name, "device": self.device }
+        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root }
 
     def __setstate__(self, state):
         self.model_name = state["model_name"]
         self.device = state["device"]
+        self.download_root = state["download_root"]
         self.model = None
         # Depickled objects must use the global cache
-        self.cache =
+        self.cache = GLOBAL_MODEL_CACHE
 
 
 class WhisperCallback:
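With the shared cache, Whisper models are keyed by model name and device, so the same model on two different GPUs occupies two separate slots while repeated requests for the same pair reuse one loaded model. A short sketch of that keying scheme, with stub factories in place of whisper.load_model:

    from src.modelCache import ModelCache

    cache = ModelCache()

    def make_key(model_name, device):
        # Same key format that WhisperContainer.get_model builds above
        return "WhisperContainer." + model_name + ":" + (device if device else '')

    loaded = cache.get(make_key("medium", "cuda:0"), lambda: "stub model on cuda:0")
    again  = cache.get(make_key("medium", "cuda:0"), lambda: "never called")
    other  = cache.get(make_key("medium", "cuda:1"), lambda: "stub model on cuda:1")

    assert loaded is again       # same name + device -> one load
    assert loaded is not other   # a different device gets its own entry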
tests/vad_test.py
CHANGED
@@ -5,7 +5,7 @@ import sys
 
 sys.path.append('../whisper-webui')
 
-from src.vad import AbstractTranscription, VadSileroTranscription
+from src.vad import AbstractTranscription, TranscriptionConfig, VadSileroTranscription
 
 class TestVad(unittest.TestCase):
     def __init__(self, *args, **kwargs):
@@ -55,7 +55,7 @@ class MockVadTranscription(AbstractTranscription):
         # For mocking, this just returns a simple numppy array
         return np.array([start_time_seconds, duration_seconds], dtype=np.float64)
 
-    def get_transcribe_timestamps(self, audio: str):
+    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, duration: float):
         result = []
 
         result.append( { 'start': 30, 'end': 60 } )