Process VAD in chunks of up to 1 hour
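In outline, the change reads the audio file's duration up front, walks the file in windows of at most VAD_MAX_PROCESSING_CHUNK seconds, runs the Silero VAD on each window, and shifts the detected timestamps back to positions relative to the whole file. A minimal standalone sketch of that loop, assuming a detect_speech(start, duration) callable as a hypothetical stand-in for the Silero call (the other names follow the diff below):

VAD_MAX_PROCESSING_CHUNK = 60 * 60  # process at most 60 minutes of audio at a time

def chunked_vad(audio_duration: float, detect_speech) -> list:
    result = []
    chunk_start = 0.0

    while chunk_start < audio_duration:
        # Never hand the VAD more than one chunk's worth of audio
        chunk_duration = min(audio_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)

        # Timestamps come back relative to the chunk, so shift them
        # by chunk_start to make them relative to the whole file
        for ts in detect_speech(chunk_start, chunk_duration):
            result.append({'start': ts['start'] + chunk_start,
                           'end': ts['end'] + chunk_start})

        chunk_start += chunk_duration

    return result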
src/vad.py (+28 -7)
@@ -34,6 +34,8 @@ TRANSCRIBE_NON_SPEECH = False
 # Minimum size of segments to process
 MIN_SEGMENT_DURATION = 1
 
+VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
+
 class AbstractTranscription(ABC):
     def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
@@ -89,7 +91,7 @@ class AbstractTranscription(ABC):
         pprint(merged)
 
         if self.transcribe_non_speech:
-            max_audio_duration =
+            max_audio_duration = get_audio_duration(audio)
 
             # Expand segments to include the gaps between them
             merged = self.expand_gaps(merged, total_duration=max_audio_duration)
@@ -120,7 +122,7 @@ class AbstractTranscription(ABC):
             print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ", segment_duration, "expanded: ", segment_expand_amount)
             segment_result = whisperCallable(segment_audio)
 
-            adjusted_segments = self.
+            adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
 
             # Append to output
             result['text'] += segment_result['text']
@@ -198,7 +200,7 @@ class AbstractTranscription(ABC):
 
         return result
 
-    def
+    def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
         result = []
 
         for segment in segments:
@@ -303,10 +305,26 @@ class VadSileroTranscription(AbstractTranscription):
         (self.get_speech_timestamps, _, _, _, _) = utils
 
     def get_transcribe_timestamps(self, audio: str):
-
+        audio_duration = get_audio_duration(audio)
+        result = []
+
+        # Divide processing of the audio into chunks
+        chunk_start = 0.0
+
+        while (chunk_start < audio_duration):
+            chunk_duration = min(audio_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)
 
-
-
+            print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
+            wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
+
+            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
+            seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
+            adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
+
+            pprint(adjusted)
+
+            result.extend(adjusted)
+            chunk_start += chunk_duration
 
-        return seconds_timestamps
+        return result
 
@@ -318,7 +336,7 @@ class VadPeriodicTranscription(AbstractTranscription):
 
     def get_transcribe_timestamps(self, audio: str):
         # Get duration in seconds
-        audio_duration =
+        audio_duration = get_audio_duration(audio)
         result = []
 
         # Generate a timestamp every N seconds
@@ -336,6 +354,9 @@ class VadPeriodicTranscription(AbstractTranscription):
 
         return result
 
+def get_audio_duration(file: str):
+    return float(ffmpeg.probe(file)["format"]["duration"])
+
 def load_audio(file: str, sample_rate: int = 16000,
         start_time: str = None, duration: str = None):
     """
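The new get_audio_duration helper relies on the ffmpeg-python package: ffmpeg.probe runs ffprobe on the file and returns its JSON output as a dict, with the container duration reported under format.duration as a string in seconds. A small usage sketch (the file name is a placeholder):

import ffmpeg

info = ffmpeg.probe("example.mp3")  # hypothetical input file
duration = float(info["format"]["duration"])  # e.g. "3625.48" -> 3625.48
print(duration)

Silero itself reports speech spans as sample offsets; since the audio is loaded at 16 kHz here, multiplying by 1 / 16000 converts them to seconds, which is what multiply_timestamps does before the chunk offset is applied:

SAMPLING_RATE = 16000

sample_ts = {'start': 48000, 'end': 160000}   # a span as returned by Silero, in samples
seconds_ts = {k: v / SAMPLING_RATE for k, v in sample_ts.items()}
assert seconds_ts == {'start': 3.0, 'end': 10.0}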