Update modules/vad/silero_vad.py

modules/vad/silero_vad.py (+19 -19)
```diff
@@ -4,11 +4,11 @@ from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
+import bisect
 import faster_whisper
 from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 import gradio as gr
 
-
 class SileroVAD:
     def __init__(self):
         self.sampling_rate = 16000
@@ -57,6 +57,7 @@
             vad_options=vad_parameters,
             progress=progress
         )
+
        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -93,35 +94,27 @@
         min_silence_duration_ms = vad_options.min_silence_duration_ms
         window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-
-        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
+        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
         max_speech_samples = (
-            sampling_rate * max_speech_duration_s
+            self.sampling_rate * max_speech_duration_s
             - window_size_samples
             - 2 * speech_pad_samples
         )
-        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
+        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
 
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
 
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
@@ -222,6 +215,13 @@
 
         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
+    def get_chunk_index(self, time: float) -> int:
+        sample = int(time * self.sampling_rate)
+        return min(
+            bisect.bisect(self.chunk_end_sample, sample),
+            len(self.chunk_end_sample) - 1,
+        )
+
     @staticmethod
     def format_timestamp(
         seconds: float,
```
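The core of the change is the third hunk: the old per-window Python loop, which carried `state, context` between calls, is replaced by zero-padding the whole signal to a multiple of the window size and scoring it with a single batched model call. A minimal sketch of that shape math, assuming a stand-in `model` callable and a 512-sample window (the window size is an assumption, not taken from the diff):

```python
import numpy as np

def batched_speech_probs(audio: np.ndarray, model,
                         window_size_samples: int = 512) -> np.ndarray:
    """Score a whole recording in one call instead of a per-window loop.

    `model` stands in for the batched VAD model, assumed to map a
    (batch, samples) array to one speech probability per window.
    """
    # Zero-pad so the length is an exact multiple of the window size.
    # Note: like the diff, this pads a full extra window when the length
    # is already aligned (remainder == 0).
    pad = window_size_samples - audio.shape[0] % window_size_samples
    padded = np.pad(audio, (0, pad))
    # One batched call over shape (1, n_samples); squeeze back to (n_windows,).
    return model(padded.reshape(1, -1)).squeeze(0)
```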
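The diff also stops deriving `neg_threshold` locally (previously `threshold - 0.15`) and reads it from `vad_options` instead. The two thresholds implement hysteresis in the loop that consumes `speech_probs`: a segment opens only when the probability rises above `threshold` and closes only when it falls below the lower `neg_threshold`, so values wobbling between the two do not fragment a segment. A toy version of that state machine (illustrative values; the real loop also enforces the min_silence/min_speech/max_speech limits computed above):

```python
def hysteresis_segments(speech_probs, threshold=0.5, neg_threshold=0.35,
                        window_size_samples=512):
    """Toy two-threshold segmenter over per-window speech probabilities."""
    triggered = False
    speeches, current_speech = [], {}
    for i, prob in enumerate(speech_probs):
        if not triggered and prob >= threshold:
            triggered = True                      # open a segment
            current_speech["start"] = i * window_size_samples
        elif triggered and prob < neg_threshold:
            triggered = False                     # close on confident silence
            current_speech["end"] = i * window_size_samples
            speeches.append(current_speech)
            current_speech = {}
    if triggered:  # audio ended mid-speech
        current_speech["end"] = len(speech_probs) * window_size_samples
        speeches.append(current_speech)
    return speeches

# A dip to 0.4 between two peaks stays above neg_threshold,
# so both peaks land in one segment:
print(hysteresis_segments([0.1, 0.8, 0.4, 0.9, 0.2, 0.1]))
# [{'start': 512, 'end': 2048}]
```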