vad
mic_test_whisper_simple.py
CHANGED
@@ -72,12 +72,12 @@ SAMPLING_RATE = 16000
 model = "large-v2"
 src_lan = "en" # source language
 tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
-
+use_vad = False
 min_sample_length = 1 * SAMPLING_RATE
 
 
 
-
+vac = VoiceActivityController(use_vad_result = use_vad)
 asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model
 
 tokenizer = create_tokenizer(tgt_lan)
@@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr)
 
 
 stream = MicrophoneStream()
-stream =
+stream = vac.detect_user_speech(stream, audio_in_int16 = False)
 stream = online.stream_process(stream)
 
 for isFinal, text in stream:
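For context, the added lines chain three generators: the microphone stream is passed through the new VAD controller and the result is fed into the ASR processor, which the final loop then consumes as (isFinal, text) pairs. Below is a minimal sketch of that chaining pattern only; the fake_microphone_stream, fake_detect_user_speech and fake_stream_process generators are hypothetical stand-ins, not the project's MicrophoneStream, VoiceActivityController or SimpleASRProcessor, whose interfaces this diff does not show.

import numpy as np

SAMPLING_RATE = 16000

def fake_microphone_stream(n_chunks=5, chunk_samples=SAMPLING_RATE // 10):
    # Stand-in for MicrophoneStream: yields raw float32 audio chunks.
    for _ in range(n_chunks):
        yield np.zeros(chunk_samples, dtype=np.float32)

def fake_detect_user_speech(chunks):
    # Stand-in for vac.detect_user_speech: yields (audio, is_final) pairs,
    # flagging the last chunk as final.
    chunks = list(chunks)
    for i, chunk in enumerate(chunks):
        yield chunk, i == len(chunks) - 1

def fake_stream_process(vad_stream):
    # Stand-in for online.stream_process: yields (is_final, text) pairs.
    for audio, is_final in vad_stream:
        yield is_final, "<%d samples>" % len(audio)

stream = fake_microphone_stream()
stream = fake_detect_user_speech(stream)   # VAD stage, mirrors the added line 88
stream = fake_stream_process(stream)       # ASR stage, line 89

for isFinal, text in stream:
    print(isFinal, text)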
mic_test_whisper_streaming.py
CHANGED
@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en" # source language
 tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1
+min_sample_length = 1 * SAMPLING_RATE
 
 
 
@@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream): # processing loop:
 
     if is_final:
         o = online.finish()
-        online.init()
         # final_processing_pending = False
         print('-----'*10)
         complete_text = complete_text + o[2]
         print('FINAL - '+ complete_text) # do something with current partial output
         print('-----'*10)
+        online.init()
         out = []
         out_len = 0
 
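Two things change here: min_sample_length is now expressed in samples (1 * SAMPLING_RATE, i.e. one second of audio at 16 kHz) rather than a bare 1, and online.init() is moved so the processor is reset only after the final hypothesis returned by online.finish() has been printed. The sketch below illustrates that finish-then-reset ordering with a hypothetical FakeOnlineProcessor; the real online processor is not shown in this diff, and only the convention that o[2] holds the text is taken from the script above.

class FakeOnlineProcessor:
    # Hypothetical stand-in for the "online" ASR processor used in the script.
    def __init__(self):
        self.buffer = []

    def insert(self, text):
        self.buffer.append(text)

    def finish(self):
        # Returns a (beg, end, text) tuple so that o[2] is the final text,
        # matching how the script above reads o[2].
        return (0.0, 1.0, " ".join(self.buffer))

    def init(self):
        # Clears internal state so the next utterance starts fresh.
        self.buffer = []

online = FakeOnlineProcessor()
online.insert("hello")
online.insert("world")

o = online.finish()          # read the final hypothesis first ...
print('FINAL - ' + o[2])
online.init()                # ... then reset, as the reordered code now does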
voice_activity_controller.py
CHANGED
@@ -76,7 +76,7 @@ class VoiceActivityController:
         if self.current_sample - self.temp_end < self.min_silence_samples:
             return audio, 0, window_size_samples
         else:
-            return np.array([], dtype=np.float16) , 0, window_size_samples
+            return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
 
 
 
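The one-line change makes the silence branch honour use_vad_result: once the silence window has expired, the chunk is replaced by an empty array only when VAD filtering of the output is requested, and is passed through unchanged otherwise. A self-contained illustration of that conditional follows; silence_result is a hypothetical helper written for this note, not the class method itself.

import numpy as np

def silence_result(audio, silence_expired, use_vad_result):
    # Mirrors the changed return: keep the audio while the silence is still short,
    # otherwise drop it only if the caller asked for the VAD-filtered result.
    if not silence_expired:
        return audio
    return np.array([], dtype=np.float16) if use_vad_result else audio

chunk = np.ones(512, dtype=np.float32)
print(len(silence_result(chunk, silence_expired=True, use_vad_result=True)))   # 0 -> chunk suppressed
print(len(silence_result(chunk, silence_expired=True, use_vad_result=False)))  # 512 -> chunk passed through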