rockdrigo committed on
Commit
324dee0
·
1 Parent(s): fe4207e
mic_test_whisper_simple.py CHANGED
@@ -72,12 +72,12 @@ SAMPLING_RATE = 16000
72
  model = "large-v2"
73
  src_lan = "en" # source language
74
  tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
75
- use_vad_result = True
76
  min_sample_length = 1 * SAMPLING_RATE
77
 
78
 
79
 
80
- vad = VoiceActivityController(use_vad_result = use_vad_result)
81
  asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model
82
 
83
  tokenizer = create_tokenizer(tgt_lan)
@@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr)
85
 
86
 
87
  stream = MicrophoneStream()
88
- stream = vad.detect_user_speech(stream, audio_in_int16 = False)
89
  stream = online.stream_process(stream)
90
 
91
  for isFinal, text in stream:
 
72
  model = "large-v2"
73
  src_lan = "en" # source language
74
  tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
75
+ use_vad = False
76
  min_sample_length = 1 * SAMPLING_RATE
77
 
78
 
79
 
80
+ vac = VoiceActivityController(use_vad_result = use_vad)
81
  asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model
82
 
83
  tokenizer = create_tokenizer(tgt_lan)
 
85
 
86
 
87
  stream = MicrophoneStream()
88
+ stream = vac.detect_user_speech(stream, audio_in_int16 = False)
89
  stream = online.stream_process(stream)
90
 
91
  for isFinal, text in stream:
mic_test_whisper_streaming.py CHANGED
@@ -13,7 +13,7 @@ model = "large-v2"
13
  src_lan = "en" # source language
14
  tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
15
  use_vad_result = True
16
- min_sample_length = 1.5 * SAMPLING_RATE
17
 
18
 
19
 
@@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream): # processing loop:
54
 
55
  if is_final:
56
  o = online.finish()
57
- online.init()
58
  # final_processing_pending = False
59
  print('-----'*10)
60
  complete_text = complete_text + o[2]
61
  print('FINAL - '+ complete_text) # do something with current partial output
62
  print('-----'*10)
 
63
  out = []
64
  out_len = 0
65
 
 
13
  src_lan = "en" # source language
14
  tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
15
  use_vad_result = True
16
+ min_sample_length = 1 * SAMPLING_RATE
17
 
18
 
19
 
 
54
 
55
  if is_final:
56
  o = online.finish()
 
57
  # final_processing_pending = False
58
  print('-----'*10)
59
  complete_text = complete_text + o[2]
60
  print('FINAL - '+ complete_text) # do something with current partial output
61
  print('-----'*10)
62
+ online.init()
63
  out = []
64
  out_len = 0
65
 
voice_activity_controller.py CHANGED
@@ -76,7 +76,7 @@ class VoiceActivityController:
76
  if self.current_sample - self.temp_end < self.min_silence_samples:
77
  return audio, 0, window_size_samples
78
  else:
79
- return np.array([], dtype=np.float16) , 0, window_size_samples
80
 
81
 
82
 
 
76
  if self.current_sample - self.temp_end < self.min_silence_samples:
77
  return audio, 0, window_size_samples
78
  else:
79
+ return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples
80
 
81
 
82