Dominik Macháček committed on
Commit
6fa0080
·
1 Parent(s): d543411

- performance tests pending
- TODO: timestamps after refresh are decreasing

voice_activity_controller.py CHANGED
@@ -1,18 +1,5 @@
1
  import torch
2
  import numpy as np
3
- # import sounddevice as sd
4
- import torch
5
- import numpy as np
6
- import datetime
7
-
8
-
9
- def int2float(sound):
10
- abs_max = np.abs(sound).max()
11
- sound = sound.astype('float32')
12
- if abs_max > 0:
13
- sound *= 1/32768
14
- sound = sound.squeeze() # depends on the use case
15
- return sound
16
 
17
  class VoiceActivityController:
18
  def __init__(
@@ -22,10 +9,10 @@ class VoiceActivityController:
22
  min_speech_to_final_ms = 100,
23
  min_silence_duration_ms = 100,
24
  use_vad_result = True,
25
- activity_detected_callback=None,
26
  threshold =0.3
27
  ):
28
- self.activity_detected_callback=activity_detected_callback
29
  self.model, self.utils = torch.hub.load(
30
  repo_or_dir='snakers4/silero-vad',
31
  model='silero_vad'
@@ -42,7 +29,6 @@ class VoiceActivityController:
42
  self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
43
 
44
  self.use_vad_result = use_vad_result
45
- self.last_marked_chunk = None
46
  self.threshold = threshold
47
  self.reset_states()
48
 
@@ -55,7 +41,13 @@ class VoiceActivityController:
55
  self.speech_len = 0
56
 
57
  def apply_vad(self, audio):
58
- # x = int2float(audio)
 
 
 
 
 
 
59
  x = audio
60
  if not torch.is_tensor(x):
61
  try:
@@ -64,16 +56,16 @@ class VoiceActivityController:
64
  raise TypeError("Audio cannot be casted to tensor. Cast it manually")
65
 
66
  speech_prob = self.model(x, self.sampling_rate).item()
 
67
 
68
  window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
69
  self.current_sample += window_size_samples
70
 
71
-
72
- if (speech_prob >= self.threshold):
73
  self.temp_end = 0
74
  return audio, window_size_samples, 0
75
 
76
- else :
77
  if not self.temp_end:
78
  self.temp_end = self.current_sample
79
 
@@ -84,14 +76,12 @@ class VoiceActivityController:
84
 
85
 
86
  def detect_speech_iter(self, data, audio_in_int16 = False):
87
- # audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
88
  audio_block = data
89
  wav = audio_block
90
 
91
- print(wav, len(wav), type(wav), wav.dtype)
92
-
93
  is_final = False
94
  voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
 
95
 
96
 
97
  if speech_in_wav > 0 :
@@ -101,27 +91,20 @@ class VoiceActivityController:
101
  # self.activity_detected_callback()
102
 
103
  self.last_silence_len += last_silent_in_wav
 
 
104
  if self.last_silence_len>= self.final_silence_limit and self.speech_len >= self.final_speech_limit:
 
105
 
106
  is_final = True
107
  self.last_silence_len= 0
108
  self.speech_len = 0
109
 
110
- # return voice_audio.tobytes(), is_final
111
  return voice_audio, is_final
112
 
113
-
114
-
115
  def detect_user_speech(self, audio_stream, audio_in_int16 = False):
116
  self.last_silence_len= 0
117
  self.speech_len = 0
118
 
119
  for data in audio_stream: # replace with your condition of choice
120
  yield self.detect_speech_iter(data, audio_in_int16)
121
-
122
-
123
-
124
-
125
-
126
-
127
-
 
1
  import torch
2
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  class VoiceActivityController:
5
  def __init__(
 
9
  min_speech_to_final_ms = 100,
10
  min_silence_duration_ms = 100,
11
  use_vad_result = True,
12
+ # activity_detected_callback=None,
13
  threshold =0.3
14
  ):
15
+ # self.activity_detected_callback=activity_detected_callback
16
  self.model, self.utils = torch.hub.load(
17
  repo_or_dir='snakers4/silero-vad',
18
  model='silero_vad'
 
29
  self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
30
 
31
  self.use_vad_result = use_vad_result
 
32
  self.threshold = threshold
33
  self.reset_states()
34
 
 
41
  self.speech_len = 0
42
 
43
  def apply_vad(self, audio):
44
+ """
45
+ returns: triple
46
+ (voice_audio,
47
+ speech_in_wav,
48
+ silence_in_wav)
49
+
50
+ """
51
  x = audio
52
  if not torch.is_tensor(x):
53
  try:
 
56
  raise TypeError("Audio cannot be casted to tensor. Cast it manually")
57
 
58
  speech_prob = self.model(x, self.sampling_rate).item()
59
+ print("speech_prob",speech_prob)
60
 
61
  window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
62
  self.current_sample += window_size_samples
63
 
64
+ if speech_prob >= self.threshold: # speech is detected
 
65
  self.temp_end = 0
66
  return audio, window_size_samples, 0
67
 
68
+ else: # silence detected, counting w
69
  if not self.temp_end:
70
  self.temp_end = self.current_sample
71
 
 
76
 
77
 
78
  def detect_speech_iter(self, data, audio_in_int16 = False):
 
79
  audio_block = data
80
  wav = audio_block
81
 
 
 
82
  is_final = False
83
  voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
84
+ print("speech, last silence",speech_in_wav, last_silent_in_wav)
85
 
86
 
87
  if speech_in_wav > 0 :
 
91
  # self.activity_detected_callback()
92
 
93
  self.last_silence_len += last_silent_in_wav
94
+ print("self.last_silence_len",self.last_silence_len, self.final_silence_limit,self.last_silence_len>= self.final_silence_limit)
95
+ print("self.speech_len, final_speech_limit",self.speech_len , self.final_speech_limit,self.speech_len >= self.final_speech_limit)
96
  if self.last_silence_len>= self.final_silence_limit and self.speech_len >= self.final_speech_limit:
97
+ for i in range(10): print("TADY!!!")
98
 
99
  is_final = True
100
  self.last_silence_len= 0
101
  self.speech_len = 0
102
 
 
103
  return voice_audio, is_final
104
 
 
 
105
  def detect_user_speech(self, audio_stream, audio_in_int16 = False):
106
  self.last_silence_len= 0
107
  self.speech_len = 0
108
 
109
  for data in audio_stream: # replace with your condition of choice
110
  yield self.detect_speech_iter(data, audio_in_int16)
 
 
 
 
 
 
 
whisper_online_server.py CHANGED
@@ -9,7 +9,8 @@ parser = argparse.ArgumentParser()
9
  # server options
10
  parser.add_argument("--host", type=str, default='localhost')
11
  parser.add_argument("--port", type=int, default=43007)
12
-
 
13
 
14
  # options from whisper_online
15
  add_shared_args(parser)
@@ -57,8 +58,11 @@ if args.buffer_trimming == "sentence":
57
  tokenizer = create_tokenizer(tgt_language)
58
  else:
59
  tokenizer = None
60
- online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
61
-
 
 
 
62
 
63
 
64
  demo_audio_path = "cs-maji-2.16k.wav"
 
9
  # server options
10
  parser.add_argument("--host", type=str, default='localhost')
11
  parser.add_argument("--port", type=int, default=43007)
12
+ parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
13
+ parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
14
 
15
  # options from whisper_online
16
  add_shared_args(parser)
 
58
  tokenizer = create_tokenizer(tgt_language)
59
  else:
60
  tokenizer = None
61
+ if not args.vac:
62
+ online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
63
+ else:
64
+ from whisper_online_vac import *
65
+ online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
66
 
67
 
68
  demo_audio_path = "cs-maji-2.16k.wav"
whisper_online_vac.py CHANGED
@@ -7,52 +7,46 @@ SAMPLING_RATE = 16000
7
 
8
  class VACOnlineASRProcessor(OnlineASRProcessor):
9
 
10
- def __init__(self, *a, **kw):
 
 
11
  self.online = OnlineASRProcessor(*a, **kw)
12
- self.vac = VoiceActivityController(use_vad_result = True)
13
 
14
- self.is_currently_final = False
15
  self.logfile = self.online.logfile
16
 
17
- #self.vac_buffer = io.BytesIO()
18
- #self.vac_stream = self.vac.detect_user_speech(self.vac_buffer, audio_in_int16=False)
19
-
20
- self.audio_log = open("audio_log.wav","wb")
21
 
22
  def init(self):
23
  self.online.init()
24
  self.vac.reset_states()
 
 
 
25
 
26
  def insert_audio_chunk(self, audio):
27
- print(audio, len(audio), type(audio), audio.dtype)
28
  r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
29
- raw_bytes, is_final = r
30
- print("is_final",is_final)
31
- print("raw_bytes", raw_bytes[:10], len(raw_bytes), type(raw_bytes))
32
- # self.audio_log.write(raw_bytes)
33
- #sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
34
- #audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
35
- audio = raw_bytes
36
- print("po překonvertování", audio, len(audio), type(audio), audio.dtype)
37
  self.is_currently_final = is_final
38
  self.online.insert_audio_chunk(audio)
39
- # self.audio_log.write(audio)
40
- self.audio_log.flush()
41
-
42
- print("inserted",file=self.logfile)
43
 
44
  def process_iter(self):
45
  if self.is_currently_final:
46
  return self.finish()
47
- else:
48
- print(self.online.audio_buffer)
49
  ret = self.online.process_iter()
50
- print("tady",file=self.logfile)
51
  return ret
 
 
 
52
 
53
  def finish(self):
54
  ret = self.online.finish()
55
  self.online.init()
 
56
  return ret
57
 
58
 
@@ -67,7 +61,7 @@ if __name__ == "__main__":
67
  parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
68
  parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
69
  parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
70
-
71
  args = parser.parse_args()
72
 
73
  # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
@@ -111,12 +105,12 @@ if __name__ == "__main__":
111
  asr.use_vad()
112
 
113
 
114
- min_chunk = args.min_chunk_size
115
  if args.buffer_trimming == "sentence":
116
  tokenizer = create_tokenizer(tgt_language)
117
  else:
118
  tokenizer = None
119
- online = VACOnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
120
 
121
 
122
  # load the audio into the LRU cache before we start the timer
 
7
 
8
  class VACOnlineASRProcessor(OnlineASRProcessor):
9
 
10
+ def __init__(self, online_chunk_size, *a, **kw):
11
+ self.online_chunk_size = online_chunk_size
12
+
13
  self.online = OnlineASRProcessor(*a, **kw)
14
+ self.vac = VoiceActivityController(use_vad_result = False)
15
 
 
16
  self.logfile = self.online.logfile
17
 
18
+ self.init()
 
 
 
19
 
20
  def init(self):
21
  self.online.init()
22
  self.vac.reset_states()
23
+ self.current_online_chunk_buffer_size = 0
24
+ self.is_currently_final = False
25
+
26
 
27
  def insert_audio_chunk(self, audio):
 
28
  r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
29
+ audio, is_final = r
30
+ print(is_final)
 
 
 
 
 
 
31
  self.is_currently_final = is_final
32
  self.online.insert_audio_chunk(audio)
33
+ self.current_online_chunk_buffer_size += len(audio)
 
 
 
34
 
35
  def process_iter(self):
36
  if self.is_currently_final:
37
  return self.finish()
38
+ elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
39
+ self.current_online_chunk_buffer_size = 0
40
  ret = self.online.process_iter()
 
41
  return ret
42
+ else:
43
+ print("no online update, only VAD", file=self.logfile)
44
+ return (None, None, "")
45
 
46
  def finish(self):
47
  ret = self.online.finish()
48
  self.online.init()
49
+ self.current_online_chunk_buffer_size = 0
50
  return ret
51
 
52
 
 
61
  parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
62
  parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
63
  parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
64
+ parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
65
  args = parser.parse_args()
66
 
67
  # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
 
105
  asr.use_vad()
106
 
107
 
108
+ min_chunk = args.vac_chunk_size
109
  if args.buffer_trimming == "sentence":
110
  tokenizer = create_tokenizer(tgt_language)
111
  else:
112
  tokenizer = None
113
+ online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
114
 
115
 
116
  # load the audio into the LRU cache before we start the timer