Dominik Macháček committed on
Commit
e6648e4
·
1 Parent(s): 863242f

fixed silero vad chunk size

Browse files

issues #141 #121 #142 #136 etc.

silero_vad.py → silero_vad_iterator.py RENAMED
@@ -2,6 +2,7 @@ import torch
2
 
3
  # This is copied from silero-vad's vad_utils.py:
4
  # https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
 
5
 
6
  # Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
7
 
@@ -10,8 +11,8 @@ class VADIterator:
10
  model,
11
  threshold: float = 0.5,
12
  sampling_rate: int = 16000,
13
- min_silence_duration_ms: int = 100,
14
- speech_pad_ms: int = 30
15
  ):
16
 
17
  """
@@ -95,11 +96,14 @@ class VADIterator:
95
  return None
96
 
97
  #######################
98
- # this is our workaround for Silero v5 requiring at least 512-sized audio chunks
99
- # (see https://github.com/ufal/whisper_streaming/issues/116 )
100
 
101
  import numpy as np
102
  class FixedVADIterator(VADIterator):
 
 
 
 
103
 
104
  def reset_states(self):
105
  super().reset_states()
@@ -107,11 +111,19 @@ class FixedVADIterator(VADIterator):
107
 
108
  def __call__(self, x, return_seconds=False):
109
  self.buffer = np.append(self.buffer, x)
110
- if len(self.buffer) >= 512:
111
- ret = super().__call__(self.buffer, return_seconds=return_seconds)
112
- self.buffer = np.array([],dtype=np.float32)
113
- return ret
114
- return None
 
 
 
 
 
 
 
 
115
 
116
  if __name__ == "__main__":
117
  # test/demonstrate the need for FixedVADIterator:
 
2
 
3
  # This is copied from silero-vad's vad_utils.py:
4
  # https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
5
+ # (except changed defaults)
6
 
7
  # Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
8
 
 
11
  model,
12
  threshold: float = 0.5,
13
  sampling_rate: int = 16000,
14
+ min_silence_duration_ms: int = 500, # makes sense on one recording that I checked
15
+ speech_pad_ms: int = 100 # same
16
  ):
17
 
18
  """
 
96
  return None
97
 
98
  #######################
99
+ # because Silero now requires exactly 512-sized audio chunks
 
100
 
101
  import numpy as np
102
  class FixedVADIterator(VADIterator):
103
+ '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
104
+ If audio to be processed at once is long and multiple voiced segments detected,
105
+ then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
106
+ '''
107
 
108
  def reset_states(self):
109
  super().reset_states()
 
111
 
112
  def __call__(self, x, return_seconds=False):
113
  self.buffer = np.append(self.buffer, x)
114
+ ret = None
115
+ while len(self.buffer) >= 512:
116
+ r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
117
+ self.buffer = self.buffer[512:]
118
+ if ret is None:
119
+ ret = r
120
+ elif r is not None:
121
+ if 'end' in r:
122
+ ret['end'] = r['end'] # the latter end
123
+ if 'start' in r and 'end' in ret: # there is an earlier start.
124
+ # Remove end, merging this segment with the previous one.
125
+ del ret['end']
126
+ return ret if ret != {} else None
127
 
128
  if __name__ == "__main__":
129
  # test/demonstrate the need for FixedVADIterator:
whisper_online.py CHANGED
@@ -534,8 +534,8 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
534
  repo_or_dir='snakers4/silero-vad',
535
  model='silero_vad'
536
  )
537
- from silero_vad import FixedVADIterator
538
- self.vac = FixedVADIterator(model) # we use all the default options: 500ms silence, etc.
539
 
540
  self.logfile = self.online.logfile
541
  self.init()
@@ -561,24 +561,31 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
561
  self.audio_buffer = np.append(self.audio_buffer, audio)
562
 
563
  if res is not None:
564
- frame = list(res.values())[0]
565
  if 'start' in res and 'end' not in res:
566
  self.status = 'voice'
567
- send_audio = self.audio_buffer[frame-self.buffer_offset:]
568
- self.online.init(offset=frame/self.SAMPLING_RATE)
569
  self.online.insert_audio_chunk(send_audio)
570
  self.current_online_chunk_buffer_size += len(send_audio)
571
  self.clear_buffer()
572
  elif 'end' in res and 'start' not in res:
573
  self.status = 'nonvoice'
574
- send_audio = self.audio_buffer[:frame-self.buffer_offset]
575
  self.online.insert_audio_chunk(send_audio)
576
  self.current_online_chunk_buffer_size += len(send_audio)
577
  self.is_currently_final = True
578
  self.clear_buffer()
579
  else:
580
- # It doesn't happen in the current code.
581
- raise NotImplemented("both start and end of voice in one chunk!!!")
 
 
 
 
 
 
 
582
  else:
583
  if self.status == 'voice':
584
  self.online.insert_audio_chunk(self.audio_buffer)
 
534
  repo_or_dir='snakers4/silero-vad',
535
  model='silero_vad'
536
  )
537
+ from silero_vad_iterator import FixedVADIterator
538
+ self.vac = FixedVADIterator(model) # we use the default options there: 500ms silence, 100ms padding, etc.
539
 
540
  self.logfile = self.online.logfile
541
  self.init()
 
561
  self.audio_buffer = np.append(self.audio_buffer, audio)
562
 
563
  if res is not None:
564
+ frame = list(res.values())[0]-self.buffer_offset
565
  if 'start' in res and 'end' not in res:
566
  self.status = 'voice'
567
+ send_audio = self.audio_buffer[frame:]
568
+ self.online.init(offset=(frame+self.buffer_offset)/self.SAMPLING_RATE)
569
  self.online.insert_audio_chunk(send_audio)
570
  self.current_online_chunk_buffer_size += len(send_audio)
571
  self.clear_buffer()
572
  elif 'end' in res and 'start' not in res:
573
  self.status = 'nonvoice'
574
+ send_audio = self.audio_buffer[:frame]
575
  self.online.insert_audio_chunk(send_audio)
576
  self.current_online_chunk_buffer_size += len(send_audio)
577
  self.is_currently_final = True
578
  self.clear_buffer()
579
  else:
580
+ beg = res["start"]-self.buffer_offset
581
+ end = res["end"]-self.buffer_offset
582
+ self.status = 'nonvoice'
583
+ send_audio = self.audio_buffer[beg:end]
584
+ self.online.init(offset=(beg+self.buffer_offset)/self.SAMPLING_RATE)
585
+ self.online.insert_audio_chunk(send_audio)
586
+ self.current_online_chunk_buffer_size += len(send_audio)
587
+ self.is_currently_final = True
588
+ self.clear_buffer()
589
  else:
590
  if self.status == 'voice':
591
  self.online.insert_audio_chunk(self.audio_buffer)