rockdrigo committed on
Commit c8c786a · 1 Parent(s): 3fad813

Use the Silero model directly instead of Silero's VADIterator

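The gist of the change: instead of driving Silero's stateful VADIterator (which only reports {'start': n} / {'end': n} boundaries), the voice activity controller now feeds each audio window to the raw Silero model and thresholds the returned speech probability itself. A minimal, self-contained sketch of the two styles, assuming the standard 'snakers4/silero-vad' torch.hub entry point (the silent dummy window and the 0.3 threshold are illustrative):

    import numpy as np
    import torch

    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    window = np.zeros(512, dtype=np.float32)       # one 32 ms window at 16 kHz, all silence

    # before: stateful iterator that only emits a dict when a speech boundary is crossed
    vad_iterator = VADIterator(model, threshold=0.3, sampling_rate=16000)
    mark = vad_iterator(torch.from_numpy(window))  # None, {'start': n} or {'end': n}

    # after: ask the model for a per-window speech probability and threshold it directly
    speech_prob = model(torch.from_numpy(window), 16000).item()
    is_speech = speech_prob >= 0.3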
mic_test_whisper_simple.py CHANGED
@@ -39,7 +39,6 @@ class SimpleASRProcessor:
             if chunk is not None:
                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                 audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                # self.audio_buffer.append(chunk)
                 out = []
                 out.append(audio)
                 a = np.concatenate(out)
@@ -47,15 +46,16 @@ class SimpleASRProcessor:
 
             if is_final and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
                 tsw = self.ts_words(res)
+
                 self.init_prompt = self.init_prompt + tsw
                 self.init_prompt = self.init_prompt [-100:]
                 self.audio_buffer.resize(0)
                 iter_in_phrase =0
+
                 yield True, tsw
-            # show progress evry 10 chunks
-            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+            # show progress every 50 chunks
+            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 # use custom ts_words
                 tsw = self.ts_words(res)
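Aside on the 20 → 50 change: taken together with the smaller microphone chunk introduced below (40 ms instead of 100 ms), the wall-clock interval between partial-progress transcriptions stays roughly the same, assuming one microphone chunk is consumed per loop iteration:

    20 chunks * 100 ms/chunk = 2.0 s of audio between updates  (before)
    50 chunks *  40 ms/chunk = 2.0 s of audio between updates  (after)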
mic_test_whisper_streaming.py CHANGED
@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en" # source language
 tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
+min_sample_length = 1.5 * SAMPLING_RATE
 
 
 
microphone_stream.py CHANGED
@@ -29,7 +29,7 @@ class MicrophoneStream:
         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate
 
-        self._chunk_size = int(self.sample_rate * 0.1)
+        self._chunk_size = int(self.sample_rate * 40 / 1000)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,
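At 16 kHz (the SAMPLING_RATE used elsewhere in this commit) each microphone read shrinks from int(16000 * 0.1) = 1600 samples (100 ms) to int(16000 * 40 / 1000) = 640 samples (40 ms), giving the per-window speech-probability check in voice_activity_controller.py a finer-grained stream to work with.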
voice_activity_controller.py CHANGED
@@ -3,16 +3,27 @@ import numpy as np
 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime
 
 
+def int2float(sound):
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1/32768
+    sound = sound.squeeze() # depends on the use case
+    return sound
+
 class VoiceActivityController:
     def __init__(
         self,
         sampling_rate = 16000,
-        second_ofSilence = 0.5,
-        second_ofSpeech = 0.25,
+        min_silence_to_final_ms = 500,
+        min_speech_to_final_ms = 100,
+        min_silence_duration_ms = 100,
         use_vad_result = True,
         activity_detected_callback=None,
+        threshold =0.3
     ):
         self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
@@ -26,84 +37,77 @@ class VoiceActivityController:
          collect_chunks) = self.utils
 
         self.sampling_rate = sampling_rate
-        self.silence_limit = second_ofSilence * self.sampling_rate
-        self.speech_limit = second_ofSpeech *self.sampling_rate
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
+        self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
 
         self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model =self.model,
-            threshold = 0.3, # 0.5
-            sampling_rate= self.sampling_rate,
-            min_silence_duration_ms = 500, #100
-            speech_pad_ms = 400 #30
-        )
         self.last_marked_chunk = None
-
-
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze() # depends on the use case
-        return sound
+        self.threshold = threshold
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0
 
     def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
-
-        if chunk is not None:
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-
-            if "end" in chunk:
-                #todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
 
-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio) ,0
+        if (speech_prob >= self.threshold):
+            self.temp_end = 0
+            return audio, window_size_samples, 0
+
+        else :
+            if not self.temp_end:
+                self.temp_end = self.current_sample
 
-            if "end" in self.last_marked_chunk:
-                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio)
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return audio, 0, window_size_samples
+            else:
+                return np.array([], dtype=np.float16) , 0, window_size_samples
 
-        return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0
 
 
 
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len= 0
+        last_silence_len= 0
         speech_len = 0
 
         for data in audio_stream: # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")
 
 
             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
            wav = audio_block
 
-
            is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+
 
            if speech_in_wav > 0 :
-                silence_len= 0
+                last_silence_len= 0
                speech_len += speech_in_wav
                if self.activity_detected_callback is not None:
                    self.activity_detected_callback()
 
-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
+            last_silence_len += last_silent_in_wav
+            if last_silence_len>= self.final_silence_limit and speech_len >= self.final_speech_limit:
+
                is_final = True
-                silence_len= 0
-                speech_len = 0
-
+                last_silence_len= 0
+                speech_len = 0
 
            yield voice_audio.tobytes(), is_final
 
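A minimal usage sketch of the reworked controller, assuming MicrophoneStream(sample_rate=...) is iterable and yields raw PCM_16 byte chunks, and that the two modules are importable by their file names (the loop body is illustrative, not code from this repo):

    from microphone_stream import MicrophoneStream
    from voice_activity_controller import VoiceActivityController

    SAMPLING_RATE = 16000
    stream = MicrophoneStream(sample_rate=SAMPLING_RATE)
    vad = VoiceActivityController(sampling_rate=SAMPLING_RATE, use_vad_result=True)

    for voice_bytes, is_final in vad.detect_user_speech(stream, audio_in_int16=False):
        # voice_bytes is empty once silence has lasted longer than min_silence_duration_ms
        if is_final:
            # speech followed by at least min_silence_to_final_ms of silence: flush the phrase
            pass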
whisper_online.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
 import librosa
 from functools import lru_cache
 import time
-
+import datetime
 
 
 @lru_cache
@@ -118,14 +118,21 @@ class FasterWhisperASR(ASRBase):
         return model
 
     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe take { (datetime.datetime.now() -tiempo_inicio) } ms.')
+
         return list(segments)
 
     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)
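The ts_words change drops word timestamps from segments that faster-whisper scores as almost certainly non-speech, a common way to suppress text hallucinated on silence or noise. A condensed sketch of the resulting logic, hoisting the check to segment level (equivalent to the per-word check in the diff, assuming faster-whisper Segment objects produced with word_timestamps=True, which carry no_speech_prob and words):

    def ts_words(segments):
        out = []
        for segment in segments:
            if segment.no_speech_prob > 0.9:
                continue  # skip every word of a segment the model thinks is not speech
            for word in segment.words:
                out.append((word.start, word.end, word.word))  # keep word text unstripped
        return out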