Spaces:

langtech-innovation
/

WhisperLiveKitDiarization

Paused

App Files Files Community

rockdrigo committed on Dec 6, 2023

Commit

c8c786a

1 Parent(s): 3fad813

Use the Silero VAD model directly instead of Silero's VADIterator

Browse files

Files changed (5) hide show

mic_test_whisper_simple.py +4 -4
mic_test_whisper_streaming.py +1 -1
microphone_stream.py +1 -1
voice_activity_controller.py +56 -52
whisper_online.py +8 -1

mic_test_whisper_simple.py CHANGED Viewed

@@ -39,7 +39,6 @@ class SimpleASRProcessor:
             if chunk is not None:
                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                 audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                # self.audio_buffer.append(chunk)
                 out = []
                 out.append(audio)
                 a = np.concatenate(out)
@@ -47,15 +46,16 @@ class SimpleASRProcessor:
             if is_final and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
                 tsw = self.ts_words(res)
                 self.init_prompt = self.init_prompt + tsw
                 self.init_prompt  = self.init_prompt [-100:]
                 self.audio_buffer.resize(0)
                 iter_in_phrase =0
                 yield True, tsw
-            # show progress every 10 chunks
-            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 # use custom ts_words
                 tsw = self.ts_words(res)

             if chunk is not None:
                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                 audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
                 out = []
                 out.append(audio)
                 a = np.concatenate(out)
             if is_final and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 tsw = self.ts_words(res)
                 self.init_prompt = self.init_prompt + tsw
                 self.init_prompt  = self.init_prompt [-100:]
                 self.audio_buffer.resize(0)
                 iter_in_phrase =0
                 yield True, tsw
+            # show progress every 50 chunks
+            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 # use custom ts_words
                 tsw = self.ts_words(res)

mic_test_whisper_streaming.py CHANGED Viewed

@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE

 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
+min_sample_length = 1.5 * SAMPLING_RATE

microphone_stream.py CHANGED Viewed

@@ -29,7 +29,7 @@ class MicrophoneStream:
         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate
-        self._chunk_size = int(self.sample_rate * 0.1)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,

         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate
+        self._chunk_size = int(self.sample_rate * 40  / 1000)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,

voice_activity_controller.py CHANGED Viewed

@@ -3,16 +3,27 @@ import numpy as np
 # import sounddevice as sd
 import torch
 import numpy as np
 class VoiceActivityController:
     def __init__(
             self,
             sampling_rate = 16000,
-            second_ofSilence = 0.5,
-            second_ofSpeech = 0.25,
             use_vad_result = True,
             activity_detected_callback=None,
         ):
         self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
@@ -26,84 +37,77 @@ class VoiceActivityController:
         collect_chunks) = self.utils
         self.sampling_rate = sampling_rate
-        self.silence_limit = second_ofSilence * self.sampling_rate
-        self.speech_limit = second_ofSpeech *self.sampling_rate
         self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model =self.model,
-            threshold = 0.3, # 0.5
-            sampling_rate= self.sampling_rate,
-            min_silence_duration_ms = 500, #100
-            speech_pad_ms = 400 #30
-        )
         self.last_marked_chunk = None
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze()  # depends on the use case
-        return sound
     def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
-        if chunk is not None:
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-            if "end" in chunk:
-                #todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio)  ,0
-            if "end" in self.last_marked_chunk:
-                return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio)
-        return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len= 0
         speech_len = 0
         for data in audio_stream:  # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")
             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
             wav = audio_block
             is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
             if speech_in_wav > 0 :
-                silence_len= 0
                 speech_len += speech_in_wav
                 if self.activity_detected_callback is not None:
                     self.activity_detected_callback()
-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
                 is_final = True
-                silence_len= 0
-                speech_len = 0
             yield voice_audio.tobytes(), is_final

 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime
+def int2float(sound):
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1/32768
+    sound = sound.squeeze()  # depends on the use case
+    return sound
 class VoiceActivityController:
     def __init__(
             self,
             sampling_rate = 16000,
+            min_silence_to_final_ms = 500,
+            min_speech_to_final_ms = 100,
+            min_silence_duration_ms = 100,
             use_vad_result = True,
             activity_detected_callback=None,
+            threshold =0.3
         ):
         self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
         collect_chunks) = self.utils
         self.sampling_rate = sampling_rate
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
+        self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
         self.use_vad_result = use_vad_result
         self.last_marked_chunk = None
+        self.threshold = threshold
+        self.reset_states()
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0
     def apply_vad(self, audio):
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
+        speech_prob = self.model(x, self.sampling_rate).item()
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
+        if (speech_prob >= self.threshold):
+            self.temp_end = 0
+            return audio, window_size_samples, 0
+        else :
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return audio, 0, window_size_samples
+            else:
+                return np.array([], dtype=np.float16) , 0, window_size_samples
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
+        last_silence_len= 0
         speech_len = 0
         for data in audio_stream:  # replace with your condition of choice
             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
             wav = audio_block
             is_final = False
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
             if speech_in_wav > 0 :
+                last_silence_len= 0
                 speech_len += speech_in_wav
                 if self.activity_detected_callback is not None:
                     self.activity_detected_callback()
+            last_silence_len +=  last_silent_in_wav
+            if last_silence_len>= self.final_silence_limit and speech_len >= self.final_speech_limit:
                 is_final = True
+                last_silence_len= 0
+                speech_len = 0
             yield voice_audio.tobytes(), is_final

whisper_online.py CHANGED Viewed

@@ -4,7 +4,7 @@ import numpy as np
 import librosa
 from functools import lru_cache
 import time
 @lru_cache
@@ -118,14 +118,21 @@ class FasterWhisperASR(ASRBase):
         return model
     def transcribe(self, audio, init_prompt=""):
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
         return list(segments)
     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)

 import librosa
 from functools import lru_cache
 import time
+import datetime
 @lru_cache
         return model
     def transcribe(self, audio, init_prompt=""):
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe  take { (datetime.datetime.now() -tiempo_inicio)  } ms.')
         return list(segments)
     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)