Spaces:

langtech-innovation
/

WhisperLiveKitDiarization

Paused

App Files Files Community

Dominik Macháček committed on Aug 18, 2024

Commit

2ec2266

1 Parent(s): f390770

remove mic test and streams

Browse files

Files changed (3) hide show

mic_test_whisper_simple.py +0 -95
mic_test_whisper_streaming.py +0 -71
microphone_stream.py +0 -82

mic_test_whisper_simple.py DELETED Viewed

@@ -1,95 +0,0 @@
-from microphone_stream import MicrophoneStream
-from voice_activity_controller import VoiceActivityController
-from whisper_online import *
-import numpy as np
-import librosa
-import io
-import soundfile
-import sys
-class SimpleASRProcessor:
-    def __init__(self, asr, sampling_rate = 16000):
-        """run this when starting or restarting processing"""
-        self.audio_buffer = np.array([],dtype=np.float32)
-        self.prompt_buffer = ""
-        self.asr = asr
-        self.sampling_rate = sampling_rate
-        self.init_prompt = ''
-    def ts_words(self, segments):
-        result = ""
-        for segment in segments:
-            if segment.no_speech_prob > 0.9:
-                continue
-            for word in segment.words:
-                w = word.word
-                t = (word.start, word.end, w)
-                result +=w
-        return result
-    def stream_process(self, vad_result):
-        iter_in_phrase = 0
-        for chunk, is_final in vad_result:
-            iter_in_phrase += 1
-            if chunk is not None:
-                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
-                audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                out = []
-                out.append(audio)
-                a = np.concatenate(out)
-                self.audio_buffer = np.append(self.audio_buffer, a)
-            if is_final and len(self.audio_buffer) > 0:
-                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                tsw = self.ts_words(res)
-                self.init_prompt = self.init_prompt + tsw
-                self.init_prompt  = self.init_prompt [-100:]
-                self.audio_buffer.resize(0)
-                iter_in_phrase =0
-                yield True, tsw
-            # show progress every 50 chunks
-            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
-                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
-                tsw = self.ts_words(res)
-                yield False, tsw
-SAMPLING_RATE = 16000
-model = "large-v2"
-src_lan = "en"  # source language
-tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad = False
-min_sample_length = 1 * SAMPLING_RATE
-vac = VoiceActivityController(use_vad_result = use_vad)
-asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model
-tokenizer = create_tokenizer(tgt_lan)
-online = SimpleASRProcessor(asr)
-stream = MicrophoneStream()
-stream = vac.detect_user_speech(stream, audio_in_int16 = False)
-stream = online.stream_process(stream)
-for isFinal, text in stream:
-    if isFinal:
-        print( text,  end="\r\n")
-    else:
-        print( text,  end="\r")

mic_test_whisper_streaming.py DELETED Viewed

@@ -1,71 +0,0 @@
-from microphone_stream import MicrophoneStream
-from voice_activity_controller import VoiceActivityController
-from whisper_online import *
-import numpy as np
-import librosa
-import io
-import soundfile
-import sys
-SAMPLING_RATE = 16000
-model = "large-v2"
-src_lan = "en"  # source language
-tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
-asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
-tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
-online = OnlineASRProcessor(asr, tokenizer)  # create processing object
-microphone_stream = MicrophoneStream()
-vad = VoiceActivityController(use_vad_result = use_vad_result)
-complete_text = ''
-final_processing_pending = False
-out = []
-out_len = 0
-for iter in vad.detect_user_speech(microphone_stream):   # processing loop:
-    raw_bytes=  iter[0]
-    is_final =  iter[1]
-    if  raw_bytes:
-        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
-        audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-        out.append(audio)
-        out_len += len(audio)
-    if (is_final or out_len >= min_sample_length) and out_len>0:
-        a = np.concatenate(out)
-        online.insert_audio_chunk(a)
-    if out_len > min_sample_length:
-        o = online.process_iter()
-        print('-----'*10)
-        complete_text = complete_text + o[2]
-        print('PARTIAL - '+ complete_text) # do something with current partial output
-        print('-----'*10)
-        out = []
-        out_len = 0
-    if is_final:
-        o = online.finish()
-        # final_processing_pending = False
-        print('-----'*10)
-        complete_text = complete_text + o[2]
-        print('FINAL - '+ complete_text) # do something with current partial output
-        print('-----'*10)
-        online.init()
-        out = []
-        out_len = 0

microphone_stream.py DELETED Viewed

@@ -1,82 +0,0 @@
-### mic stream
-import queue
-import re
-import sys
-import pyaudio
-class MicrophoneStream:
-    def __init__(
-        self,
-        sample_rate: int = 16000,
-    ):
-        """
-        Creates a stream of audio from the microphone.
-        Args:
-            chunk_size: The size of each chunk of audio to read from the microphone.
-            channels: The number of channels to record audio from.
-            sample_rate: The sample rate to record audio at.
-        """
-        try:
-            import pyaudio
-        except ImportError:
-            raise Exception('py audio not installed')
-        self._pyaudio = pyaudio.PyAudio()
-        self.sample_rate = sample_rate
-        self._chunk_size = int(self.sample_rate * 40  / 1000)
-        self._stream = self._pyaudio.open(
-            format=pyaudio.paInt16,
-            channels=1,
-            rate=sample_rate,
-            input=True,
-            frames_per_buffer=self._chunk_size,
-        )
-        self._open = True
-    def __iter__(self):
-        """
-        Returns the iterator object.
-        """
-        return self
-    def __next__(self):
-        """
-        Reads a chunk of audio from the microphone.
-        """
-        if not self._open:
-            raise StopIteration
-        try:
-            return self._stream.read(self._chunk_size)
-        except KeyboardInterrupt:
-            raise StopIteration
-    def close(self):
-        """
-        Closes the stream.
-        """
-        self._open = False
-        if self._stream.is_active():
-            self._stream.stop_stream()
-        self._stream.close()
-        self._pyaudio.terminate()