Dominik Macháček committed on
Commit
2ec2266
·
1 Parent(s): f390770

remove mic test and streams

Browse files
mic_test_whisper_simple.py DELETED
@@ -1,95 +0,0 @@
1
- from microphone_stream import MicrophoneStream
2
- from voice_activity_controller import VoiceActivityController
3
- from whisper_online import *
4
- import numpy as np
5
- import librosa
6
- import io
7
- import soundfile
8
- import sys
9
-
10
-
11
-
12
-
13
class SimpleASRProcessor:
    """Minimal streaming ASR driver: buffers audio chunks coming from a VAD
    iterator and transcribes the accumulated phrase with a whisper backend."""

    def __init__(self, asr, sampling_rate=16000):
        """Run this when starting or restarting processing.

        asr           -- whisper backend exposing .transcribe(audio, init_prompt=...)
        sampling_rate -- sample rate (Hz) of the incoming raw audio
        """
        self.audio_buffer = np.array([], dtype=np.float32)
        self.prompt_buffer = ""
        self.asr = asr
        self.sampling_rate = sampling_rate
        self.init_prompt = ''

    def ts_words(self, segments):
        """Concatenate the word texts of all segments, skipping segments
        whose no_speech_prob exceeds 0.9 (likely non-speech)."""
        return "".join(
            word.word
            for segment in segments
            if segment.no_speech_prob <= 0.9
            for word in segment.words
        )

    def stream_process(self, vad_result):
        """Consume (chunk, is_final) pairs from a VAD iterator.

        Yields (is_final, text): a partial transcript every 50 chunks inside
        a phrase, and the final transcript when the VAD marks phrase end.
        """
        iter_in_phrase = 0
        for chunk, is_final in vad_result:
            iter_in_phrase += 1

            if chunk is not None:
                # chunk is raw little-endian 16-bit PCM; decode to float32.
                # was: hard-coded module global SAMPLING_RATE -- now uses the
                # sampling_rate this instance was constructed with.
                sf = soundfile.SoundFile(
                    io.BytesIO(chunk), channels=1, endian="LITTLE",
                    samplerate=self.sampling_rate, subtype="PCM_16", format="RAW",
                )
                audio, _ = librosa.load(sf, sr=self.sampling_rate)
                self.audio_buffer = np.append(self.audio_buffer, audio)

            if is_final and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)

                # carry at most the last 100 characters as context for the
                # next phrase
                self.init_prompt = (self.init_prompt + tsw)[-100:]
                self.audio_buffer = np.array([], dtype=np.float32)
                iter_in_phrase = 0

                yield True, tsw
            # show progress every 50 chunks
            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                tsw = self.ts_words(res)
                yield False, tsw
63
-
64
-
65
-
66
-
67
-
68
-
69
-
70
SAMPLING_RATE = 16000

model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad = False
min_sample_length = 1 * SAMPLING_RATE


vac = VoiceActivityController(use_vad_result=use_vad)
# use the configured `model` name instead of a duplicated hard-coded string
asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model

tokenizer = create_tokenizer(tgt_lan)
online = SimpleASRProcessor(asr)


# pipeline: microphone bytes -> VAD-segmented chunks -> (is_final, text) pairs
stream = MicrophoneStream()
stream = vac.detect_user_speech(stream, audio_in_int16=False)
stream = online.stream_process(stream)

# final transcripts end the line (\r\n); partials overwrite in place (\r)
for is_final, text in stream:
    if is_final:
        print(text, end="\r\n")
    else:
        print(text, end="\r")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mic_test_whisper_streaming.py DELETED
@@ -1,71 +0,0 @@
1
- from microphone_stream import MicrophoneStream
2
- from voice_activity_controller import VoiceActivityController
3
- from whisper_online import *
4
- import numpy as np
5
- import librosa
6
- import io
7
- import soundfile
8
- import sys
9
-
10
-
11
SAMPLING_RATE = 16000
model = "large-v2"
src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
use_vad_result = True
min_sample_length = 1 * SAMPLING_RATE


asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
online = OnlineASRProcessor(asr, tokenizer)  # create processing object

microphone_stream = MicrophoneStream()
vad = VoiceActivityController(use_vad_result=use_vad_result)

complete_text = ''
out = []       # decoded float32 chunks awaiting insertion
out_len = 0    # total samples currently buffered in `out`
# `iter` renamed -- it shadowed the builtin; unpack the pair directly
for raw_bytes, is_final in vad.detect_user_speech(microphone_stream):  # processing loop
    if raw_bytes:
        # raw little-endian 16-bit PCM -> float32 at SAMPLING_RATE
        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
                                 samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
        audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
        out.append(audio)
        out_len += len(audio)

    if (is_final or out_len >= min_sample_length) and out_len > 0:
        a = np.concatenate(out)
        online.insert_audio_chunk(a)

        # BUGFIX: was `out_len > min_sample_length`, which left `out`
        # un-cleared when out_len == min_sample_length exactly, so the same
        # audio was inserted again on the next iteration.
        if out_len >= min_sample_length:
            o = online.process_iter()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('PARTIAL - ' + complete_text)  # do something with current partial output
            print('-----' * 10)
            out = []
            out_len = 0

        if is_final:
            o = online.finish()
            print('-----' * 10)
            complete_text = complete_text + o[2]
            print('FINAL - ' + complete_text)  # do something with current partial output
            print('-----' * 10)
            online.init()
            out = []
            out_len = 0
65
-
66
-
67
-
68
-
69
-
70
-
71
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
microphone_stream.py DELETED
@@ -1,82 +0,0 @@
1
-
2
-
3
- ### mic stream
4
-
5
- import queue
6
- import re
7
- import sys
8
- import pyaudio
9
-
10
-
11
class MicrophoneStream:
    """Iterator yielding raw 16-bit PCM byte chunks read from the default
    microphone via PyAudio."""

    def __init__(
        self,
        sample_rate: int = 16000,
    ):
        """
        Creates a stream of audio from the microphone.

        Args:
            sample_rate: The sample rate to record audio at.

        Raises:
            ImportError: if pyaudio is not installed.
        """
        try:
            import pyaudio
        except ImportError as e:
            # raise a specific, chained error instead of a bare Exception
            raise ImportError('pyaudio not installed') from e

        self._pyaudio = pyaudio.PyAudio()
        self.sample_rate = sample_rate

        # 40 ms of audio per chunk
        self._chunk_size = int(self.sample_rate * 40 / 1000)
        self._stream = self._pyaudio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=self._chunk_size,
        )

        self._open = True

    def __iter__(self):
        """
        Returns the iterator object.
        """
        return self

    def __next__(self):
        """
        Reads a chunk of audio from the microphone; raises StopIteration
        once the stream has been closed or reading is interrupted.
        """
        if not self._open:
            raise StopIteration

        try:
            return self._stream.read(self._chunk_size)
        except KeyboardInterrupt:
            raise StopIteration

    def close(self):
        """
        Closes the stream and releases the audio device.
        """
        self._open = False

        if self._stream.is_active():
            self._stream.stop_stream()

        self._stream.close()
        self._pyaudio.terminate()
74
-
75
-
76
-
77
-
78
-
79
-
80
-
81
-
82
-