Dominik Macháček committed
Commit 14c2bbe · Parent(s): 36bf3a3

removing duplicated code -- whisper_online_vac

Browse files:
- voice_activity_controller.py +1 -0
- whisper_online.py +64 -2
- whisper_online_server.py +1 -3
- whisper_online_vac.py +0 -203
voice_activity_controller.py CHANGED
@@ -48,6 +48,7 @@ class VoiceActivityController:
                 silence_in_wav)
 
         """
+        print("applying vad here")
        x = audio
        if not torch.is_tensor(x):
            try:
whisper_online.py CHANGED
@@ -517,6 +517,59 @@ class OnlineASRProcessor:
         e = offset + sents[-1][1]
         return (b,e,t)
 
+class VACOnlineASRProcessor(OnlineASRProcessor):
+    '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
+
+    It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
+    it runs VAD and continuously detects whether there is speech or not.
+    When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor end the utterance immediately.
+    '''
+
+    def __init__(self, online_chunk_size, *a, **kw):
+        self.online_chunk_size = online_chunk_size
+
+        self.online = OnlineASRProcessor(*a, **kw)
+        from voice_activity_controller import VoiceActivityController
+        self.vac = VoiceActivityController(use_vad_result = False)
+
+        self.logfile = self.online.logfile
+
+        self.init()
+
+    def init(self):
+        self.online.init()
+        self.vac.reset_states()
+        self.current_online_chunk_buffer_size = 0
+        self.is_currently_final = False
+
+
+    def insert_audio_chunk(self, audio):
+        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
+        audio, is_final = r
+        print(is_final)
+        self.is_currently_final = is_final
+        self.online.insert_audio_chunk(audio)
+        self.current_online_chunk_buffer_size += len(audio)
+
+    def process_iter(self):
+        if self.is_currently_final:
+            return self.finish()
+        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size:
+            self.current_online_chunk_buffer_size = 0
+            ret = self.online.process_iter()
+            return ret
+        else:
+            print("no online update, only VAD", file=self.logfile)
+            return (None, None, "")
+
+    def finish(self):
+        ret = self.online.finish()
+        self.online.init(keep_offset=True)
+        self.current_online_chunk_buffer_size = 0
+        return ret
+
+
+
 WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
 
 def create_tokenizer(lan):
@@ -561,6 +614,8 @@ def add_shared_args(parser):
     parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
     parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
+    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
+    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
     parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
@@ -607,7 +662,11 @@ def asr_factory(args, logfile=sys.stderr):
         tokenizer = None
 
     # Create the OnlineASRProcessor
-    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    if args.vac:
+
+        online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    else:
+        online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
     return asr, online
 
@@ -652,7 +711,10 @@ if __name__ == "__main__":
     logger.info("Audio duration is: %2.2f seconds" % duration)
 
     asr, online = asr_factory(args, logfile=logfile)
-    min_chunk = args.min_chunk_size
+    if args.vac:
+        min_chunk = args.vac_chunk_size
+    else:
+        min_chunk = args.min_chunk_size
 
     # load the audio into the LRU cache before we start the timer
     a = load_audio_chunk(audio_path,0,1)
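For orientation, here is a minimal driving loop for the VACOnlineASRProcessor added above. It is a sketch only: it assumes whisper_online.py from this commit is importable and that sample16k.wav is a 16 kHz mono file; the file name and model size are placeholders, not part of the commit.

import sys
from whisper_online import FasterWhisperASR, VACOnlineASRProcessor, load_audio

SAMPLING_RATE = 16000
vac_chunk_size = 0.04  # seconds per VAD iteration, the --vac-chunk-size default

# backend construction mirrors asr_factory; "tiny" and "en" are placeholders
asr = FasterWhisperASR(modelsize="tiny", lan="en")
online = VACOnlineASRProcessor(1.0,  # online_chunk_size, i.e. args.min_chunk_size in asr_factory
                               asr, None, logfile=sys.stderr,
                               buffer_trimming=("segment", 15))

audio = load_audio("sample16k.wav")
step = int(SAMPLING_RATE * vac_chunk_size)
for i in range(0, len(audio), step):
    # VAD runs on every small chunk; Whisper runs only once the buffered
    # speech exceeds online_chunk_size seconds, or VAD reports end of speech
    online.insert_audio_chunk(audio[i:i + step])
    beg, end, text = online.process_iter()
    if text:
        print("%1.0f %1.0f %s" % (beg * 1000, end * 1000, text))
o = online.finish()  # flush the remaining hypothesis

Note the split of responsibilities: with --vac the feeding loop steps by vac_chunk_size (0.04 s), while online_chunk_size still decides how much buffered speech triggers a Whisper pass.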
whisper_online_server.py CHANGED
@@ -13,8 +13,6 @@ parser = argparse.ArgumentParser()
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
-parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
-parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
 parser.add_argument("--warmup-file", type=str, dest="warmup_file",
         help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
@@ -108,7 +106,7 @@ class ServerProcessor:
             raw_bytes = self.connection.non_blocking_receive_audio()
             if not raw_bytes:
                 break
-            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
+#            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
             audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
             out.append(audio)
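The two options removed here are exactly the ones that moved into add_shared_args in whisper_online.py above; presumably the server builds its parser with add_shared_args and therefore keeps exposing them. A sketch of that assumed wiring:

import argparse
from whisper_online import add_shared_args

parser = argparse.ArgumentParser()
# server options
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=43007)
add_shared_args(parser)  # now contributes --vac and --vac-chunk-size centrally

args = parser.parse_args(["--vac"])
assert args.vac is True
assert args.vac_chunk_size == 0.04  # argparse maps --vac-chunk-size to vac_chunk_size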
whisper_online_vac.py DELETED
@@ -1,203 +0,0 @@
-from whisper_online import *
-from voice_activity_controller import *
-import soundfile
-import io
-
-SAMPLING_RATE = 16000
-
-class VACOnlineASRProcessor(OnlineASRProcessor):
-
-    def __init__(self, online_chunk_size, *a, **kw):
-        self.online_chunk_size = online_chunk_size
-
-        self.online = OnlineASRProcessor(*a, **kw)
-        self.vac = VoiceActivityController(use_vad_result = False)
-
-        self.logfile = self.online.logfile
-
-        self.init()
-
-    def init(self):
-        self.online.init()
-        self.vac.reset_states()
-        self.current_online_chunk_buffer_size = 0
-        self.is_currently_final = False
-
-
-    def insert_audio_chunk(self, audio):
-        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        audio, is_final = r
-        print(is_final)
-        self.is_currently_final = is_final
-        self.online.insert_audio_chunk(audio)
-        self.current_online_chunk_buffer_size += len(audio)
-
-    def process_iter(self):
-        if self.is_currently_final:
-            return self.finish()
-        elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
-            self.current_online_chunk_buffer_size = 0
-            ret = self.online.process_iter()
-            return ret
-        else:
-            print("no online update, only VAD", file=self.logfile)
-            return (None, None, "")
-
-    def finish(self):
-        ret = self.online.finish()
-        self.online.init(keep_offset=True)
-        self.current_online_chunk_buffer_size = 0
-        return ret
-
-
-
-
-if __name__ == "__main__":
-
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
-    add_shared_args(parser)
-    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
-    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
-    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = sys.stderr
-
-    if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
-        sys.exit(1)
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
-
-    size = args.model
-    language = args.lan
-
-    t = time.time()
-    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
-
-    if args.backend == "faster-whisper":
-        asr_cls = FasterWhisperASR
-    else:
-        asr_cls = WhisperTimestampedASR
-
-    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en"  # Whisper translates into English
-    else:
-        tgt_language = language  # Whisper transcribes in this language
-
-
-    e = time.time()
-    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
-
-    if args.vad:
-        print("setting VAD filter",file=logfile)
-        asr.use_vad()
-
-
-    min_chunk = args.vac_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path,0,1)
-
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time()-beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        # - emission time from beginning of processing, in milliseconds
-        # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time()-start
-        if o[0] is not None:
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
-        else:
-            print(o,file=logfile,flush=True)
-
-    if args.offline: ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware: # computational unaware mode
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path,beg,end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else: # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end+min_chunk:
-                time.sleep(min_chunk+end-now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path,beg,end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)
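With the standalone script deleted, its simulation entry point is covered by whisper_online.py plus the new flags. A rough equivalent of the old `python3 whisper_online_vac.py audio.wav` invocation, with the audio path as a placeholder:

import subprocess

# Sketch only: flags come from the diff above; audio.wav is a placeholder.
subprocess.run([
    "python3", "whisper_online.py", "audio.wav",
    "--vac",                     # wrap the processor in VACOnlineASRProcessor
    "--vac-chunk-size", "0.04",  # VAD iteration size in seconds (the default)
], check=True)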