Dominik Macháček committed on
Commit 14c2bbe · 1 Parent(s): 36bf3a3

removing duplicated code -- whisper_online_vac

voice_activity_controller.py CHANGED
@@ -48,6 +48,7 @@ class VoiceActivityController:
             silence_in_wav)
 
         """
+        print("applying vad here")
         x = audio
         if not torch.is_tensor(x):
            try:
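
For context before the larger diff below: the VACOnlineASRProcessor that moves into whisper_online.py uses this class only through reset_states() and detect_speech_iter(). A stub sketch of that contract, assuming 16 kHz float32 input; the stub's internals are illustrative, not the real silero-VAD logic:

import numpy as np

class VoiceActivityControllerStub:
    # Illustrative stand-in for the interface consumed by VACOnlineASRProcessor.
    def __init__(self, use_vad_result=False):
        self.use_vad_result = use_vad_result

    def reset_states(self):
        # the real class clears its internal VAD state between utterances
        pass

    def detect_speech_iter(self, audio, audio_in_int16=False):
        # the real method runs VAD on the chunk and returns the audio together
        # with a flag that turns True after ~500 ms of non-voice
        is_final = False
        return audio, is_final

chunk = np.zeros(640, dtype=np.float32)  # one 0.04 s frame at 16 kHz
audio, is_final = VoiceActivityControllerStub().detect_speech_iter(chunk, audio_in_int16=False)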
whisper_online.py CHANGED
@@ -517,6 +517,59 @@ class OnlineASRProcessor:
517
  e = offset + sents[-1][1]
518
  return (b,e,t)
519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
521
 
522
  def create_tokenizer(lan):
@@ -561,6 +614,8 @@ def add_shared_args(parser):
561
  parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
562
  parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
563
  parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
 
 
564
  parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
565
  parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
566
  parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
@@ -607,7 +662,11 @@ def asr_factory(args, logfile=sys.stderr):
607
  tokenizer = None
608
 
609
  # Create the OnlineASRProcessor
610
- online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
 
 
611
 
612
  return asr, online
613
 
@@ -652,7 +711,10 @@ if __name__ == "__main__":
652
  logger.info("Audio duration is: %2.2f seconds" % duration)
653
 
654
  asr, online = asr_factory(args, logfile=logfile)
655
- min_chunk = args.min_chunk_size
 
 
 
656
 
657
  # load the audio into the LRU cache before we start the timer
658
  a = load_audio_chunk(audio_path,0,1)
 
517
  e = offset + sents[-1][1]
518
  return (b,e,t)
519
 
520
+ class VACOnlineASRProcessor(OnlineASRProcessor):
521
+ '''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
522
+
523
+ It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
524
+ it runs VAD and continuously detects whether there is speech or not.
525
+ When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately.
526
+ '''
527
+
528
+ def __init__(self, online_chunk_size, *a, **kw):
529
+ self.online_chunk_size = online_chunk_size
530
+
531
+ self.online = OnlineASRProcessor(*a, **kw)
532
+ from voice_activity_controller import VoiceActivityController
533
+ self.vac = VoiceActivityController(use_vad_result = False)
534
+
535
+ self.logfile = self.online.logfile
536
+
537
+ self.init()
538
+
539
+ def init(self):
540
+ self.online.init()
541
+ self.vac.reset_states()
542
+ self.current_online_chunk_buffer_size = 0
543
+ self.is_currently_final = False
544
+
545
+
546
+ def insert_audio_chunk(self, audio):
547
+ r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
548
+ audio, is_final = r
549
+ print(is_final)
550
+ self.is_currently_final = is_final
551
+ self.online.insert_audio_chunk(audio)
552
+ self.current_online_chunk_buffer_size += len(audio)
553
+
554
+ def process_iter(self):
555
+ if self.is_currently_final:
556
+ return self.finish()
557
+ elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size:
558
+ self.current_online_chunk_buffer_size = 0
559
+ ret = self.online.process_iter()
560
+ return ret
561
+ else:
562
+ print("no online update, only VAD", file=self.logfile)
563
+ return (None, None, "")
564
+
565
+ def finish(self):
566
+ ret = self.online.finish()
567
+ self.online.init(keep_offset=True)
568
+ self.current_online_chunk_buffer_size = 0
569
+ return ret
570
+
571
+
572
+
573
  WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")
574
 
575
  def create_tokenizer(lan):
 
614
  parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
615
  parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
616
  parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"],help='Load only this backend for Whisper processing.')
617
+ parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
618
+ parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
619
  parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
620
  parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
621
  parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.')
 
662
  tokenizer = None
663
 
664
  # Create the OnlineASRProcessor
665
+ if args.vac:
666
+
667
+ online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
668
+ else:
669
+ online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
670
 
671
  return asr, online
672
 
 
711
  logger.info("Audio duration is: %2.2f seconds" % duration)
712
 
713
  asr, online = asr_factory(args, logfile=logfile)
714
+ if args.vac:
715
+ min_chunk = args.vac_chunk_size
716
+ else:
717
+ min_chunk = args.min_chunk_size
718
 
719
  # load the audio into the LRU cache before we start the timer
720
  a = load_audio_chunk(audio_path,0,1)
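
For orientation, a minimal usage sketch of the relocated VACOnlineASRProcessor. It relies only on constructors already defined in whisper_online.py; the model size, language, and the silent test frame are illustrative assumptions, not part of this commit:

import numpy as np
from whisper_online import FasterWhisperASR, VACOnlineASRProcessor

SAMPLING_RATE = 16000
asr = FasterWhisperASR("en", "large-v2")  # source language, model size

# online_chunk_size plays the role of args.min_chunk_size in asr_factory above
online = VACOnlineASRProcessor(1.0, asr, None, buffer_trimming=("segment", 15))

# feed small frames (--vac-chunk-size defaults to 0.04 s); VAD runs on every one
frame = np.zeros(int(0.04 * SAMPLING_RATE), dtype=np.float32)
online.insert_audio_chunk(frame)
beg, end, text = online.process_iter()  # (None, None, "") until enough speech is buffered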
whisper_online_server.py CHANGED
@@ -13,8 +13,6 @@ parser = argparse.ArgumentParser()
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
-parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller.')
-parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
 parser.add_argument("--warmup-file", type=str, dest="warmup_file",
         help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
@@ -108,7 +106,7 @@ class ServerProcessor:
             raw_bytes = self.connection.non_blocking_receive_audio()
             if not raw_bytes:
                 break
-            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
+            # print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
             audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
             out.append(audio)
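
Note that the two options removed here do not disappear from the server: the add_shared_args hunk in whisper_online.py above now registers them for every script that calls it, this server included. A quick sketch of the resulting parser (the parsed values are illustrative):

import argparse
from whisper_online import add_shared_args

parser = argparse.ArgumentParser()
add_shared_args(parser)  # after this commit, also registers --vac and --vac-chunk-size
args = parser.parse_args(["--vac", "--vac-chunk-size", "0.04"])
print(args.vac, args.vac_chunk_size)  # True 0.04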
whisper_online_vac.py DELETED
@@ -1,203 +0,0 @@
-from whisper_online import *
-from voice_activity_controller import *
-import soundfile
-import io
-
-SAMPLING_RATE = 16000
-
-class VACOnlineASRProcessor(OnlineASRProcessor):
-
-    def __init__(self, online_chunk_size, *a, **kw):
-        self.online_chunk_size = online_chunk_size
-
-        self.online = OnlineASRProcessor(*a, **kw)
-        self.vac = VoiceActivityController(use_vad_result = False)
-
-        self.logfile = self.online.logfile
-
-        self.init()
-
-    def init(self):
-        self.online.init()
-        self.vac.reset_states()
-        self.current_online_chunk_buffer_size = 0
-        self.is_currently_final = False
-
-
-    def insert_audio_chunk(self, audio):
-        r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
-        audio, is_final = r
-        print(is_final)
-        self.is_currently_final = is_final
-        self.online.insert_audio_chunk(audio)
-        self.current_online_chunk_buffer_size += len(audio)
-
-    def process_iter(self):
-        if self.is_currently_final:
-            return self.finish()
-        elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
-            self.current_online_chunk_buffer_size = 0
-            ret = self.online.process_iter()
-            return ret
-        else:
-            print("no online update, only VAD", file=self.logfile)
-            return (None, None, "")
-
-    def finish(self):
-        ret = self.online.finish()
-        self.online.init(keep_offset=True)
-        self.current_online_chunk_buffer_size = 0
-        return ret
-
-
-
-
-if __name__ == "__main__":
-
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
-    add_shared_args(parser)
-    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
-    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
-    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = sys.stderr
-
-    if args.offline and args.comp_unaware:
-        print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
-        sys.exit(1)
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path))/SAMPLING_RATE
-    print("Audio duration is: %2.2f seconds" % duration, file=logfile)
-
-    size = args.model
-    language = args.lan
-
-    t = time.time()
-    print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
-
-    if args.backend == "faster-whisper":
-        asr_cls = FasterWhisperASR
-    else:
-        asr_cls = WhisperTimestampedASR
-
-    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en" # Whisper translates into English
-    else:
-        tgt_language = language # Whisper transcribes in this language
-
-
-    e = time.time()
-    print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
-
-    if args.vad:
-        print("setting VAD filter",file=logfile)
-        asr.use_vad()
-
-
-    min_chunk = args.vac_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path,0,1)
-
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time()-beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        # - emission time from beginning of processing, in milliseconds
-        # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time()-start
-        if o[0] is not None:
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
-            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
-        else:
-            print(o,file=logfile,flush=True)
-
-    if args.offline: ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError:
-            print("assertion error",file=logfile)
-            pass
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware: # computational unaware mode
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path,beg,end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            print(f"## last processed {end:.2f}s",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else: # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end+min_chunk:
-                time.sleep(min_chunk+end-now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path,beg,end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError:
-                print("assertion error",file=logfile)
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)
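
The stdout format documented in output_transcript above survives this deletion: whisper_online.py's __main__, now reached with the --vac flag, prints the same lines. A small sketch of reading one, using the sample line from the comment:

# "4186.3606 0 1720 Takhle to je" = emission time (ms), segment beg/end (ms), transcript
line = "4186.3606 0 1720 Takhle to je"
emission_ms, beg_ms, end_ms, text = line.split(" ", 3)
print(float(emission_ms), float(beg_ms), float(end_ms), text)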