Commit 380c30d
Parent(s): 5ebbed3

Further tidying of print output, so by default there's little on the console

Changed files:
- whisper_online.py         +38 -34
- whisper_online_server.py   +4 -6
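
The change in both files follows one pattern: unconditional print(..., file=sys.stderr) diagnostics become calls into Python's standard logging module, which stay silent until a handler is configured. A minimal sketch of the pattern (illustrative only; the commit itself adds no logging.basicConfig call, so that choice is left to the caller):

    import logging
    import sys

    # Before: diagnostics always reached the console.
    #     print("transcribing buffer", file=sys.stderr)
    # After: routed through logging, suppressed under the default WARNING level.
    logging.debug("transcribing buffer")

    # A caller who wants the old verbose output can opt back in:
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    logging.debug("now visible on stderr")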
whisper_online.py (CHANGED)

(Removed lines whose content this view does not preserve appear below as bare "-" markers.)

@@ -4,6 +4,7 @@ import numpy as np
 import librosa
 from functools import lru_cache
 import time
+import logging
 
 
 
@@ -57,7 +58,7 @@ class WhisperTimestampedASR(ASRBase):
         from whisper_timestamped import transcribe_timestamped
         self.transcribe_timestamped = transcribe_timestamped
         if model_dir is not None:
-
+            logging.debug("ignoring model_dir, not implemented")
         return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
@@ -97,7 +98,7 @@ class FasterWhisperASR(ASRBase):
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
         from faster_whisper import WhisperModel
         if model_dir is not None:
-
+            logging.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
             model_size_or_path = model_dir
         elif modelsize is not None:
             model_size_or_path = modelsize
@@ -173,9 +174,11 @@ class HypothesisBuffer:
                         c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1])
                         tail = " ".join(self.new[j-1][2] for j in range(1,i+1))
                         if c == tail:
-
+                            words = []
                             for j in range(i):
-
+                                words.append(repr(self.new.pop(0)))
+                            words_msg = "\t".join(words)
+                            logging.debug(f"removing last {i} words: {words_msg}")
                             break
 
     def flush(self):
@@ -267,9 +270,9 @@ class OnlineASRProcessor:
         """
 
         prompt, non_prompt = self.prompt()
-
-
-
+        logging.debug(f"PROMPT: {prompt}")
+        logging.debug(f"CONTEXT: {non_prompt}")
+        logging.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
         # transform to [(beg,end,"word1"), ...]
@@ -278,8 +281,10 @@ class OnlineASRProcessor:
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         self.commited.extend(o)
-
-
+        completed = self.to_flush(o)
+        logging.debug(f">>>>COMPLETE NOW: {completed}")
+        the_rest = self.to_flush(self.transcript_buffer.complete())
+        logging.debug(f"INCOMPLETE: {the_rest}")
 
         # there is a newly confirmed text
 
@@ -303,18 +308,18 @@ class OnlineASRProcessor:
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1]
-
+            logging.debug(f"chunking segment")
             #self.chunk_at(t)
 
-
+        logging.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
         if self.commited == []: return
-
+        logging.debug(self.commited)
         sents = self.words_to_sentences(self.commited)
         for s in sents:
-
+            logging.debug(f"\t\tSENT: {s}")
         if len(sents) < 2:
             return
         while len(sents) > 2:
@@ -322,7 +327,7 @@ class OnlineASRProcessor:
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-
+        logging.debug(f"--- sentence chunked at {chunk_at:2.2f}")
        self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -339,12 +344,12 @@ class OnlineASRProcessor:
                 ends.pop(-1)
                 e = ends[-2]+self.buffer_time_offset
             if e <= t:
-
+                logging.debug(f"--- segment chunked at {e:2.2f}")
                 self.chunk_at(e)
             else:
-
+                logging.debug(f"--- last segment not within commited area")
         else:
-
+            logging.debug(f"--- not enough segments to chunk")
 
 
 
@@ -391,7 +396,7 @@ class OnlineASRProcessor:
         """
         o = self.transcript_buffer.complete()
         f = self.to_flush(o)
-
+        logging.debug("last, noncommited: {f}")
         return f
 
 
@@ -431,7 +436,7 @@ def create_tokenizer(lan):
 
     # the following languages are in Whisper, but not in wtpsplit:
     if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
-
+        logging.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
         lan = None
 
     from wtpsplit import WtP
@@ -476,20 +481,20 @@ if __name__ == "__main__":
     logfile = sys.stderr
 
     if args.offline and args.comp_unaware:
-
+        logging.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
         sys.exit(1)
 
     audio_path = args.audio_path
 
     SAMPLING_RATE = 16000
     duration = len(load_audio(audio_path))/SAMPLING_RATE
-
+    logging.info("Audio duration is: %2.2f seconds" % duration)
 
     size = args.model
     language = args.lan
 
     t = time.time()
-
+    logging.info(f"Loading Whisper {size} model for {language}...")
 
     if args.backend == "faster-whisper":
         asr_cls = FasterWhisperASR
@@ -506,10 +511,10 @@ if __name__ == "__main__":
 
 
     e = time.time()
-
+    logging.info(f"done. It took {round(e-t,2)} seconds.")
 
     if args.vad:
-
+        logging.info("setting VAD filter")
         asr.use_vad()
 
 
@@ -543,16 +548,15 @@ if __name__ == "__main__":
             print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
             print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
         else:
-            print(o,file=logfile,flush=True)
+            print("here?", o,file=logfile,flush=True)
 
     if args.offline: ## offline mode processing (for testing/debugging)
         a = load_audio(audio_path)
         online.insert_audio_chunk(a)
         try:
            o = online.process_iter()
-        except AssertionError:
-
-            pass
+        except AssertionError as e:
+            log.error(f"assertion error: {repr(e)}")
        else:
            output_transcript(o)
        now = None
@@ -563,13 +567,13 @@ if __name__ == "__main__":
            online.insert_audio_chunk(a)
            try:
                o = online.process_iter()
-            except AssertionError:
-
+            except AssertionError as e:
+                logging.error(f"assertion error: {repr(e)}")
                pass
            else:
                output_transcript(o, now=end)
 
-
+            logging.debug(f"## last processed {end:.2f}s")
 
            if end >= duration:
                break
@@ -595,13 +599,13 @@ if __name__ == "__main__":
 
        try:
            o = online.process_iter()
-        except AssertionError:
-
+        except AssertionError as e:
+            logging.error(f"assertion error: {e}")
            pass
        else:
            output_transcript(o)
        now = time.time() - start
-
+        logging.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
 
        if end >= duration:
            break
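
Two small things stand out in the hunks above. The new debug line in finish() reads logging.debug("last, noncommited: {f}") without an f-string prefix, so it logs the literal text {f} rather than the value of f, and the offline branch calls log.error(...) where the other branches use logging.error(...). Apart from that, every call site follows the same try/except/else shape: an AssertionError from process_iter() is logged and the loop continues, while successful iterations go to output_transcript(). A simplified sketch of that shape, with stand-in arguments for the script's objects:

    import logging

    def run_iteration(online, output_transcript):
        # Mirrors the __main__ loops above: a failed iteration is logged
        # instead of printed, and output is produced only on success.
        try:
            o = online.process_iter()
        except AssertionError as e:
            logging.error(f"assertion error: {repr(e)}")
        else:
            output_transcript(o)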
whisper_online_server.py (CHANGED)

@@ -39,6 +39,7 @@ logging.debug(f"Loading Whisper {size} model for {language}...")
 if args.backend == "faster-whisper":
     from faster_whisper import WhisperModel
     asr_cls = FasterWhisperASR
+    logging.getLogger("faster_whisper").setLevel(logging.WARNING)
 else:
     import whisper
     import whisper_timestamped
@@ -80,7 +81,7 @@ if os.path.exists(demo_audio_path):
     # warm up the ASR, because the very first transcribe takes much more time than the other
     asr.transcribe(a)
 else:
-    logging.
+    logging.debug("Whisper is not warmed up")
 
 
 ######### Server objects
@@ -135,8 +136,6 @@ class ServerProcessor:
         out = []
         while sum(len(x) for x in out) < self.min_chunk*SAMPLING_RATE:
             raw_bytes = self.connection.non_blocking_receive_audio()
-            print(raw_bytes[:10])
-            print(len(raw_bytes))
             if not raw_bytes:
                 break
             sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
@@ -167,7 +166,7 @@ class ServerProcessor:
             print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
             return "%1.0f %1.0f %s" % (beg,end,o[2])
         else:
-
+            # No text, so no output
             return None
 
     def send_result(self, o):
@@ -181,14 +180,13 @@ class ServerProcessor:
         while True:
             a = self.receive_audio_chunk()
             if a is None:
-                print("break here",file=sys.stderr)
                 break
             self.online_asr_proc.insert_audio_chunk(a)
             o = online.process_iter()
             try:
                 self.send_result(o)
             except BrokenPipeError:
-
+                logging.info("broken pipe -- connection closed?")
                 break
 
         # o = online.finish() # this should be working
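
The one-line addition for the faster-whisper backend uses the standard way to quiet a chatty third-party library: raising the threshold on that library's named logger drops its DEBUG/INFO records while its warnings and errors still propagate. A small illustration of the mechanism (standard-library behaviour, not repository code):

    import logging

    logging.basicConfig(level=logging.DEBUG)  # verbose root logger, for the demo

    # Raise only faster_whisper's threshold; the rest of the logger tree is untouched.
    logging.getLogger("faster_whisper").setLevel(logging.WARNING)

    logging.getLogger("faster_whisper").info("model init chatter")  # dropped
    logging.getLogger(__name__).debug("application debug line")     # shown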