regularfry committed
Commit 97a4ebd · Parent(s): 2ba48bc

Construct an explicit logger rather than using the root logger

Files changed (2):
  1. whisper_online.py (+31 -29)
  2. whisper_online_server.py (+7 -12)
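
The change applies the standard Python pattern of a module-level named logger: each file creates a logger with logging.getLogger(__name__) once and routes all of its messages through it, instead of calling the root-logger convenience functions (logging.debug, logging.info, ...). A minimal sketch of the pattern as used throughout the diffs below (the helper function is hypothetical, for illustration only):

import logging

# One named logger per module. __name__ resolves to "whisper_online" when the
# module is imported, and to "__main__" when the file is executed directly.
logger = logging.getLogger(__name__)

def transcribe_chunk(audio_samples):
    # Hypothetical call site mirroring the logger.debug(...) lines in the diff.
    logger.debug("transcribing %d samples", len(audio_samples))
    return []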
whisper_online.py CHANGED
@@ -11,6 +11,8 @@ import io
 import soundfile as sf
 import math
 
+ logger = logging.getLogger(__name__)
+
 @lru_cache
 def load_audio(fname):
 a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
@@ -65,7 +67,7 @@ class WhisperTimestampedASR(ASRBase):
 from whisper_timestamped import transcribe_timestamped
 self.transcribe_timestamped = transcribe_timestamped
 if model_dir is not None:
- logging.debug("ignoring model_dir, not implemented")
+ logger.debug("ignoring model_dir, not implemented")
 return whisper.load_model(modelsize, download_root=cache_dir)
 
 def transcribe(self, audio, init_prompt=""):
@@ -106,7 +108,7 @@ class FasterWhisperASR(ASRBase):
 from faster_whisper import WhisperModel
 logging.getLogger("faster_whisper").setLevel(logging.WARNING)
 if model_dir is not None:
- logging.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
+ logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
 model_size_or_path = model_dir
 elif modelsize is not None:
 model_size_or_path = modelsize
@@ -229,7 +231,7 @@ class OpenaiApiASR(ASRBase):
 
 # Process transcription/translation
 transcript = proc.create(**params)
- logging.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
+ logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
 
 return transcript
 
@@ -276,7 +278,7 @@ class HypothesisBuffer:
 for j in range(i):
 words.append(repr(self.new.pop(0)))
 words_msg = "\t".join(words)
- logging.debug(f"removing last {i} words: {words_msg}")
+ logger.debug(f"removing last {i} words: {words_msg}")
 break
 
 def flush(self):
@@ -365,9 +367,9 @@ class OnlineASRProcessor:
 """
 
 prompt, non_prompt = self.prompt()
- logging.debug(f"PROMPT: {prompt}")
- logging.debug(f"CONTEXT: {non_prompt}")
- logging.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
+ logger.debug(f"PROMPT: {prompt}")
+ logger.debug(f"CONTEXT: {non_prompt}")
+ logger.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
 res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
 # transform to [(beg,end,"word1"), ...]
@@ -377,9 +379,9 @@ class OnlineASRProcessor:
 o = self.transcript_buffer.flush()
 self.commited.extend(o)
 completed = self.to_flush(o)
- logging.debug(f">>>>COMPLETE NOW: {completed}")
+ logger.debug(f">>>>COMPLETE NOW: {completed}")
 the_rest = self.to_flush(self.transcript_buffer.complete())
- logging.debug(f"INCOMPLETE: {the_rest}")
+ logger.debug(f"INCOMPLETE: {the_rest}")
 
 # there is a newly confirmed text
 
@@ -403,18 +405,18 @@ class OnlineASRProcessor:
 #while k>0 and self.commited[k][1] > l:
 # k -= 1
 #t = self.commited[k][1]
- logging.debug(f"chunking segment")
+ logger.debug(f"chunking segment")
 #self.chunk_at(t)
 
- logging.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
+ logger.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
 return self.to_flush(o)
 
 def chunk_completed_sentence(self):
 if self.commited == []: return
- logging.debug(self.commited)
+ logger.debug(self.commited)
 sents = self.words_to_sentences(self.commited)
 for s in sents:
- logging.debug(f"\t\tSENT: {s}")
+ logger.debug(f"\t\tSENT: {s}")
 if len(sents) < 2:
 return
 while len(sents) > 2:
@@ -422,7 +424,7 @@ class OnlineASRProcessor:
 # we will continue with audio processing at this timestamp
 chunk_at = sents[-2][1]
 
- logging.debug(f"--- sentence chunked at {chunk_at:2.2f}")
+ logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
 self.chunk_at(chunk_at)
 
 def chunk_completed_segment(self, res):
@@ -439,12 +441,12 @@ class OnlineASRProcessor:
 ends.pop(-1)
 e = ends[-2]+self.buffer_time_offset
 if e <= t:
- logging.debug(f"--- segment chunked at {e:2.2f}")
+ logger.debug(f"--- segment chunked at {e:2.2f}")
 self.chunk_at(e)
 else:
- logging.debug(f"--- last segment not within commited area")
+ logger.debug(f"--- last segment not within commited area")
 else:
- logging.debug(f"--- not enough segments to chunk")
+ logger.debug(f"--- not enough segments to chunk")
 
 
 
@@ -490,7 +492,7 @@ class OnlineASRProcessor:
 """
 o = self.transcript_buffer.complete()
 f = self.to_flush(o)
- logging.debug("last, noncommited: {f}")
+ logger.debug("last, noncommited: {f}")
 return f
 
 
@@ -530,7 +532,7 @@ def create_tokenizer(lan):
 
 # the following languages are in Whisper, but not in wtpsplit:
 if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
- logging.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
+ logger.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
 lan = None
 
 from wtpsplit import WtP
@@ -563,7 +565,7 @@ def asr_factory(args, logfile=sys.stderr):
 """
 backend = args.backend
 if backend == "openai-api":
- logging.debug("Using OpenAI API.")
+ logger.debug("Using OpenAI API.")
 asr = OpenaiApiASR(lan=args.lan)
 else:
 if backend == "faster-whisper":
@@ -574,14 +576,14 @@ def asr_factory(args, logfile=sys.stderr):
 # Only for FasterWhisperASR and WhisperTimestampedASR
 size = args.model
 t = time.time()
- logging.debug(f"Loading Whisper {size} model for {args.lan}...")
+ logger.debug(f"Loading Whisper {size} model for {args.lan}...")
 asr = asr_cls(modelsize=size, lan=args.lan, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
 e = time.time()
- logging.debug(f"done. It took {round(e-t,2)} seconds.")
+ logger.debug(f"done. It took {round(e-t,2)} seconds.")
 
 # Apply common configurations
 if getattr(args, 'vad', False): # Checks if VAD argument is present and True
- logging.info("Setting VAD filter")
+ logger.info("Setting VAD filter")
 asr.use_vad()
 
 language = args.lan
@@ -619,14 +621,14 @@ if __name__ == "__main__":
 logfile = sys.stderr
 
 if args.offline and args.comp_unaware:
- logging.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
+ logger.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
 sys.exit(1)
 
 audio_path = args.audio_path
 
 SAMPLING_RATE = 16000
 duration = len(load_audio(audio_path))/SAMPLING_RATE
- logging.info("Audio duration is: %2.2f seconds" % duration)
+ logger.info("Audio duration is: %2.2f seconds" % duration)
 
 asr, online = asr_factory(args, logfile=logfile)
 min_chunk = args.min_chunk_size
@@ -674,12 +676,12 @@ if __name__ == "__main__":
 try:
 o = online.process_iter()
 except AssertionError as e:
- logging.error(f"assertion error: {repr(e)}")
+ logger.error(f"assertion error: {repr(e)}")
 pass
 else:
 output_transcript(o, now=end)
 
- logging.debug(f"## last processed {end:.2f}s")
+ logger.debug(f"## last processed {end:.2f}s")
 
 if end >= duration:
 break
@@ -706,12 +708,12 @@ if __name__ == "__main__":
 try:
 o = online.process_iter()
 except AssertionError as e:
- logging.error(f"assertion error: {e}")
+ logger.error(f"assertion error: {e}")
 pass
 else:
 output_transcript(o)
 now = time.time() - start
- logging.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
+ logger.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")
 
 if end >= duration:
 break
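
Note that a named logger still propagates its records to the root logger, so nothing below WARNING becomes visible until the application installs a handler. This commit does not add that configuration; a minimal sketch of what an entry point could do (assumed, not taken from the repository):

import logging

# Assumed entry-point setup: attach a stderr handler and level to the root
# logger so the logger.debug()/logger.info() calls above actually produce output.
logging.basicConfig(level=logging.DEBUG,
                    format="%(levelname)s %(name)s: %(message)s")

logging.getLogger("whisper_online").debug("now visible on stderr")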
whisper_online_server.py CHANGED
@@ -7,6 +7,8 @@ import os
 import logging
 import numpy as np
 
+ logger = logging.getLogger(__name__)
+ print(__name__)
 parser = argparse.ArgumentParser()
 
 # server options
@@ -38,13 +40,6 @@ language = args.lan
 asr, online = asr_factory(args)
 min_chunk = args.min_chunk_size
 
-
- if args.buffer_trimming == "sentence":
- tokenizer = create_tokenizer(tgt_language)
- else:
- tokenizer = None
- online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
 # warm up the ASR because the very first transcribe takes more time than the others.
 # Test results in https://github.com/ufal/whisper_streaming/pull/81
 msg = "Whisper is not warmed up. The first chunk processing may take longer."
@@ -161,7 +156,7 @@ class ServerProcessor:
 try:
 self.send_result(o)
 except BrokenPipeError:
- logging.info("broken pipe -- connection closed?")
+ logger.info("broken pipe -- connection closed?")
 break
 
 # o = online.finish() # this should be working
@@ -175,13 +170,13 @@ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 s.bind((args.host, args.port))
 s.listen(1)
- logging.info('Listening on'+str((args.host, args.port)))
+ logger.info('Listening on'+str((args.host, args.port)))
 while True:
 conn, addr = s.accept()
- logging.info('Connected to client on {}'.format(addr))
+ logger.info('Connected to client on {}'.format(addr))
 connection = Connection(conn)
 proc = ServerProcessor(connection, online, min_chunk)
 proc.process()
 conn.close()
- logging.info('Connection to client closed')
- logging.info('Connection closed, terminating.')
+ logger.info('Connection to client closed')
+ logger.info('Connection closed, terminating.')
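
One practical benefit of the explicit loggers is that verbosity can now be tuned per module rather than only globally on the root logger. A small illustration (the specific levels below are an assumption, not part of this commit):

import logging

# Assumed configuration showing the per-module control that named loggers allow.
logging.basicConfig(level=logging.INFO)                         # application default
logging.getLogger("whisper_online").setLevel(logging.DEBUG)     # verbose ASR internals
logging.getLogger("faster_whisper").setLevel(logging.WARNING)   # keep the dependency quiet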