regularfry committed
Commit 2ba48bc · 2 Parent(s): 2afc97d dcddb17

Merge branch 'main' into ayo-logging-fixes

Files changed (4)
  1. README.md +1 -1
  2. line_packet.py +1 -2
  3. whisper_online.py +19 -18
  4. whisper_online_server.py +15 -22
README.md CHANGED
@@ -183,7 +183,7 @@ online.init() # refresh if you're going to re-use the object for the next audio
 
 ### Server -- real-time from mic
 
-`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection. See help message (`-h` option).
+`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection and the `--warmup-file`. See the help message (`-h` option).
 
 Client example:
 
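In practice the updated README sentence corresponds to an invocation along these lines. The sketch below is illustrative, not text from the commit; the warm-up wav path is a stand-in for the jfk.wav sample the new help text points to.

# Illustrative launch of the server with the new --warmup-file option.
import subprocess

subprocess.run([
    "python3", "whisper_online_server.py",
    "--host", "localhost",               # argparse default, shown for clarity
    "--port", "43007",                   # argparse default, shown for clarity
    "--warmup-file", "samples/jfk.wav",  # any short speech wav; this path is hypothetical
])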
line_packet.py CHANGED
@@ -2,8 +2,6 @@
 
 """Functions for sending and receiving individual lines of text over a socket.
 
-Used by marian-server-server.py to communicate with the Marian worker.
-
 A line is transmitted using one or more fixed-size packets of UTF-8 bytes
 containing:
 
@@ -11,6 +9,7 @@ containing:
 
 - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
 
+Originally from the UEDIN team of the ELITR project.
 """
 
 PACKET_SIZE = 65536
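For reference, the framing the docstring describes works roughly as in the sketch below. It is written from the docstring alone, not copied from line_packet.py, and it assumes a line fits in a single packet.

# Sketch of the packet framing described above: one line of UTF-8 text,
# padded with \0 bytes up to PACKET_SIZE.
import socket

PACKET_SIZE = 65536

def send_one_line(sock: socket.socket, text: str) -> None:
    # Strip any trailing newline, re-add exactly one, then pad with \0 to PACKET_SIZE.
    data = text.rstrip("\n").encode("utf-8") + b"\n"
    data += b"\0" * (PACKET_SIZE - len(data))
    sock.sendall(data)

def receive_one_line(sock: socket.socket) -> str:
    # Read exactly one packet, then drop the \0 padding and the trailing newline.
    buf = b""
    while len(buf) < PACKET_SIZE:
        chunk = sock.recv(PACKET_SIZE - len(buf))
        if not chunk:
            break  # peer closed the connection early
        buf += chunk
    return buf.rstrip(b"\0").decode("utf-8").rstrip("\n")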
whisper_online.py CHANGED
@@ -559,7 +559,7 @@ def add_shared_args(parser):
 
 def asr_factory(args, logfile=sys.stderr):
     """
-    Creates and configures an ASR instance based on the specified backend and arguments.
+    Creates and configures an ASR and ASR Online instance based on the specified backend and arguments.
     """
     backend = args.backend
     if backend == "openai-api":
@@ -584,8 +584,23 @@ def asr_factory(args, logfile=sys.stderr):
         logging.info("Setting VAD filter")
         asr.use_vad()
 
-    return asr
+    language = args.lan
+    if args.task == "translate":
+        asr.set_translate_task()
+        tgt_language = "en" # Whisper translates into English
+    else:
+        tgt_language = language # Whisper transcribes in this language
+
+    # Create the tokenizer
+    if args.buffer_trimming == "sentence":
+        tokenizer = create_tokenizer(tgt_language)
+    else:
+        tokenizer = None
 
+    # Create the OnlineASRProcessor
+    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+
+    return asr, online
 ## main:
 
 if __name__ == "__main__":
@@ -613,27 +628,13 @@ if __name__ == "__main__":
     duration = len(load_audio(audio_path))/SAMPLING_RATE
     logging.info("Audio duration is: %2.2f seconds" % duration)
 
-    asr = asr_factory(args, logfile=logfile)
-    language = args.lan
-
-    if args.task == "translate":
-        asr.set_translate_task()
-        tgt_language = "en" # Whisper translates into English
-    else:
-        tgt_language = language # Whisper transcribes in this language
-
+    asr, online = asr_factory(args, logfile=logfile)
     min_chunk = args.min_chunk_size
-    if args.buffer_trimming == "sentence":
-        tokenizer = create_tokenizer(tgt_language)
-    else:
-        tokenizer = None
-    online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
 
     # load the audio into the LRU cache before we start the timer
     a = load_audio_chunk(audio_path,0,1)
 
-    # warm up the ASR, because the very first transcribe takes much more time than the other
+    # warm up the ASR because the very first transcribe takes much more time than the other
     asr.transcribe(a)
 
     beg = args.start_at
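Net effect of these hunks: asr_factory now builds the tokenizer and the OnlineASRProcessor itself and returns both objects, so callers only unpack the pair. A minimal usage sketch follows; the OnlineASRProcessor method names (insert_audio_chunk, process_iter, finish) are taken from upstream whisper_streaming, and the argument values and audio.wav file are illustrative.

# Sketch only; argument values and the input file are illustrative.
import argparse
import sys
from whisper_online import add_shared_args, asr_factory, load_audio_chunk

parser = argparse.ArgumentParser()
add_shared_args(parser)
args = parser.parse_args(["--lan", "en", "--min-chunk-size", "1.0"])

# The factory now returns both the raw ASR backend and the online wrapper.
asr, online = asr_factory(args, logfile=sys.stderr)

beg = 0.0
while True:
    chunk = load_audio_chunk("audio.wav", beg, beg + args.min_chunk_size)
    if len(chunk) == 0:
        break
    online.insert_audio_chunk(chunk)
    print(online.process_iter(), flush=True)  # newly committed (start, end, text)
    beg += args.min_chunk_size
print(online.finish())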
whisper_online_server.py CHANGED
@@ -12,6 +12,8 @@ parser = argparse.ArgumentParser()
 # server options
 parser.add_argument("--host", type=str, default='localhost')
 parser.add_argument("--port", type=int, default=43007)
+parser.add_argument("--warmup-file", type=str, dest="warmup_file",
+        help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
 
 parser.add_argument("-l", "--log-level", dest="log_level",
                     choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
@@ -33,37 +35,28 @@ SAMPLING_RATE = 16000
 
 size = args.model
 language = args.lan
-
-asr = asr_factory(args)
-
-if args.task == "translate":
-    asr.set_translate_task()
-    tgt_language = "en"
-else:
-    tgt_language = language
-
+asr, online = asr_factory(args)
 min_chunk = args.min_chunk_size
 
+
 if args.buffer_trimming == "sentence":
     tokenizer = create_tokenizer(tgt_language)
 else:
     tokenizer = None
 online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
-
-
-demo_audio_path = "cs-maji-2.16k.wav"
-if os.path.exists(demo_audio_path):
-    # load the audio into the LRU cache before we start the timer
-    logging.debug(f"Warming up on {demo_audio_path}")
-    a = load_audio_chunk(demo_audio_path,0,1)
-
-    # TODO: it should be tested whether it's meaningful
-    # warm up the ASR, because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-    logging.debug("Whisper is warmed up")
+# warm up the ASR because the very first transcribe takes more time than the others.
+# Test results in https://github.com/ufal/whisper_streaming/pull/81
+msg = "Whisper is not warmed up. The first chunk processing may take longer."
+if args.warmup_file:
+    if os.path.isfile(args.warmup_file):
+        a = load_audio_chunk(args.warmup_file,0,1)
+        asr.transcribe(a)
+        print("INFO: Whisper is warmed up.",file=sys.stderr)
+    else:
+        print("WARNING: The warm up file is not available. "+msg,file=sys.stderr)
 else:
-    logging.debug("Whisper is not warmed up")
+    print("WARNING: " + msg, file=sys.stderr)
 
 
 ######### Server objects
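For completeness, the warmed-up server is exercised by any client that streams raw 16 kHz, 16-bit mono PCM to --host/--port and reads text back. The sketch below is not part of the commit; the wire-format assumptions (raw PCM in, \0-padded text lines out, per line_packet.py) and the soundfile dependency are assumptions of this example.

# Hedged client sketch: stream a 16 kHz mono wav to the server and print replies.
import socket
import soundfile as sf  # assumed helper for reading the wav; any PCM source works

HOST, PORT = "localhost", 43007   # the server's argparse defaults
SAMPLING_RATE = 16000
CHUNK_SECONDS = 1.0

audio, sr = sf.read("input.16k.wav", dtype="int16")  # hypothetical input file
assert sr == SAMPLING_RATE, "the server expects 16 kHz audio"

with socket.create_connection((HOST, PORT)) as sock:
    sock.settimeout(0.1)
    step = int(CHUNK_SECONDS * SAMPLING_RATE)
    for start in range(0, len(audio), step):
        sock.sendall(audio[start:start + step].tobytes())
        try:
            data = sock.recv(65536)   # PACKET_SIZE used by line_packet.py
            if data:
                print(data.rstrip(b"\0").decode("utf-8", errors="replace"))
        except socket.timeout:
            pass  # no transcript committed yet for this chunk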