Merge branch 'main' into ayo-logging-fixes
Browse files- README.md +1 -1
- line_packet.py +1 -2
- whisper_online.py +19 -18
- whisper_online_server.py +15 -22
README.md
CHANGED
@@ -183,7 +183,7 @@ online.init() # refresh if you're going to re-use the object for the next audio
|
|
183 |
|
184 |
### Server -- real-time from mic
|
185 |
|
186 |
-
`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection
|
187 |
|
188 |
Client example:
|
189 |
|
|
|
183 |
|
184 |
### Server -- real-time from mic
|
185 |
|
186 |
+
`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` for the TCP connection, and `--warmup-file`. See the help message (`-h` option) for details.
|
187 |
|
188 |
Client example:
|
189 |
|
line_packet.py
CHANGED
@@ -2,8 +2,6 @@
|
|
2 |
|
3 |
"""Functions for sending and receiving individual lines of text over a socket.
|
4 |
|
5 |
-
Used by marian-server-server.py to communicate with the Marian worker.
|
6 |
-
|
7 |
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
|
8 |
containing:
|
9 |
|
@@ -11,6 +9,7 @@ containing:
|
|
11 |
|
12 |
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
|
13 |
|
|
|
14 |
"""
|
15 |
|
16 |
PACKET_SIZE = 65536
|
|
|
2 |
|
3 |
"""Functions for sending and receiving individual lines of text over a socket.
|
4 |
|
|
|
|
|
5 |
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
|
6 |
containing:
|
7 |
|
|
|
9 |
|
10 |
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
|
11 |
|
12 |
+
Originally from the UEDIN team of the ELITR project.
|
13 |
"""
|
14 |
|
15 |
PACKET_SIZE = 65536
|
whisper_online.py
CHANGED
@@ -559,7 +559,7 @@ def add_shared_args(parser):
|
|
559 |
|
560 |
def asr_factory(args, logfile=sys.stderr):
|
561 |
"""
|
562 |
-
Creates and configures an ASR instance based on the specified backend and arguments.
|
563 |
"""
|
564 |
backend = args.backend
|
565 |
if backend == "openai-api":
|
@@ -584,8 +584,23 @@ def asr_factory(args, logfile=sys.stderr):
|
|
584 |
logging.info("Setting VAD filter")
|
585 |
asr.use_vad()
|
586 |
|
587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
588 |
|
|
|
|
|
|
|
|
|
589 |
## main:
|
590 |
|
591 |
if __name__ == "__main__":
|
@@ -613,27 +628,13 @@ if __name__ == "__main__":
|
|
613 |
duration = len(load_audio(audio_path))/SAMPLING_RATE
|
614 |
logging.info("Audio duration is: %2.2f seconds" % duration)
|
615 |
|
616 |
-
asr = asr_factory(args, logfile=logfile)
|
617 |
-
language = args.lan
|
618 |
-
|
619 |
-
if args.task == "translate":
|
620 |
-
asr.set_translate_task()
|
621 |
-
tgt_language = "en" # Whisper translates into English
|
622 |
-
else:
|
623 |
-
tgt_language = language # Whisper transcribes in this language
|
624 |
-
|
625 |
min_chunk = args.min_chunk_size
|
626 |
-
if args.buffer_trimming == "sentence":
|
627 |
-
tokenizer = create_tokenizer(tgt_language)
|
628 |
-
else:
|
629 |
-
tokenizer = None
|
630 |
-
online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
631 |
-
|
632 |
|
633 |
# load the audio into the LRU cache before we start the timer
|
634 |
a = load_audio_chunk(audio_path,0,1)
|
635 |
|
636 |
-
# warm up the ASR
|
637 |
asr.transcribe(a)
|
638 |
|
639 |
beg = args.start_at
|
|
|
559 |
|
560 |
def asr_factory(args, logfile=sys.stderr):
|
561 |
"""
|
562 |
+
Creates and configures an ASR instance and an OnlineASRProcessor based on the specified backend and arguments, and returns them as the pair (asr, online).
|
563 |
"""
|
564 |
backend = args.backend
|
565 |
if backend == "openai-api":
|
|
|
584 |
logging.info("Setting VAD filter")
|
585 |
asr.use_vad()
|
586 |
|
587 |
+
language = args.lan
|
588 |
+
if args.task == "translate":
|
589 |
+
asr.set_translate_task()
|
590 |
+
tgt_language = "en" # Whisper translates into English
|
591 |
+
else:
|
592 |
+
tgt_language = language # Whisper transcribes in this language
|
593 |
+
|
594 |
+
# Create the tokenizer
|
595 |
+
if args.buffer_trimming == "sentence":
|
596 |
+
tokenizer = create_tokenizer(tgt_language)
|
597 |
+
else:
|
598 |
+
tokenizer = None
|
599 |
|
600 |
+
# Create the OnlineASRProcessor
|
601 |
+
online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
602 |
+
|
603 |
+
return asr, online
|
604 |
## main:
|
605 |
|
606 |
if __name__ == "__main__":
|
|
|
628 |
duration = len(load_audio(audio_path))/SAMPLING_RATE
|
629 |
logging.info("Audio duration is: %2.2f seconds" % duration)
|
630 |
|
631 |
+
asr, online = asr_factory(args, logfile=logfile)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
632 |
min_chunk = args.min_chunk_size
|
|
|
|
|
|
|
|
|
|
|
|
|
633 |
|
634 |
# load the audio into the LRU cache before we start the timer
|
635 |
a = load_audio_chunk(audio_path,0,1)
|
636 |
|
637 |
+
# warm up the ASR because the very first transcribe takes much more time than the others
|
638 |
asr.transcribe(a)
|
639 |
|
640 |
beg = args.start_at
|
whisper_online_server.py
CHANGED
@@ -12,6 +12,8 @@ parser = argparse.ArgumentParser()
|
|
12 |
# server options
|
13 |
parser.add_argument("--host", type=str, default='localhost')
|
14 |
parser.add_argument("--port", type=int, default=43007)
|
|
|
|
|
15 |
|
16 |
parser.add_argument("-l", "--log-level", dest="log_level",
|
17 |
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
@@ -33,37 +35,28 @@ SAMPLING_RATE = 16000
|
|
33 |
|
34 |
size = args.model
|
35 |
language = args.lan
|
36 |
-
|
37 |
-
asr = asr_factory(args)
|
38 |
-
|
39 |
-
if args.task == "translate":
|
40 |
-
asr.set_translate_task()
|
41 |
-
tgt_language = "en"
|
42 |
-
else:
|
43 |
-
tgt_language = language
|
44 |
-
|
45 |
min_chunk = args.min_chunk_size
|
46 |
|
|
|
47 |
if args.buffer_trimming == "sentence":
|
48 |
tokenizer = create_tokenizer(tgt_language)
|
49 |
else:
|
50 |
tokenizer = None
|
51 |
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
if
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
asr.transcribe(a)
|
64 |
-
logging.debug("Whisper is warmed up")
|
65 |
else:
|
66 |
-
|
67 |
|
68 |
|
69 |
######### Server objects
|
|
|
12 |
# server options
|
13 |
parser.add_argument("--host", type=str, default='localhost')
|
14 |
parser.add_argument("--port", type=int, default=43007)
|
15 |
+
parser.add_argument("--warmup-file", type=str, dest="warmup_file",
|
16 |
+
help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
|
17 |
|
18 |
parser.add_argument("-l", "--log-level", dest="log_level",
|
19 |
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
|
|
35 |
|
36 |
size = args.model
|
37 |
language = args.lan
|
38 |
+
asr, online = asr_factory(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
min_chunk = args.min_chunk_size
|
40 |
|
41 |
+
|
42 |
if args.buffer_trimming == "sentence":
|
43 |
tokenizer = create_tokenizer(tgt_language)
|
44 |
else:
|
45 |
tokenizer = None
|
46 |
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
47 |
|
48 |
+
# warm up the ASR because the very first transcribe takes more time than the others.
|
49 |
+
# Test results in https://github.com/ufal/whisper_streaming/pull/81
|
50 |
+
msg = "Whisper is not warmed up. The first chunk processing may take longer."
|
51 |
+
if args.warmup_file:
|
52 |
+
if os.path.isfile(args.warmup_file):
|
53 |
+
a = load_audio_chunk(args.warmup_file,0,1)
|
54 |
+
asr.transcribe(a)
|
55 |
+
print("INFO: Whisper is warmed up.",file=sys.stderr)
|
56 |
+
else:
|
57 |
+
print("WARNING: The warm up file is not available. "+msg,file=sys.stderr)
|
|
|
|
|
58 |
else:
|
59 |
+
print("WARNING: " + msg, file=sys.stderr)
|
60 |
|
61 |
|
62 |
######### Server objects
|