unfork project, indicate files from whisper streaming
Browse files- README.md +5 -0
- line_packet.py +0 -93
- whisper_online_server.py +0 -184
README.md
CHANGED
@@ -12,6 +12,11 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str
|
|
12 |
|
13 |

|
14 |
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
## Installation
|
17 |
|
|
|
12 |
|
13 |

|
14 |
|
15 |
+
## Code Origins
|
16 |
+
|
17 |
+
This project reuses and extends code from the original Whisper Streaming repository:
|
18 |
+
- whisper_online.py: Contains code from whisper_streaming with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository.
|
19 |
+
- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
|
20 |
|
21 |
## Installation
|
22 |
|
line_packet.py
DELETED
@@ -1,93 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
|
3 |
-
"""Functions for sending and receiving individual lines of text over a socket.
|
4 |
-
|
5 |
-
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
|
6 |
-
containing:
|
7 |
-
|
8 |
-
- Zero or more bytes of UTF-8, excluding \n and \0, followed by
|
9 |
-
|
10 |
-
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
|
11 |
-
|
12 |
-
Originally from the UEDIN team of the ELITR project.
|
13 |
-
"""
|
14 |
-
|
15 |
-
PACKET_SIZE = 65536


def send_one_line(socket, text, pad_zeros=False):
    r"""Send the first line of ``text`` over ``socket``.

    The 'text' argument should contain a single line of text (line break
    characters are optional). Line boundaries are determined by Python's
    str.splitlines() function [1]. '\0' is also treated as a line terminator.
    If 'text' contains multiple lines then only the first will be sent.

    The transmitted bytes are the UTF-8 encoding of the first line followed
    by a single '\n'. Characters that cannot be encoded are replaced.

    If the send fails then an exception will be raised.

    [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines

    Args:
        socket: a socket object (anything providing ``sendall(bytes)``).
        text: string containing a line of text for transmission.
        pad_zeros: if true, append '\0' bytes after the newline so that the
            total number of bytes sent is a multiple of PACKET_SIZE.
    """
    # str.replace() returns a new string, so the result has to be kept;
    # otherwise '\0' is never treated as a line terminator.
    text = text.replace('\0', '\n')
    lines = text.splitlines()
    first_line = lines[0] if lines else ''
    # TODO Is there a better way of handling bad input than 'replace'?
    data = first_line.encode('utf-8', errors='replace') + b'\n'
    if pad_zeros:
        # One NUL always follows the newline; then fill the last packet up to
        # PACKET_SIZE (nothing more is added when it is already full).
        data += b'\0'
        remainder = len(data) % PACKET_SIZE
        if remainder:
            data += b'\0' * (PACKET_SIZE - remainder)
    for offset in range(0, len(data), PACKET_SIZE):
        socket.sendall(data[offset:offset + PACKET_SIZE])
|
47 |
-
|
48 |
-
|
49 |
-
def receive_one_line(socket):
    r"""Receive a single line of text from the given socket.

    Blocks, reading up to PACKET_SIZE bytes at a time, until a received
    packet contains a line terminator or the sender closes the connection.
    Both '\n' and '\0' end the line: '\0' is the padding byte of the padded
    packet format, while '\n' is the only terminator an unpadded sender
    emits, so waiting for '\0' alone would block until the peer disconnects.

    The string should not contain any newline characters, but if it does then
    only the first line is returned; the rest of the received data is dropped.
    Bytes that are not valid UTF-8 are replaced.

    Args:
        socket: a socket object (anything providing ``recv(bufsize)``).

    Returns:
        A string representing a single line with a terminating newline, or
        None if the connection was closed before a terminator arrived (any
        unterminated fragment received so far is discarded).
    """
    packets = []
    while True:
        packet = socket.recv(PACKET_SIZE)
        if not packet:  # Connection has been closed.
            return None
        packets.append(packet)
        if b'\n' in packet or b'\0' in packet:
            break
    # Join once at the end instead of growing a bytes object per packet.
    # TODO Is there a better way of handling bad input than 'replace'?
    text = b''.join(packets).decode('utf-8', errors='replace').strip('\0')
    return text.split('\n')[0] + '\n'
|
79 |
-
|
80 |
-
|
81 |
-
def receive_lines(socket):
    r"""Receive whatever text is currently available and split it into lines.

    Performs a single ``recv`` of up to PACKET_SIZE bytes, decodes it as
    UTF-8 (invalid bytes are replaced), strips leading and trailing '\0'
    bytes and splits the result on '\n'.

    Args:
        socket: a socket object (anything providing ``recv(bufsize)``).

    Returns:
        - [] if the socket is non-blocking and no data is available yet;
        - None if the connection has been closed, or if the received data
          was empty after stripping '\0' bytes;
        - otherwise the list of lines. The last element is '' when the data
          ended with a newline.
    """
    try:
        data = socket.recv(PACKET_SIZE)
    except BlockingIOError:
        return []
    # recv() reports a closed connection with an empty bytes object, not
    # None, so test for emptiness rather than identity with None.
    if not data:
        return None
    # TODO Is there a better way of handling bad input than 'replace'?
    text = data.decode('utf-8', errors='replace').strip('\0')
    lines = text.split('\n')
    if len(lines) == 1 and not lines[0]:
        return None
    return lines
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
whisper_online_server.py
DELETED
@@ -1,184 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
from whisper_online import *
|
3 |
-
|
4 |
-
import sys
|
5 |
-
import argparse
|
6 |
-
import os
|
7 |
-
import logging
|
8 |
-
import numpy as np
|
9 |
-
|
10 |
-
logger = logging.getLogger(__name__)
parser = argparse.ArgumentParser()

# Options specific to the server.
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=43007)
parser.add_argument(
    "--warmup-file",
    type=str,
    dest="warmup_file",
    help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .",
)

# Options shared with whisper_online.
add_shared_args(parser)
args = parser.parse_args()

set_logging(args, logger, other="")

# Build the whisper objects from the parsed arguments.

SAMPLING_RATE = 16000

size = args.model
language = args.lan
asr, online = asr_factory(args)
min_chunk = args.min_chunk_size

# Warm up the ASR because the very first transcribe takes more time than the
# others. Test results in https://github.com/ufal/whisper_streaming/pull/81
msg = "Whisper is not warmed up. The first chunk processing may take longer."
if not args.warmup_file:
    # No warm-up file given: carry on, but warn about the slow first chunk.
    logger.warning(msg)
elif os.path.isfile(args.warmup_file):
    a = load_audio_chunk(args.warmup_file, 0, 1)
    asr.transcribe(a)
    logger.info("Whisper is warmed up.")
else:
    # A warm-up file was requested but does not exist: treat as fatal.
    logger.critical("The warm up file is not available. "+msg)
    sys.exit(1)
|
47 |
-
|
48 |
-
|
49 |
-
######### Server objects
|
50 |
-
|
51 |
-
import line_packet
|
52 |
-
import socket
|
53 |
-
|
54 |
-
class Connection:
    '''Wraps a connected socket object for line output and raw audio input.'''

    # Maximum number of bytes requested per recv() call when reading audio.
    PACKET_SIZE = 32000*5*60 # 5 minutes # was: 65536

    def __init__(self, conn):
        '''Store the socket, reset the duplicate-line memory and put the
        socket into blocking mode.'''
        self.conn = conn
        self.last_line = ""

        self.conn.setblocking(True)

    def send(self, line):
        '''Send one line, unless it equals the previously sent line.

        It doesn't send the same line twice, because it was problematic in
        online-text-flow-events.
        '''
        if line != self.last_line:
            line_packet.send_one_line(self.conn, line)
            self.last_line = line

    def receive_lines(self):
        '''Return the result of line_packet.receive_lines() for this socket.'''
        return line_packet.receive_lines(self.conn)

    def non_blocking_receive_audio(self):
        '''Read up to PACKET_SIZE bytes from the socket.

        NOTE: despite the name, the socket was put into blocking mode in
        __init__, so this call blocks until data arrives.

        Returns whatever recv() returned, or None if the peer reset the
        connection.
        '''
        try:
            return self.conn.recv(self.PACKET_SIZE)
        except ConnectionResetError:
            return None
|
81 |
-
|
82 |
-
|
83 |
-
import io
|
84 |
-
import soundfile
|
85 |
-
|
86 |
-
# wraps socket and ASR object, and serves one client connection.
|
87 |
-
# next client should be served by a new instance of this object
|
88 |
-
class ServerProcessor:
    """Serves one client connection: reads audio, runs the online ASR on it
    and sends the formatted transcript back.

    Args:
        c: connection wrapper providing ``non_blocking_receive_audio()`` and
            ``send(line)``.
        online_asr_proc: online ASR processor providing ``init()``,
            ``insert_audio_chunk(audio)`` and ``process_iter()``.
        min_chunk: minimum amount of audio, in seconds, to collect before a
            chunk is handed to the ASR.
    """

    def __init__(self, c, online_asr_proc, min_chunk):
        self.connection = c
        self.online_asr_proc = online_asr_proc
        self.min_chunk = min_chunk

        # End timestamp (ms) of the previously emitted segment, if any.
        self.last_end = None

        # True until the first sufficiently long chunk has been returned.
        self.is_first = True

    def receive_audio_chunk(self):
        """Collect audio from the connection until at least ``min_chunk``
        seconds are available or the connection stops delivering data.

        Returns:
            The received audio concatenated into one float32 array, or None
            if nothing was received, or if the very first chunk is shorter
            than ``min_chunk`` seconds.
        """
        # receive all audio that is available by this time
        # blocks operation if less than self.min_chunk seconds is available
        # unblocks if connection is closed or a chunk is available
        out = []
        received = 0  # running sample count, avoids re-summing every loop
        minlimit = self.min_chunk*SAMPLING_RATE
        while received < minlimit:
            raw_bytes = self.connection.non_blocking_receive_audio()
            if not raw_bytes:
                break
            # The bytes are interpreted as raw mono little-endian 16-bit PCM.
            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
            audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
            out.append(audio)
            received += len(audio)
        if not out:
            return None
        conc = np.concatenate(out)
        if self.is_first and len(conc) < minlimit:
            return None
        self.is_first = False
        return conc

    def format_output_transcript(self,o):
        """Format one ASR result ``(beg, end, text)`` as ``"beg end text"``
        with the timestamps converted to milliseconds.

        Returns None (and logs a debug message) when ``o[0]`` is None.
        """
        # output format in stdout is like:
        # 0 1720 Takhle to je
        # - the first two words are:
        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
        # - the next words: segment transcript

        # This function differs from whisper_online.output_transcript in the following:
        # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it.
        # Therefore, beg, is max of previous end and current beg outputed by Whisper.
        # Usually it differs negligibly, by appx 20 ms.

        if o[0] is None:
            logger.debug("No text in this segment")
            return None

        beg, end = o[0]*1000,o[1]*1000
        if self.last_end is not None:
            beg = max(beg, self.last_end)

        self.last_end = end
        line = "%1.0f %1.0f %s" % (beg,end,o[2])
        # Echo the line to stderr as well as returning it.
        print(line,flush=True,file=sys.stderr)
        return line

    def send_result(self, o):
        """Format ``o`` and send it to the client if it contains text."""
        msg = self.format_output_transcript(o)
        if msg is not None:
            self.connection.send(msg)

    def process(self):
        """Handle one client connection until no more audio arrives or the
        client goes away."""
        self.online_asr_proc.init()
        while True:
            a = self.receive_audio_chunk()
            if a is None:
                break
            self.online_asr_proc.insert_audio_chunk(a)
            # Use the processor this instance was constructed with, not a
            # module-level global, so the class works with any processor.
            o = self.online_asr_proc.process_iter()
            try:
                self.send_result(o)
            except BrokenPipeError:
                logger.info("broken pipe -- connection closed?")
                break

        # TODO: flush the remaining hypothesis once this is verified to work:
        #        o = self.online_asr_proc.finish()
        #        self.send_result(o)
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
# server loop
|
171 |
-
|
172 |
-
# Accept clients one at a time and serve each with a fresh ServerProcessor.
# The try/finally makes the final log message reachable: the accept loop
# never exits normally, only through an exception (e.g. KeyboardInterrupt),
# which still propagates after the message is logged.
try:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((args.host, args.port))
        s.listen(1)
        logger.info('Listening on '+str((args.host, args.port)))
        while True:
            conn, addr = s.accept()
            logger.info('Connected to client on {}'.format(addr))
            # `with conn` closes the client socket even if processing raises,
            # so a failing session does not leak its socket.
            with conn:
                connection = Connection(conn)
                proc = ServerProcessor(connection, online, args.min_chunk_size)
                proc.process()
            logger.info('Connection to client closed')
finally:
    logger.info('Connection closed, terminating.')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|