qfuxa committed on
Commit
0553b75
·
1 Parent(s): baa0172

unfork project, indicate files from whisper streaming

Browse files
Files changed (3) hide show
  1. README.md +5 -0
  2. line_packet.py +0 -93
  3. whisper_online_server.py +0 -184
README.md CHANGED
@@ -12,6 +12,11 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str
12
 
13
  ![Demo Screenshot](src/demo.png)
14
 
 
 
 
 
 
15
 
16
  ## Installation
17
 
 
12
 
13
  ![Demo Screenshot](src/demo.png)
14
 
15
+ ## Code Origins
16
+
17
+ This project reuses and extends code from the original Whisper Streaming repository:
18
+ - whisper_online.py: Contains code from whisper_streaming with the addition of the **MLX Whisper** backend for Apple Silicon, which is not present in the original repository.
19
+ - silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
20
 
21
  ## Installation
22
 
line_packet.py DELETED
@@ -1,93 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- """Functions for sending and receiving individual lines of text over a socket.
4
-
5
- A line is transmitted using one or more fixed-size packets of UTF-8 bytes
6
- containing:
7
-
8
- - Zero or more bytes of UTF-8, excluding \n and \0, followed by
9
-
10
- - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
11
-
12
- Originally from the UEDIN team of the ELITR project.
13
- """
14
-
15
- PACKET_SIZE = 65536
16
-
17
-
18
- def send_one_line(socket, text, pad_zeros=False):
19
- """Sends a line of text over the given socket.
20
-
21
- The 'text' argument should contain a single line of text (line break
22
- characters are optional). Line boundaries are determined by Python's
23
- str.splitlines() function [1]. We also count '\0' as a line terminator.
24
- If 'text' contains multiple lines then only the first will be sent.
25
-
26
- If the send fails then an exception will be raised.
27
-
28
- [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines
29
-
30
- Args:
31
- socket: a socket object.
32
- text: string containing a line of text for transmission.
33
- """
34
- text.replace('\0', '\n')
35
- lines = text.splitlines()
36
- first_line = '' if len(lines) == 0 else lines[0]
37
- # TODO Is there a better way of handling bad input than 'replace'?
38
- data = first_line.encode('utf-8', errors='replace') + b'\n' + (b'\0' if pad_zeros else b'')
39
- for offset in range(0, len(data), PACKET_SIZE):
40
- bytes_remaining = len(data) - offset
41
- if bytes_remaining < PACKET_SIZE:
42
- padding_length = PACKET_SIZE - bytes_remaining
43
- packet = data[offset:] + (b'\0' * padding_length if pad_zeros else b'')
44
- else:
45
- packet = data[offset:offset+PACKET_SIZE]
46
- socket.sendall(packet)
47
-
48
-
49
- def receive_one_line(socket):
50
- """Receives a line of text from the given socket.
51
-
52
- This function will (attempt to) receive a single line of text. If data is
53
- currently unavailable then it will block until data becomes available or
54
- the sender has closed the connection (in which case it will return
55
- None).
56
-
57
- The string should not contain any newline characters, but if it does then
58
- only the first line will be returned.
59
-
60
- Args:
61
- socket: a socket object.
62
-
63
- Returns:
64
- A string representing a single line with a terminating newline or
65
- None if the connection has been closed.
66
- """
67
- data = b''
68
- while True:
69
- packet = socket.recv(PACKET_SIZE)
70
- if not packet: # Connection has been closed.
71
- return None
72
- data += packet
73
- if b'\0' in packet:
74
- break
75
- # TODO Is there a better way of handling bad input than 'replace'?
76
- text = data.decode('utf-8', errors='replace').strip('\0')
77
- lines = text.split('\n')
78
- return lines[0] + '\n'
79
-
80
-
81
- def receive_lines(socket):
82
- try:
83
- data = socket.recv(PACKET_SIZE)
84
- except BlockingIOError:
85
- return []
86
- if data is None: # Connection has been closed.
87
- return None
88
- # TODO Is there a better way of handling bad input than 'replace'?
89
- text = data.decode('utf-8', errors='replace').strip('\0')
90
- lines = text.split('\n')
91
- if len(lines)==1 and not lines[0]:
92
- return None
93
- return lines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
whisper_online_server.py DELETED
@@ -1,184 +0,0 @@
1
- #!/usr/bin/env python3
2
- from whisper_online import *
3
-
4
- import sys
5
- import argparse
6
- import os
7
- import logging
8
- import numpy as np
9
-
10
- logger = logging.getLogger(__name__)
11
- parser = argparse.ArgumentParser()
12
-
13
- # server options
14
- parser.add_argument("--host", type=str, default='localhost')
15
- parser.add_argument("--port", type=int, default=43007)
16
- parser.add_argument("--warmup-file", type=str, dest="warmup_file",
17
- help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
18
-
19
- # options from whisper_online
20
- add_shared_args(parser)
21
- args = parser.parse_args()
22
-
23
- set_logging(args,logger,other="")
24
-
25
- # setting whisper object by args
26
-
27
- SAMPLING_RATE = 16000
28
-
29
- size = args.model
30
- language = args.lan
31
- asr, online = asr_factory(args)
32
- min_chunk = args.min_chunk_size
33
-
34
- # warm up the ASR because the very first transcribe takes more time than the others.
35
- # Test results in https://github.com/ufal/whisper_streaming/pull/81
36
- msg = "Whisper is not warmed up. The first chunk processing may take longer."
37
- if args.warmup_file:
38
- if os.path.isfile(args.warmup_file):
39
- a = load_audio_chunk(args.warmup_file,0,1)
40
- asr.transcribe(a)
41
- logger.info("Whisper is warmed up.")
42
- else:
43
- logger.critical("The warm up file is not available. "+msg)
44
- sys.exit(1)
45
- else:
46
- logger.warning(msg)
47
-
48
-
49
- ######### Server objects
50
-
51
- import line_packet
52
- import socket
53
-
54
- class Connection:
55
- '''it wraps conn object'''
56
- PACKET_SIZE = 32000*5*60 # 5 minutes # was: 65536
57
-
58
- def __init__(self, conn):
59
- self.conn = conn
60
- self.last_line = ""
61
-
62
- self.conn.setblocking(True)
63
-
64
- def send(self, line):
65
- '''it doesn't send the same line twice, because it was problematic in online-text-flow-events'''
66
- if line == self.last_line:
67
- return
68
- line_packet.send_one_line(self.conn, line)
69
- self.last_line = line
70
-
71
- def receive_lines(self):
72
- in_line = line_packet.receive_lines(self.conn)
73
- return in_line
74
-
75
- def non_blocking_receive_audio(self):
76
- try:
77
- r = self.conn.recv(self.PACKET_SIZE)
78
- return r
79
- except ConnectionResetError:
80
- return None
81
-
82
-
83
- import io
84
- import soundfile
85
-
86
- # wraps socket and ASR object, and serves one client connection.
87
- # next client should be served by a new instance of this object
88
- class ServerProcessor:
89
-
90
- def __init__(self, c, online_asr_proc, min_chunk):
91
- self.connection = c
92
- self.online_asr_proc = online_asr_proc
93
- self.min_chunk = min_chunk
94
-
95
- self.last_end = None
96
-
97
- self.is_first = True
98
-
99
- def receive_audio_chunk(self):
100
- # receive all audio that is available by this time
101
- # blocks operation if less than self.min_chunk seconds is available
102
- # unblocks if connection is closed or a chunk is available
103
- out = []
104
- minlimit = self.min_chunk*SAMPLING_RATE
105
- while sum(len(x) for x in out) < minlimit:
106
- raw_bytes = self.connection.non_blocking_receive_audio()
107
- if not raw_bytes:
108
- break
109
- # print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
110
- sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
111
- audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
112
- out.append(audio)
113
- if not out:
114
- return None
115
- conc = np.concatenate(out)
116
- if self.is_first and len(conc) < minlimit:
117
- return None
118
- self.is_first = False
119
- return np.concatenate(out)
120
-
121
- def format_output_transcript(self,o):
122
- # output format in stdout is like:
123
- # 0 1720 Takhle to je
124
- # - the first two words are:
125
- # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
126
- # - the next words: segment transcript
127
-
128
- # This function differs from whisper_online.output_transcript in the following:
129
- # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it.
130
- # Therefore, beg is the max of the previous end and the current beg output by Whisper.
131
- # Usually it differs negligibly, by approx. 20 ms.
132
-
133
- if o[0] is not None:
134
- beg, end = o[0]*1000,o[1]*1000
135
- if self.last_end is not None:
136
- beg = max(beg, self.last_end)
137
-
138
- self.last_end = end
139
- print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
140
- return "%1.0f %1.0f %s" % (beg,end,o[2])
141
- else:
142
- logger.debug("No text in this segment")
143
- return None
144
-
145
- def send_result(self, o):
146
- msg = self.format_output_transcript(o)
147
- if msg is not None:
148
- self.connection.send(msg)
149
-
150
- def process(self):
151
- # handle one client connection
152
- self.online_asr_proc.init()
153
- while True:
154
- a = self.receive_audio_chunk()
155
- if a is None:
156
- break
157
- self.online_asr_proc.insert_audio_chunk(a)
158
- o = online.process_iter()
159
- try:
160
- self.send_result(o)
161
- except BrokenPipeError:
162
- logger.info("broken pipe -- connection closed?")
163
- break
164
-
165
- # o = online.finish() # this should be working
166
- # self.send_result(o)
167
-
168
-
169
-
170
- # server loop
171
-
172
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
173
- s.bind((args.host, args.port))
174
- s.listen(1)
175
- logger.info('Listening on'+str((args.host, args.port)))
176
- while True:
177
- conn, addr = s.accept()
178
- logger.info('Connected to client on {}'.format(addr))
179
- connection = Connection(conn)
180
- proc = ServerProcessor(connection, online, args.min_chunk_size)
181
- proc.process()
182
- conn.close()
183
- logger.info('Connection to client closed')
184
- logger.info('Connection closed, terminating.')