Dominik Macháček commited on
Commit
819ac6c
·
1 Parent(s): 33369a9
Files changed (2) hide show
  1. README.md +13 -0
  2. whisper_online_server.py +212 -0
README.md CHANGED
@@ -110,6 +110,19 @@ print(o) # do something with the last output
110
  online.init() # refresh if you're going to re-use the object for the next audio
111
  ```
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
 
115
  ## Background
 
110
  online.init() # refresh if you're going to re-use the object for the next audio
111
  ```
112
 
113
+ ## Usage: Server
114
+
115
+ `whisper_online_server.py` entry point has the same model options as the entry point above, plus `--host` and `--port`, and no audio path.
116
+
117
+ Client example:
118
+
119
+ ```
120
+ arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43001
121
+ ```
122
+
123
+ - arecord is an example program that sends audio from a sound device, in raw audio format -- 16000 sampling rate, mono channel, S16\_LE -- signed 16-bit integer little endian
124
+
125
+ - nc is netcat, server host and port are e.g. localhost 43001
126
 
127
 
128
  ## Background
whisper_online_server.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from whisper_online import *
3
+
4
+ import sys
5
+ import argparse
6
+ import os
7
+ parser = argparse.ArgumentParser()
8
+
9
+ # server options
10
+ parser.add_argument("--host", type=str, default='localhost')
11
+ parser.add_argument("--port", type=int, default=43007)
12
+
13
+
14
+ # options from whisper_online
15
+ # TODO: code repetition
16
+
17
+ parser.add_argument('--min-chunk-size', type=float, default=1.0, help='Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.')
18
+ parser.add_argument('--model', type=str, default='large-v2', choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large".split(","),help="Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.")
19
+ parser.add_argument('--model_cache_dir', type=str, default=None, help="Overriding the default model cache dir where models downloaded from the hub are saved")
20
+ parser.add_argument('--model_dir', type=str, default=None, help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.")
21
+ parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
22
+ parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
23
+ parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
24
+ parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"],help='Load only this backend for Whisper processing.')
25
+ parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
26
+ parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
27
+ args = parser.parse_args()
28
+
29
+
30
+ # setting whisper object by args
31
+
32
+ SAMPLING_RATE = 16000
33
+
34
+ size = args.model
35
+ language = args.lan
36
+
37
+ t = time.time()
38
+ print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
39
+
40
+ if args.backend == "faster-whisper":
41
+ from faster_whisper import WhisperModel
42
+ asr_cls = FasterWhisperASR
43
+ else:
44
+ import whisper
45
+ import whisper_timestamped
46
+ # from whisper_timestamped_model import WhisperTimestampedASR
47
+ asr_cls = WhisperTimestampedASR
48
+
49
+ asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
50
+
51
+ if args.task == "translate":
52
+ asr.set_translate_task()
53
+
54
+ e = time.time()
55
+ print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
56
+
57
+ if args.vad:
58
+ print("setting VAD filter",file=sys.stderr)
59
+ asr.use_vad()
60
+
61
+
62
+ min_chunk = args.min_chunk_size
63
+ online = OnlineASRProcessor(language,asr)
64
+
65
+
66
+
67
+ demo_audio_path = "cs-maji-2.16k.wav"
68
+ if os.path.exists(demo_audio_path):
69
+ # load the audio into the LRU cache before we start the timer
70
+ a = load_audio_chunk(demo_audio_path,0,1)
71
+
72
+ # TODO: it should be tested whether it's meaningful
73
+ # warm up the ASR, because the very first transcribe takes much more time than the other
74
+ asr.transcribe(a)
75
+ else:
76
+ print("Whisper is not warmed up",file=sys.stderr)
77
+
78
+
79
+
80
+
81
+ ######### Server objects
82
+
83
+ import line_packet
84
+ import socket
85
+
86
+ import logging
87
+
88
+
89
+ class Connection:
90
+ '''it wraps conn object'''
91
+ PACKET_SIZE = 65536
92
+
93
+ def __init__(self, conn):
94
+ self.conn = conn
95
+ self.last_line = ""
96
+
97
+ self.conn.setblocking(True)
98
+
99
+ def send(self, line):
100
+ '''it doesn't send the same line twice, because it was problematic in online-text-flow-events'''
101
+ if line == self.last_line:
102
+ return
103
+ line_packet.send_one_line(self.conn, line)
104
+ self.last_line = line
105
+
106
+ def receive_lines(self):
107
+ in_line = line_packet.receive_lines(self.conn)
108
+ return in_line
109
+
110
+ def non_blocking_receive_audio(self):
111
+ r = self.conn.recv(self.PACKET_SIZE)
112
+ return r
113
+
114
+
115
+ import io
116
+ import soundfile
117
+
118
+ # wraps socket and ASR object, and serves one client connection.
119
+ # next client should be served by a new instance of this object
120
+ class ServerProcessor:
121
+
122
+ def __init__(self, c, online_asr_proc, min_chunk):
123
+ self.connection = c
124
+ self.online_asr_proc = online_asr_proc
125
+ self.min_chunk = min_chunk
126
+
127
+ self.last_end = None
128
+
129
+ def receive_audio_chunk(self):
130
+ # receive all audio that is available by this time
131
+ # blocks operation if less than self.min_chunk seconds is available
132
+ # unblocks if connection is closed or a chunk is available
133
+ out = []
134
+ while sum(len(x) for x in out) < self.min_chunk*SAMPLING_RATE:
135
+ raw_bytes = self.connection.non_blocking_receive_audio()
136
+ print(raw_bytes[:10])
137
+ print(len(raw_bytes))
138
+ if not raw_bytes:
139
+ break
140
+ sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
141
+ audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
142
+ out.append(audio)
143
+ if not out:
144
+ return None
145
+ return np.concatenate(out)
146
+
147
+ def format_output_transcript(self,o):
148
+ # output format in stdout is like:
149
+ # 0 1720 Takhle to je
150
+ # - the first two words are:
151
+ # - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
152
+ # - the next words: segment transcript
153
+
154
+ # This function differs from whisper_online.output_transcript in the following:
155
+ # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it.
156
+ # Therefore, beg, is max of previous end and current beg outputed by Whisper.
157
+ # Usually it differs negligibly, by appx 20 ms.
158
+
159
+ if o[0] is not None:
160
+ beg, end = o[0]*1000,o[1]*1000
161
+ if self.last_end is not None:
162
+ beg = max(beg, self.last_end)
163
+
164
+ self.last_end = end
165
+ print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
166
+ return "%1.0f %1.0f %s" % (beg,end,o[2])
167
+ else:
168
+ print(o,file=sys.stderr,flush=True)
169
+ return None
170
+
171
+ def send_result(self, o):
172
+ msg = self.format_output_transcript(o)
173
+ if msg is not None:
174
+ self.connection.send(msg)
175
+
176
+ def process(self):
177
+ # handle one client connection
178
+ self.online_asr_proc.init()
179
+ while True:
180
+ a = self.receive_audio_chunk()
181
+ if a is None:
182
+ print("break here",file=sys.stderr)
183
+ break
184
+ self.online_asr_proc.insert_audio_chunk(a)
185
+ o = online.process_iter()
186
+ self.send_result(o)
187
+ # o = online.finish() # this should be working
188
+ # self.send_result(o)
189
+
190
+
191
+
192
+
193
+ # Start logging.
194
+ level = logging.INFO
195
+ logging.basicConfig(level=level, format='whisper-server-%(levelname)s: %(message)s')
196
+
197
+ # server loop
198
+
199
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
200
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
201
+ s.bind((args.host, args.port))
202
+ s.listen(1)
203
+ logging.info('INFO: Listening on'+str((args.host, args.port)))
204
+ while True:
205
+ conn, addr = s.accept()
206
+ logging.info('INFO: Connected to client on {}'.format(addr))
207
+ connection = Connection(conn)
208
+ proc = ServerProcessor(connection, online, min_chunk)
209
+ proc.process()
210
+ conn.close()
211
+ logging.info('INFO: Connection to client closed')
212
+ logging.info('INFO: Connection closed, terminating.')