File size: 7,251 Bytes
819ac6c 6fa0080 819ac6c aa51e39 819ac6c 13fd21a 819ac6c 13fd21a 819ac6c 13fd21a 819ac6c 2625be1 819ac6c 13fd21a 819ac6c aa51e39 6fa0080 13fd21a 6fa0080 13fd21a 819ac6c 726fa57 819ac6c 726fa57 819ac6c 8849581 819ac6c 8849581 819ac6c 8849581 819ac6c 8849581 819ac6c 260b1f8 819ac6c 13fd21a 819ac6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
#!/usr/bin/env python3
from whisper_online import *
import sys
import argparse
import os
# Command-line interface: the server-specific options are declared here, the
# remaining options are the ones shared with whisper_online.
parser = argparse.ArgumentParser()

# server options
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=43007)
parser.add_argument(
    "--vac",
    action="store_true",
    default=False,
    help="Use VAC = voice activity controller.",
)
parser.add_argument(
    "--vac-chunk-size",
    type=float,
    default=0.04,
    help="VAC sample size in seconds.",
)

# options from whisper_online
add_shared_args(parser)

args = parser.parse_args()
# Build the Whisper ASR object and the online processor selected by the
# command-line arguments.
SAMPLING_RATE = 16000

size = args.model
language = args.lan

t = time.time()
print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)

# Reject unsupported backends before importing anything backend-specific.
if args.backend not in ("faster-whisper", "whisper_timestamped"):
    raise ValueError(f"Unknown {args.backend=}")

if args.backend == "faster-whisper":
    from faster_whisper import WhisperModel
    asr_cls = FasterWhisperASR
else:
    import whisper
    from whisper_online import WhisperTimestampedASR
    asr_cls = WhisperTimestampedASR

asr = asr_cls(
    modelsize=size,
    lan=language,
    cache_dir=args.model_cache_dir,
    model_dir=args.model_dir,
)

# When translating, the output text is English, otherwise it stays in the
# source language; the tokenizer below is created for the output language.
if args.task == "translate":
    asr.set_translate_task()
tgt_language = "en" if args.task == "translate" else language

print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)

if args.vad:
    print("setting VAD filter",file=sys.stderr)
    asr.use_vad()

# A sentence tokenizer is only created for the "sentence" trimming mode.
tokenizer = create_tokenizer(tgt_language) if args.buffer_trimming == "sentence" else None

if args.vac:
    from whisper_online_vac import VACOnlineASRProcessor
    online = VACOnlineASRProcessor(
        args.min_chunk_size,
        asr,
        tokenizer,
        buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
    )
else:
    from whisper_online import OnlineASRProcessor
    online = OnlineASRProcessor(
        asr,
        tokenizer,
        buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
    )
# Optional warm-up: run one transcription on a demo recording, if the file is
# present in the working directory.
demo_audio_path = "cs-maji-2.16k.wav"
if not os.path.exists(demo_audio_path):
    print("Whisper is not warmed up",file=sys.stderr)
else:
    # load the audio into the LRU cache before we start the timer
    a = load_audio_chunk(demo_audio_path,0,1)

    # TODO: it should be tested whether it's meaningful
    # The very first transcribe call takes much more time than the later
    # ones, so it is spent here rather than on the first client.
    asr.transcribe(a)
######### Server objects
import line_packet
import socket
import logging
class Connection:
    '''Thin wrapper around a connected socket object.

    Text lines are exchanged through the line_packet module; audio is read
    as raw bytes directly from the socket.
    '''

    # Upper bound on the bytes requested by a single recv() call.
    PACKET_SIZE = 5 * 60 * 32000  # 5 minutes # was: 65536

    def __init__(self, conn):
        self.conn = conn
        # Last line handed to send(); used to suppress immediate repeats.
        self.last_line = ""
        self.conn.setblocking(True)

    def send(self, line):
        '''Send one line, unless it equals the previously sent line.

        Sending the same line twice was problematic in online-text-flow-events.
        '''
        if line != self.last_line:
            line_packet.send_one_line(self.conn, line)
            self.last_line = line

    def receive_lines(self):
        '''Return whatever line_packet.receive_lines reads from the socket.'''
        return line_packet.receive_lines(self.conn)

    def non_blocking_receive_audio(self):
        '''Read up to PACKET_SIZE raw bytes from the socket.

        Despite the name, the socket was switched to blocking mode in
        __init__. Returns None when the peer reset the connection.
        '''
        try:
            data = self.conn.recv(self.PACKET_SIZE)
        except ConnectionResetError:
            return None
        return data
import io
import soundfile
# Wraps a client connection and an online ASR processor, and serves exactly one
# client connection. The next client should be served by a new instance.
class ServerProcessor:
    '''Read raw audio from one client, feed it to the ASR and send transcripts back.'''

    def __init__(self, c, online_asr_proc, min_chunk):
        '''
        c: connection wrapper providing non_blocking_receive_audio() and send(line)
        online_asr_proc: online ASR processor providing init(),
            insert_audio_chunk(audio) and process_iter()
        min_chunk: minimum amount of audio, in seconds, to gather before a
            chunk is handed to the ASR
        '''
        self.connection = c
        self.online_asr_proc = online_asr_proc
        self.min_chunk = min_chunk

        # End timestamp (ms) of the last segment sent; None before the first one.
        self.last_end = None
        self.is_first = True
        # Dangling first byte of a 16-bit sample that was split between two reads.
        self._leftover = b""

    def receive_audio_chunk(self):
        '''Gather audio from the connection until at least min_chunk seconds are available.

        Blocks while less than self.min_chunk seconds of audio has been
        received; unblocks when enough audio arrived or when the connection
        yields no more data.

        Returns the concatenated audio array, or None when nothing was
        received, or when the very first chunk is shorter than min_chunk.
        '''
        out = []
        received = 0  # running number of samples gathered so far
        minlimit = self.min_chunk*SAMPLING_RATE
        while received < minlimit:
            raw_bytes = self.connection.non_blocking_receive_audio()
            if not raw_bytes:
                # empty read or None: the connection is closed or was reset
                break
            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])

            # A stream socket may cut a 16-bit sample in half. Keep the odd
            # trailing byte for the next read so the samples stay aligned.
            raw_bytes = self._leftover + raw_bytes
            usable = len(raw_bytes) - len(raw_bytes) % 2
            self._leftover = raw_bytes[usable:]
            raw_bytes = raw_bytes[:usable]
            if not raw_bytes:
                continue

            # The payload is headerless 16-bit little-endian mono PCM at SAMPLING_RATE.
            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
            audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
            out.append(audio)
            received += len(audio)
        if not out:
            return None
        conc = np.concatenate(out)
        # Only the first chunk is required to reach the minimum length; later
        # ones may be shorter when the stream ends.
        if self.is_first and len(conc) < minlimit:
            return None
        self.is_first = False
        return conc

    def format_output_transcript(self,o):
        '''Format one ASR result o = (beg, end, text) as a line for the client.

        The line looks like:

            0 1720 Takhle to je

        - the first two words are the beg and end timestamp of the text
          segment in milliseconds, as estimated by the Whisper model. The
          timestamps are not accurate, but they're useful anyway
        - the next words: segment transcript

        This function differs from whisper_online.output_transcript in the
        following: succeeding [beg,end] intervals are not overlapping, because
        the ELITR protocol (implemented in online-text-flow events) requires
        it. Therefore beg is the max of the previous end and the current beg
        output by Whisper. Usually it differs negligibly, by approx. 20 ms.

        The line is also echoed to stderr. Returns None (after echoing o to
        stderr) when o carries no timestamp, i.e. o[0] is None.
        '''
        if o[0] is None:
            print(o,file=sys.stderr,flush=True)
            return None

        beg, end = o[0]*1000,o[1]*1000
        if self.last_end is not None:
            beg = max(beg, self.last_end)
        self.last_end = end

        msg = "%1.0f %1.0f %s" % (beg,end,o[2])
        print(msg,flush=True,file=sys.stderr)
        return msg

    def send_result(self, o):
        '''Format the ASR result o and send it, if there is anything to send.'''
        msg = self.format_output_transcript(o)
        if msg is not None:
            self.connection.send(msg)

    def process(self):
        '''Handle one client connection until it stops delivering audio.'''
        self.online_asr_proc.init()
        while True:
            a = self.receive_audio_chunk()
            if a is None:
                print("break here",file=sys.stderr)
                break
            self.online_asr_proc.insert_audio_chunk(a)
            # Use the processor given to this instance, not a module-level global.
            o = self.online_asr_proc.process_iter()
            try:
                self.send_result(o)
            except BrokenPipeError:
                print("broken pipe -- connection closed?",file=sys.stderr)
                break

        # TODO: flushing the remaining hypothesis is disabled for now:
        #     o = self.online_asr_proc.finish()  # this should be working
        #     self.send_result(o)
# Start logging.
level = logging.INFO
logging.basicConfig(level=level, format='whisper-server-%(levelname)s: %(message)s')

# Server loop: accept clients one at a time and serve each of them with a
# fresh ServerProcessor around the shared online ASR processor.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind((args.host, args.port))
    s.listen(1)
    logging.info('INFO: Listening on'+str((args.host, args.port)))
    while True:
        conn, addr = s.accept()
        logging.info('INFO: Connected to client on {}'.format(addr))
        # The context manager closes the client socket even when processing
        # raises, so a failing client does not leak its socket.
        with conn:
            connection = Connection(conn)
            proc = ServerProcessor(connection, online, args.min_chunk_size)
            proc.process()
        logging.info('INFO: Connection to client closed')
logging.info('INFO: Connection closed, terminating.')
|