from whisper_online import *

import sys
import argparse
import os
import time
import numpy as np
import librosa

parser = argparse.ArgumentParser()

# server options
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=43007)
parser.add_argument('--vac', action="store_true", default=False, help='Use VAC (voice activity controller).')
parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')

# options shared with whisper_online
add_shared_args(parser)
args = parser.parse_args()


SAMPLING_RATE = 16000
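
# The audio sent over the TCP connection is expected to be raw mono 16 kHz
# 16-bit little-endian PCM (see ServerProcessor.receive_audio_chunk below).
# For example, a Linux client could stream a microphone with ALSA's arecord
# piped into netcat (illustrative command, not part of this script):
#   arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc <host> <port>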

size = args.model
language = args.lan

t = time.time()
print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)

if args.backend == "faster-whisper":
    from faster_whisper import WhisperModel  # not used directly here; importing it checks that the backend package is installed
    asr_cls = FasterWhisperASR
elif args.backend == "whisper_timestamped":
    import whisper
    from whisper_online import WhisperTimestampedASR
    asr_cls = WhisperTimestampedASR
else:
    raise ValueError(f"Unknown {args.backend=}")

asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)

if args.task == "translate":
    asr.set_translate_task()
    tgt_language = "en"  # Whisper translates into English
else:
    tgt_language = language  # Whisper transcribes in the source language

print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)

if args.vad:
    print("setting VAD filter",file=sys.stderr)
    asr.use_vad()

if args.buffer_trimming == "sentence":
    tokenizer = create_tokenizer(tgt_language)
else:
    tokenizer = None

if not args.vac:
    from whisper_online import OnlineASRProcessor
    online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
else:
    from whisper_online_vac import VACOnlineASRProcessor
    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
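
# Both processor variants are driven through the same interface below:
# init(), insert_audio_chunk(audio) and process_iter().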

# Warm up the ASR: the very first transcribe() call is much slower than the
# later ones, so run it once on a short demo chunk before serving clients.
demo_audio_path = "cs-maji-2.16k.wav"
if os.path.exists(demo_audio_path):
    # transcribe the first second of the demo audio
    a = load_audio_chunk(demo_audio_path,0,1)
    asr.transcribe(a)
else:
    print("Whisper is not warmed up. The first chunk processing may take longer.",file=sys.stderr)


######### Server objects

import line_packet
import socket

import logging


class Connection:
    '''Wraps one client socket connection.'''
    PACKET_SIZE = 32000*5*60  # about 5 minutes of 16 kHz 16-bit mono PCM (32000 bytes per second)

    def __init__(self, conn):
        self.conn = conn
        self.last_line = ""

        self.conn.setblocking(True)

    def send(self, line):
        '''Sends a line, but never the same line twice in a row (re-sending duplicates was problematic in online-text-flow-events).'''
        if line == self.last_line:
            return
        line_packet.send_one_line(self.conn, line)
        self.last_line = line

    def receive_lines(self):
        in_line = line_packet.receive_lines(self.conn)
        return in_line

    def non_blocking_receive_audio(self):
        # the socket is set to blocking mode in __init__, so recv() waits until
        # data arrives, or returns b"" once the client closes the connection
        try:
            r = self.conn.recv(self.PACKET_SIZE)
            return r
        except ConnectionResetError:
            return None


import io
import soundfile


class ServerProcessor:

    def __init__(self, c, online_asr_proc, min_chunk):
        self.connection = c
        self.online_asr_proc = online_asr_proc
        self.min_chunk = min_chunk

        self.last_end = None

        self.is_first = True

    def receive_audio_chunk(self):
        # receive all audio that is available by this time
        # blocks operation if less than self.min_chunk seconds is available
        # unblocks if connection is closed or a chunk is available
        out = []
        minlimit = self.min_chunk*SAMPLING_RATE
        while sum(len(x) for x in out) < minlimit:
            raw_bytes = self.connection.non_blocking_receive_audio()
            if not raw_bytes:
                break
            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10],file=sys.stderr)
            # decode the raw 16-bit little-endian PCM into a float32 waveform
            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
            audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
            out.append(audio)
        if not out:
            return None
        conc = np.concatenate(out)
        if self.is_first and len(conc) < minlimit:
            return None
        self.is_first = False
        return conc
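
    # For pre-recorded audio, a client could first convert a file to the raw PCM
    # format expected here, e.g. with ffmpeg, and pipe it in (illustrative
    # command, not part of this script):
    #   ffmpeg -i input.wav -f s16le -ac 1 -ar 16000 - | nc <host> <port>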

    def format_output_transcript(self,o):
        # output format, both on the connection and in the stderr log, is:
        # "beg end text", e.g. "0 1720 Takhle to je"
        # - beg and end are timestamps of the text segment in milliseconds, as
        #   estimated by the Whisper model (they are not exact, but useful anyway)
        # - the rest of the line is the segment transcript
        if o[0] is not None:
            beg, end = o[0]*1000,o[1]*1000
            if self.last_end is not None:
                beg = max(beg, self.last_end)

            self.last_end = end
            print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
            return "%1.0f %1.0f %s" % (beg,end,o[2])
        else:
            print(o,file=sys.stderr,flush=True)
            return None
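
    # A client receiving these lines can split them back into fields, e.g.
    # (illustrative sketch, not used by this server):
    #   beg_ms, end_ms, text = line.split(" ", 2)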

    def send_result(self, o):
        msg = self.format_output_transcript(o)
        if msg is not None:
            self.connection.send(msg)

    def process(self):
        # handle one client connection
        self.online_asr_proc.init()
        while True:
            a = self.receive_audio_chunk()
            if a is None:
                print("no more audio from the client, closing this connection",file=sys.stderr)
                break
            self.online_asr_proc.insert_audio_chunk(a)
            o = self.online_asr_proc.process_iter()
            try:
                self.send_result(o)
            except BrokenPipeError:
                print("broken pipe -- connection closed?",file=sys.stderr)
                break



level = logging.INFO
logging.basicConfig(level=level, format='whisper-server-%(levelname)s: %(message)s')

# server loop: accept one client at a time and serve it until it disconnects
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind((args.host, args.port))
    s.listen(1)
    logging.info('Listening on '+str((args.host, args.port)))
    while True:
        conn, addr = s.accept()
        logging.info('Connected to client on {}'.format(addr))
        connection = Connection(conn)
        proc = ServerProcessor(connection, online, args.min_chunk_size)
        proc.process()
        conn.close()
        logging.info('Connection to client closed')

logging.info('Connection closed, terminating.')
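
# Example invocation (illustrative; assumes this script is saved as
# whisper_online_server.py and that add_shared_args provides flags such as
# --model and --lan, as in whisper_online):
#   python3 whisper_online_server.py --host localhost --port 43007 --model large-v2 --lan en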