File size: 5,872 Bytes
7222c68 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import json
import logging
import threading
import time
from openvino import Core
from whisper_live.backend.base import ServeClientBase
from whisper_live.transcriber.transcriber_openvino import WhisperOpenVINO
class ServeClientOpenVINO(ServeClientBase):
    """Per-websocket transcription session backed by an OpenVINO Whisper model."""

    # Shared transcriber used by all clients when single-model mode is enabled.
    SINGLE_MODEL = None
    # Guards both lazy creation of SINGLE_MODEL and inference on it.
    SINGLE_MODEL_LOCK = threading.Lock()

    def __init__(
        self,
        websocket,
        task="transcribe",
        device=None,
        language=None,
        client_uid=None,
        model="small.en",
        initial_prompt=None,
        vad_parameters=None,
        use_vad=True,
        single_model=False,
        send_last_n_segments=10,
        no_speech_thresh=0.45,
        clip_audio=False,
        same_output_threshold=10,
    ):
        """
        Initialize a ServeClient instance.

        The Whisper model is initialized based on the client's language and device availability.
        The transcription thread is started upon initialization. A "SERVER_READY" message is sent
        to the client to indicate that the server is ready.

        Args:
            websocket (WebSocket): The WebSocket connection for the client.
            task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe".
            device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None.
            language (str, optional): The language for transcription. Defaults to None.
            client_uid (str, optional): A unique identifier for the client. Defaults to None.
            model (str, optional): Huggingface model_id for a valid OpenVINO model.
            initial_prompt (str, optional): Prompt for whisper inference. Defaults to None.
            single_model (bool, optional): Whether to share one model across all client
                connections instead of instantiating a new one per client. Defaults to False.
            send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10.
            no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45.
            clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False.
                NOTE: currently overridden to True below (see comment).
            same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10.
        """
        super().__init__(
            client_uid,
            websocket,
            send_last_n_segments,
            no_speech_thresh,
            clip_audio,
            same_output_threshold,
        )
        # Whisper expects the language as a "<|xx|>" token; wrap bare codes.
        self.language = "en" if language is None else language
        if not self.language.startswith("<|"):
            self.language = f"<|{self.language}|>"
        self.task = "transcribe" if task is None else task
        # NOTE(review): this unconditionally overrides the clip_audio argument
        # passed to super().__init__ above. Kept as-is because the OpenVINO
        # backend appears to rely on clipping — confirm before honoring the
        # parameter instead.
        self.clip_audio = True

        # Prefer a GPU device if OpenVINO exposes one: either the plain 'GPU'
        # alias or an enumerated device such as 'GPU.0'; otherwise use the CPU.
        core = Core()
        available_devices = core.available_devices
        if 'GPU' in available_devices:
            selected_device = 'GPU'
        else:
            gpu_devices = [d for d in available_devices if d.startswith('GPU')]
            selected_device = gpu_devices[0] if gpu_devices else 'CPU'
        self.device = selected_device

        if single_model:
            # Hold the lock across the check-and-create so two clients
            # connecting concurrently cannot both instantiate the shared model.
            with ServeClientOpenVINO.SINGLE_MODEL_LOCK:
                if ServeClientOpenVINO.SINGLE_MODEL is None:
                    self.create_model(model)
                    ServeClientOpenVINO.SINGLE_MODEL = self.transcriber
                else:
                    self.transcriber = ServeClientOpenVINO.SINGLE_MODEL
        else:
            self.create_model(model)

        # Start the background transcription loop, then signal readiness.
        self.trans_thread = threading.Thread(target=self.speech_to_text)
        self.trans_thread.start()
        self.websocket.send(json.dumps({
            "uid": self.client_uid,
            "message": self.SERVER_READY,
            "backend": "openvino"
        }))
        logging.info(f"Using OpenVINO device: {self.device}")
        logging.info(f"Running OpenVINO backend with language: {self.language} and task: {self.task}")

    def create_model(self, model_id):
        """
        Instantiate a new WhisperOpenVINO model for this client's device,
        language and task, and set it as the transcriber.

        Args:
            model_id (str): Huggingface model_id of a valid OpenVINO model.
        """
        self.transcriber = WhisperOpenVINO(
            model_id,
            device=self.device,
            language=self.language,
            task=self.task
        )

    def transcribe_audio(self, input_sample):
        """
        Transcribes the provided audio sample using the configured transcriber instance.

        Args:
            input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy
                                     array representing the audio data.

        Returns:
            The transcription result from the transcriber. The exact format of this result
            depends on the implementation of the `transcriber.transcribe` method but typically
            includes the transcribed text.
        """
        # Snapshot the shared-model flag ONCE so acquire/release always pair up.
        # The original re-read SINGLE_MODEL after transcription; if the shared
        # model was assigned concurrently mid-call, release() ran on a lock
        # that was never acquired, raising RuntimeError. try/finally also
        # guarantees the lock is released if transcribe() raises.
        use_lock = ServeClientOpenVINO.SINGLE_MODEL is not None
        if use_lock:
            ServeClientOpenVINO.SINGLE_MODEL_LOCK.acquire()
        try:
            return self.transcriber.transcribe(input_sample)
        finally:
            if use_lock:
                ServeClientOpenVINO.SINGLE_MODEL_LOCK.release()

    def handle_transcription_output(self, result, duration):
        """
        Handle the transcription output, updating the transcript and sending data to the client.

        Args:
            result (str): The result from whisper inference i.e. the list of segments.
            duration (float): Duration of the transcribed audio chunk.
        """
        segments = []
        if len(result):
            # Reset the silence-start marker; we have fresh speech.
            self.t_start = None
            last_segment = self.update_segments(result, duration)
            segments = self.prepare_segments(last_segment)
        if len(segments):
            self.send_transcription_to_client(segments)
|