lag information in real time even when no audio is detected
whisperlivekit/audio_processor.py
CHANGED
@@ -292,6 +292,7 @@ class AudioProcessor:
         """Process audio chunks for transcription."""
         self.full_transcription = ""
         self.sep = self.online.asr.sep
+        cumulative_pcm_duration_stream_time = 0.0

         while True:
             try:
@@ -315,25 +316,38 @@ class AudioProcessor:
             )

             # Process transcription
-            self.online.insert_audio_chunk(pcm_array)
-            new_tokens = self.online.process_iter()
+            duration_this_chunk = len(pcm_array) / self.sample_rate if isinstance(pcm_array, np.ndarray) else 0
+            cumulative_pcm_duration_stream_time += duration_this_chunk
+            stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
+
+            self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
+            new_tokens, current_audio_processed_upto = self.online.process_iter()

             if new_tokens:
                 self.full_transcription += self.sep.join([t.text for t in new_tokens])

             # Get buffer information
-            …
+            _buffer_transcript_obj = self.online.get_buffer()
+            buffer_text = _buffer_transcript_obj.text
+
+            candidate_end_times = [self.end_buffer]
+
+            if new_tokens:
+                candidate_end_times.append(new_tokens[-1].end)
+
+            if _buffer_transcript_obj.end is not None:
+                candidate_end_times.append(_buffer_transcript_obj.end)
+
+            candidate_end_times.append(current_audio_processed_upto)
+
+            new_end_buffer = max(candidate_end_times)

             # Avoid duplicating content
-            if …
-                …
+            if buffer_text in self.full_transcription:
+                buffer_text = ""

             await self.update_transcription(
-                new_tokens, …
+                new_tokens, buffer_text, new_end_buffer, self.full_transcription, self.sep
             )
             self.transcription_queue.task_done()
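This bookkeeping is what makes the commit title work: the processor accumulates the duration of every PCM chunk into a stream clock, and the end-buffer value is the maximum of all known end times, including the new current_audio_processed_upto, so it advances even when silence yields no tokens. A minimal sketch of that logic outside the class, assuming a 16 kHz stream and using plain floats in place of the real token and transcript objects:

import numpy as np

sample_rate = 16000        # assumed; the class reads self.sample_rate
stream_time = 0.0          # mirrors cumulative_pcm_duration_stream_time
end_buffer = 0.0           # mirrors self.end_buffer

def on_chunk(pcm_array, new_token_end=None, buffer_end=None, processed_upto=0.0):
    """Advance the stream clock and recompute the end buffer as in the diff."""
    global stream_time, end_buffer
    stream_time += len(pcm_array) / sample_rate if isinstance(pcm_array, np.ndarray) else 0
    candidates = [end_buffer, processed_upto]
    if new_token_end is not None:   # stands in for new_tokens[-1].end
        candidates.append(new_token_end)
    if buffer_end is not None:      # stands in for _buffer_transcript_obj.end
        candidates.append(buffer_end)
    end_buffer = max(candidates)
    return stream_time - end_buffer  # the lag that can now be reported in real time

# One second of pure silence: no tokens, but the ASR reports it has processed
# up to 1.0 s, so the lag stays at zero instead of growing without bound.
print(f"lag: {on_chunk(np.zeros(sample_rate, dtype=np.float32), processed_upto=1.0):.2f}s")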
whisperlivekit/whisper_streaming_custom/online_asr.py
CHANGED
@@ -144,7 +144,11 @@ class OnlineASRProcessor:
         self.transcript_buffer.last_committed_time = self.buffer_time_offset
         self.committed: List[ASRToken] = []

-    def insert_audio_chunk(self, audio: np.ndarray):
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the current audio_buffer."""
+        return self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
         """Append an audio chunk (a numpy array) to the current audio buffer."""
         self.audio_buffer = np.append(self.audio_buffer, audio)

@@ -179,18 +183,19 @@ class OnlineASRProcessor:
         return self.concatenate_tokens(self.transcript_buffer.buffer)


-    def process_iter(self) -> List[ASRToken]:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Processes the current audio buffer.

-        Returns a list of committed ASRToken objects.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
+        current_audio_processed_upto = self.get_audio_buffer_end_time()
         prompt_text, _ = self.prompt()
         logger.debug(
             f"Transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds from {self.buffer_time_offset:.2f}"
         )
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt_text)
         tokens = self.asr.ts_words(res)
         self.transcript_buffer.insert(tokens, self.buffer_time_offset)
         committed_tokens = self.transcript_buffer.flush()
         self.committed.extend(committed_tokens)
@@ -210,7 +215,7 @@ class OnlineASRProcessor:
         logger.debug(
             f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds"
         )
-        return committed_tokens
+        return committed_tokens, current_audio_processed_upto

     def chunk_completed_sentence(self):
         """
@@ -344,14 +349,16 @@ class OnlineASRProcessor:
             sentences.append(sentence)
         return sentences

-    def finish(self) -> List[ASRToken]:
+    def finish(self) -> Tuple[List[ASRToken], float]:
         """
         Flush the remaining transcript when processing ends.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
         """
         remaining_tokens = self.transcript_buffer.buffer
         logger.debug(f"Final non-committed tokens: {remaining_tokens}")
-        self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
-        return remaining_tokens
+        final_processed_upto = self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+        self.buffer_time_offset = final_processed_upto
+        return remaining_tokens, final_processed_upto

     def concatenate_tokens(
         self,
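Net effect of the four hunks above: process_iter() and finish() now return a (tokens, processed_upto) pair instead of a bare token list, where the timestamp is buffer_time_offset plus the length of the buffered audio. A worked example of that arithmetic with assumed numbers (not taken from the project):

import numpy as np

SAMPLING_RATE = 16000
buffer_time_offset = 4.0  # seconds already trimmed from the front of the buffer
audio_buffer = np.zeros(int(2.5 * SAMPLING_RATE), dtype=np.float32)  # 2.5 s still buffered

# Same sum as get_audio_buffer_end_time(): the absolute stream time at the
# end of the current buffer, i.e. how far the processor has seen.
end_time = buffer_time_offset + (len(audio_buffer) / SAMPLING_RATE)
assert end_time == 6.5

# finish() computes the same value, advances buffer_time_offset to it, and
# returns it alongside the remaining tokens, so the reported time is monotonic.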
@@ -393,28 +400,35 @@ class VACOnlineASRProcessor:

         self.vac = FixedVADIterator(model)
         self.logfile = self.online.logfile
+        self.last_input_audio_stream_end_time: float = 0.0
         self.init()

     def init(self):
         self.online.init()
         self.vac.reset_states()
         self.current_online_chunk_buffer_size = 0
+        self.last_input_audio_stream_end_time = self.online.buffer_time_offset
         self.is_currently_final = False
         self.status: Optional[str] = None  # "voice" or "nonvoice"
         self.audio_buffer = np.array([], dtype=np.float32)
         self.buffer_offset = 0  # in frames

+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the audio processed by the underlying OnlineASRProcessor."""
+        return self.online.get_audio_buffer_end_time()
+
     def clear_buffer(self):
         self.buffer_offset += len(self.audio_buffer)
         self.audio_buffer = np.array([], dtype=np.float32)

-    def insert_audio_chunk(self, audio: np.ndarray):
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: float):
         """
         Process an incoming small audio chunk:
           - run VAD on the chunk,
           - decide whether to send the audio to the online ASR processor immediately,
           - and/or to mark the current utterance as finished.
         """
+        self.last_input_audio_stream_end_time = audio_stream_end_time
         res = self.vac(audio)
         self.audio_buffer = np.append(self.audio_buffer, audio)

@@ -456,10 +470,11 @@ class VACOnlineASRProcessor:
         self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
         self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]

-    def process_iter(self) -> List[ASRToken]:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Depending on the VAD status and the amount of accumulated audio,
         process the current audio chunk.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
         if self.is_currently_final:
             return self.finish()
@@ -468,14 +483,17 @@ class VACOnlineASRProcessor:
             return self.online.process_iter()
         else:
             logger.debug("No online update, only VAD")
-            return []
+            return [], self.last_input_audio_stream_end_time

-    def finish(self) -> List[ASRToken]:
-        """
-        …
+    def finish(self) -> Tuple[List[ASRToken], float]:
+        """
+        Finish processing by flushing any remaining text.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
+        """
+        result_tokens, processed_upto = self.online.finish()
         self.current_online_chunk_buffer_size = 0
         self.is_currently_final = False
-        return …
+        return result_tokens, processed_upto

     def get_buffer(self):
         """