qfuxa committed
Commit 78d7306 · 1 Parent(s): 9e3b2a9

Update lag information in real time, even when no audio is detected

whisperlivekit/audio_processor.py CHANGED
@@ -292,6 +292,7 @@ class AudioProcessor:
         """Process audio chunks for transcription."""
         self.full_transcription = ""
         self.sep = self.online.asr.sep
+        cumulative_pcm_duration_stream_time = 0.0
 
         while True:
             try:
@@ -315,25 +316,38 @@
                 )
 
                 # Process transcription
-                self.online.insert_audio_chunk(pcm_array)
-                new_tokens = self.online.process_iter()
+                duration_this_chunk = len(pcm_array) / self.sample_rate if isinstance(pcm_array, np.ndarray) else 0
+                cumulative_pcm_duration_stream_time += duration_this_chunk
+                stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
+
+                self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
+                new_tokens, current_audio_processed_upto = self.online.process_iter()
 
                 if new_tokens:
                     self.full_transcription += self.sep.join([t.text for t in new_tokens])
 
                 # Get buffer information
-                _buffer = self.online.get_buffer()
-                buffer = _buffer.text
-                end_buffer = _buffer.end if _buffer.end else (
-                    new_tokens[-1].end if new_tokens else 0
-                )
+                _buffer_transcript_obj = self.online.get_buffer()
+                buffer_text = _buffer_transcript_obj.text
+
+                candidate_end_times = [self.end_buffer]
+
+                if new_tokens:
+                    candidate_end_times.append(new_tokens[-1].end)
+
+                if _buffer_transcript_obj.end is not None:
+                    candidate_end_times.append(_buffer_transcript_obj.end)
+
+                candidate_end_times.append(current_audio_processed_upto)
+
+                new_end_buffer = max(candidate_end_times)
 
                 # Avoid duplicating content
-                if buffer in self.full_transcription:
-                    buffer = ""
+                if buffer_text in self.full_transcription:
+                    buffer_text = ""
 
                 await self.update_transcription(
-                    new_tokens, buffer, end_buffer, self.full_transcription, self.sep
+                    new_tokens, buffer_text, new_end_buffer, self.full_transcription, self.sep
                 )
                 self.transcription_queue.task_done()
 
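Note: the effect of this hunk is easiest to see in isolation. Below is a minimal, self-contained sketch of the end-time selection (the `pick_end_buffer` helper is invented for illustration and is not part of WhisperLiveKit): the reported end position can only move forward, and it advances with the audio clock even when the ASR commits no tokens.

```python
from typing import List, Optional

def pick_end_buffer(prev_end_buffer: float,
                    token_ends: List[float],
                    buffer_end: Optional[float],
                    audio_processed_upto: float) -> float:
    """Mirror of the candidate_end_times logic above: take the max of all
    known end times, so the reported position is monotonic and keeps moving
    with the audio clock even when no tokens are committed."""
    candidates = [prev_end_buffer, audio_processed_upto]
    if token_ends:
        candidates.append(token_ends[-1])
    if buffer_end is not None:
        candidates.append(buffer_end)
    return max(candidates)

# Silence: no tokens and no buffer end, but the audio clock reached 12.5 s,
# so the front end can still compute an up-to-date lag.
assert pick_end_buffer(10.0, [], None, 12.5) == 12.5
# Speech: the last committed token ends after the audio clock estimate.
assert pick_end_buffer(10.0, [13.1], 12.9, 12.5) == 13.1
```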
whisperlivekit/whisper_streaming_custom/online_asr.py CHANGED
@@ -144,7 +144,11 @@ class OnlineASRProcessor:
         self.transcript_buffer.last_committed_time = self.buffer_time_offset
         self.committed: List[ASRToken] = []
 
-    def insert_audio_chunk(self, audio: np.ndarray):
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the current audio_buffer."""
+        return self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
         """Append an audio chunk (a numpy array) to the current audio buffer."""
         self.audio_buffer = np.append(self.audio_buffer, audio)
 
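Note: the arithmetic behind `get_audio_buffer_end_time` is simple but central to this commit: `buffer_time_offset` is where the retained audio buffer starts on the absolute stream clock, so the buffer's end is that offset plus the buffer's duration. A quick sketch with invented values:

```python
import numpy as np

SAMPLING_RATE = 16000  # Whisper's usual input rate, assumed here

buffer_time_offset = 42.0                        # buffer starts at 42.0 s of stream time
audio_buffer = np.zeros(8000, dtype=np.float32)  # 0.5 s of retained audio

end_time = buffer_time_offset + len(audio_buffer) / SAMPLING_RATE
assert end_time == 42.5  # absolute stream time the buffer runs up to
```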
@@ -179,18 +183,19 @@
         return self.concatenate_tokens(self.transcript_buffer.buffer)
 
 
-    def process_iter(self) -> Transcript:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Processes the current audio buffer.
 
-        Returns a Transcript object representing the committed transcript.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
+        current_audio_processed_upto = self.get_audio_buffer_end_time()
         prompt_text, _ = self.prompt()
         logger.debug(
             f"Transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds from {self.buffer_time_offset:.2f}"
         )
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt_text)
-        tokens = self.asr.ts_words(res)  # Expecting List[ASRToken]
+        tokens = self.asr.ts_words(res)
         self.transcript_buffer.insert(tokens, self.buffer_time_offset)
         committed_tokens = self.transcript_buffer.flush()
         self.committed.extend(committed_tokens)
@@ -210,7 +215,7 @@
         logger.debug(
             f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds"
         )
-        return committed_tokens
+        return committed_tokens, current_audio_processed_upto
 
     def chunk_completed_sentence(self):
         """
@@ -344,14 +349,16 @@
         sentences.append(sentence)
         return sentences
 
-    def finish(self) -> List[ASRToken]:
+    def finish(self) -> Tuple[List[ASRToken], float]:
         """
         Flush the remaining transcript when processing ends.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
         """
         remaining_tokens = self.transcript_buffer.buffer
         logger.debug(f"Final non-committed tokens: {remaining_tokens}")
-        self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
-        return remaining_tokens
+        final_processed_upto = self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+        self.buffer_time_offset = final_processed_upto
+        return remaining_tokens, final_processed_upto
 
     def concatenate_tokens(
         self,
@@ -393,28 +400,35 @@ class VACOnlineASRProcessor:
 
         self.vac = FixedVADIterator(model)
         self.logfile = self.online.logfile
+        self.last_input_audio_stream_end_time: float = 0.0
         self.init()
 
     def init(self):
         self.online.init()
         self.vac.reset_states()
         self.current_online_chunk_buffer_size = 0
+        self.last_input_audio_stream_end_time = self.online.buffer_time_offset
         self.is_currently_final = False
         self.status: Optional[str] = None  # "voice" or "nonvoice"
         self.audio_buffer = np.array([], dtype=np.float32)
         self.buffer_offset = 0  # in frames
 
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the audio processed by the underlying OnlineASRProcessor."""
+        return self.online.get_audio_buffer_end_time()
+
     def clear_buffer(self):
         self.buffer_offset += len(self.audio_buffer)
         self.audio_buffer = np.array([], dtype=np.float32)
 
-    def insert_audio_chunk(self, audio: np.ndarray):
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: float):
         """
         Process an incoming small audio chunk:
         - run VAD on the chunk,
         - decide whether to send the audio to the online ASR processor immediately,
         - and/or to mark the current utterance as finished.
         """
+        self.last_input_audio_stream_end_time = audio_stream_end_time
         res = self.vac(audio)
         self.audio_buffer = np.append(self.audio_buffer, audio)
 
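Note: the new `audio_stream_end_time` parameter means callers must maintain a stream clock. In audio_processor.py above it is derived purely from the PCM pushed so far; a minimal sketch of such a feed loop (the `feed` helper is invented for illustration):

```python
SAMPLE_RATE = 16000  # assumed input rate

def feed(processor, chunks):
    """Hypothetical feed loop: accumulate a stream clock from the PCM itself
    and tag every chunk with the absolute time at which it ends, as the new
    insert_audio_chunk signature requires."""
    cumulative_stream_time = 0.0
    for pcm in chunks:
        cumulative_stream_time += len(pcm) / SAMPLE_RATE
        processor.insert_audio_chunk(pcm, cumulative_stream_time)
```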
@@ -456,10 +470,11 @@
         self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
         self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
 
-    def process_iter(self) -> List[ASRToken]:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Depending on the VAD status and the amount of accumulated audio,
         process the current audio chunk.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
         if self.is_currently_final:
             return self.finish()
@@ -468,14 +483,17 @@
             return self.online.process_iter()
         else:
             logger.debug("No online update, only VAD")
-            return []
+            return [], self.last_input_audio_stream_end_time
 
-    def finish(self) -> List[ASRToken]:
-        """Finish processing by flushing any remaining text."""
-        result = self.online.finish()
+    def finish(self) -> Tuple[List[ASRToken], float]:
+        """
+        Finish processing by flushing any remaining text.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
+        """
+        result_tokens, processed_upto = self.online.finish()
         self.current_online_chunk_buffer_size = 0
         self.is_currently_final = False
+        return result_tokens, processed_upto
 
     def get_buffer(self):
         """
 