qfuxa committed on
Commit
37749d3
·
1 Parent(s): 333d52d

enhance chunking to handle audio buffer time limits

Browse files
whisperlivekit/whisper_streaming_custom/online_asr.py CHANGED
@@ -216,31 +216,54 @@ class OnlineASRProcessor:
216
  """
217
  If the committed tokens form at least two sentences, chunk the audio
218
  buffer at the end time of the penultimate sentence.
 
219
  """
 
220
  if not self.committed:
 
 
 
 
221
  return
 
222
  logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
223
  sentences = self.words_to_sentences(self.committed)
224
  for sentence in sentences:
225
  logger.debug(f"\tSentence: {sentence.text}")
226
- if len(sentences) < 2:
227
- return
228
- # Keep the last two sentences.
229
- while len(sentences) > 2:
230
- sentences.pop(0)
231
- chunk_time = sentences[-2].end
232
- logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
233
- self.chunk_at(chunk_time)
 
 
 
 
 
 
234
 
235
  def chunk_completed_segment(self, res):
236
  """
237
  Chunk the audio buffer based on segment-end timestamps reported by the ASR.
 
238
  """
 
239
  if not self.committed:
 
 
 
 
240
  return
 
 
241
  ends = self.asr.segments_end_ts(res)
242
- last_committed_time = self.committed[-1].end
 
243
  if len(ends) > 1:
 
244
  e = ends[-2] + self.buffer_time_offset
245
  while len(ends) > 2 and e > last_committed_time:
246
  ends.pop(-1)
@@ -248,11 +271,18 @@ class OnlineASRProcessor:
248
  if e <= last_committed_time:
249
  logger.debug(f"--- Segment chunked at {e:.2f}")
250
  self.chunk_at(e)
 
251
  else:
252
  logger.debug("--- Last segment not within committed area")
253
  else:
254
  logger.debug("--- Not enough segments to chunk")
255
-
 
 
 
 
 
 
256
  def chunk_at(self, time: float):
257
  """
258
  Trim both the hypothesis and audio buffer at the given time.
 
216
  """
217
  If the committed tokens form at least two sentences, chunk the audio
218
  buffer at the end time of the penultimate sentence.
219
+ Also ensures chunking happens if audio buffer exceeds a time limit.
220
  """
221
+ buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
222
  if not self.committed:
223
+ if buffer_duration > self.buffer_trimming_sec:
224
+ chunk_time = self.buffer_time_offset + (buffer_duration / 2)
225
+ logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
226
+ self.chunk_at(chunk_time)
227
  return
228
+
229
  logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
230
  sentences = self.words_to_sentences(self.committed)
231
  for sentence in sentences:
232
  logger.debug(f"\tSentence: {sentence.text}")
233
+
234
+ chunk_done = False
235
+ if len(sentences) >= 2:
236
+ while len(sentences) > 2:
237
+ sentences.pop(0)
238
+ chunk_time = sentences[-2].end
239
+ logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
240
+ self.chunk_at(chunk_time)
241
+ chunk_done = True
242
+
243
+ if not chunk_done and buffer_duration > self.buffer_trimming_sec:
244
+ last_committed_time = self.committed[-1].end
245
+ logger.debug(f"--- Not enough sentences, chunking at last committed time {last_committed_time:.2f}")
246
+ self.chunk_at(last_committed_time)
247
 
248
  def chunk_completed_segment(self, res):
249
  """
250
  Chunk the audio buffer based on segment-end timestamps reported by the ASR.
251
+ Also ensures chunking happens if audio buffer exceeds a time limit.
252
  """
253
+ buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
254
  if not self.committed:
255
+ if buffer_duration > self.buffer_trimming_sec:
256
+ chunk_time = self.buffer_time_offset + (buffer_duration / 2)
257
+ logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
258
+ self.chunk_at(chunk_time)
259
  return
260
+
261
+ logger.debug("Processing committed tokens for segmenting")
262
  ends = self.asr.segments_end_ts(res)
263
+ last_committed_time = self.committed[-1].end
264
+ chunk_done = False
265
  if len(ends) > 1:
266
+ logger.debug("Multiple segments available for chunking")
267
  e = ends[-2] + self.buffer_time_offset
268
  while len(ends) > 2 and e > last_committed_time:
269
  ends.pop(-1)
 
271
  if e <= last_committed_time:
272
  logger.debug(f"--- Segment chunked at {e:.2f}")
273
  self.chunk_at(e)
274
+ chunk_done = True
275
  else:
276
  logger.debug("--- Last segment not within committed area")
277
  else:
278
  logger.debug("--- Not enough segments to chunk")
279
+
280
+ if not chunk_done and buffer_duration > self.buffer_trimming_sec:
281
+ logger.debug(f"--- Buffer too large, chunking at last committed time {last_committed_time:.2f}")
282
+ self.chunk_at(last_committed_time)
283
+
284
+ logger.debug("Segment chunking complete")
285
+
286
  def chunk_at(self, time: float):
287
  """
288
  Trim both the hypothesis and audio buffer at the given time.