SilasKieser committed on
Commit
b7c5736
·
1 Parent(s): 33d0ade

rename to_flush to concatenate_tsw

Browse files
src/whisper_streaming/online_asr.py CHANGED
@@ -170,15 +170,15 @@ class OnlineASRProcessor:
170
 
171
  # transform to [(beg,end,"word1"), ...]
172
  tsw = self.asr.ts_words(res)
173
-
174
  self.transcript_buffer.insert(tsw, self.buffer_time_offset)
175
  o = self.transcript_buffer.flush()
176
  # Completed words
177
  self.commited.extend(o)
178
- completed = self.to_flush(o)
179
  logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
180
  ## The rest is incomplete
181
- the_rest = self.to_flush(self.transcript_buffer.complete())
182
  logger.debug(f"INCOMPLETE: {the_rest[2]}")
183
 
184
  # there is a newly confirmed text
@@ -245,7 +245,7 @@ class OnlineASRProcessor:
245
  # we will continue with audio processing at this timestamp
246
  chunk_at = sents[-2][1]
247
 
248
- logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}")
249
  self.chunk_at(chunk_at)
250
 
251
  def chunk_completed_segment(self, res):
@@ -275,6 +275,11 @@ class OnlineASRProcessor:
275
  """trims the hypothesis and audio buffer at "time" """
276
  logger.debug(f"chunking at {time:2.2f}s")
277
 
 
 
 
 
 
278
  self.transcript_buffer.pop_commited(time)
279
  cut_seconds = time - self.buffer_time_offset
280
  self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
@@ -316,14 +321,14 @@ class OnlineASRProcessor:
316
  Returns: the same format as self.process_iter()
317
  """
318
  o = self.transcript_buffer.complete()
319
- f = self.to_flush(o)
320
  logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}")
321
  self.buffer_time_offset += len(self.audio_buffer) / 16000
322
  return f
323
 
324
- def to_flush(
325
  self,
326
- sents,
327
  sep=None,
328
  offset=0,
329
  ):
@@ -332,13 +337,14 @@ class OnlineASRProcessor:
332
  # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
333
  if sep is None:
334
  sep = self.asr.sep
335
- t = sep.join(s[2] for s in sents)
336
- if len(sents) == 0:
 
337
  b = None
338
  e = None
339
  else:
340
- b = offset + sents[0][0]
341
- e = offset + sents[-1][1]
342
  return (b, e, t)
343
 
344
 
 
170
 
171
  # transform to [(beg,end,"word1"), ...]
172
  tsw = self.asr.ts_words(res)
173
+ # insert into HypothesisBuffer
174
  self.transcript_buffer.insert(tsw, self.buffer_time_offset)
175
  o = self.transcript_buffer.flush()
176
  # Completed words
177
  self.commited.extend(o)
178
+ completed = self.concatenate_tsw(o) # This will be returned at the end of the function
179
  logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
180
  ## The rest is incomplete
181
+ the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
182
  logger.debug(f"INCOMPLETE: {the_rest[2]}")
183
 
184
  # there is a newly confirmed text
 
245
  # we will continue with audio processing at this timestamp
246
  chunk_at = sents[-2][1]
247
 
248
+
249
  self.chunk_at(chunk_at)
250
 
251
  def chunk_completed_segment(self, res):
 
275
  """trims the hypothesis and audio buffer at "time" """
276
  logger.debug(f"chunking at {time:2.2f}s")
277
 
278
+ logger.debug(
279
+ f"len of audio buffer before chunking is: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}s"
280
+ )
281
+
282
+
283
  self.transcript_buffer.pop_commited(time)
284
  cut_seconds = time - self.buffer_time_offset
285
  self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
 
321
  Returns: the same format as self.process_iter()
322
  """
323
  o = self.transcript_buffer.complete()
324
+ f = self.concatenate_tsw(o)
325
  logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}")
326
  self.buffer_time_offset += len(self.audio_buffer) / 16000
327
  return f
328
 
329
+ def concatenate_tsw(
330
  self,
331
+ tsw,
332
  sep=None,
333
  offset=0,
334
  ):
 
337
  # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
338
  if sep is None:
339
  sep = self.asr.sep
340
+
341
+ t = sep.join(s[2] for s in tsw)
342
+ if len(tsw) == 0:
343
  b = None
344
  e = None
345
  else:
346
+ b = offset + tsw[0][0]
347
+ e = offset + tsw[-1][1]
348
  return (b, e, t)
349
 
350
 
whisper_fastapi_online_server.py CHANGED
@@ -125,13 +125,13 @@ async def websocket_endpoint(websocket: WebSocket):
125
  transcription = online.process_iter()[2]
126
  full_transcription += transcription
127
  if args.vac:
128
- buffer = online.online.to_flush(
129
  online.online.transcript_buffer.buffer
130
  )[
131
  2
132
  ] # We need to access the underlying online object to get the buffer
133
  else:
134
- buffer = online.to_flush(online.transcript_buffer.buffer)[2]
135
  if (
136
  buffer in full_transcription
137
  ): # With VAC, the buffer is not updated until the next chunk is processed
 
125
  transcription = online.process_iter()[2]
126
  full_transcription += transcription
127
  if args.vac:
128
+ buffer = online.online.concatenate_tsw(
129
  online.online.transcript_buffer.buffer
130
  )[
131
  2
132
  ] # We need to access the underlying online object to get the buffer
133
  else:
134
+ buffer = online.concatenate_tsw(online.transcript_buffer.buffer)[2]
135
  if (
136
  buffer in full_transcription
137
  ): # With VAC, the buffer is not updated until the next chunk is processed