Commit b7c5736
Parent(s): 33d0ade

rename to_flush to concatenate_tsw
src/whisper_streaming/online_asr.py
CHANGED
```diff
@@ -170,15 +170,15 @@ class OnlineASRProcessor:
 
         # transform to [(beg,end,"word1"), ...]
         tsw = self.asr.ts_words(res)
-
+        # insert into HypothesisBuffer
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         # Completed words
         self.commited.extend(o)
-        completed = self.to_flush(o)
+        completed = self.concatenate_tsw(o)  # This will be returned at the end of the function
         logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
         ## The rest is incomplete
-        the_rest = self.to_flush(self.transcript_buffer.complete())
+        the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
         logger.debug(f"INCOMPLETE: {the_rest[2]}")
 
         # there is a newly confirmed text
```
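The `[(beg,end,"word1"), ...]` format named in the comment above is a list of per-word timestamp tuples. A minimal illustration of that shape, with invented words and times (not output from a real model run):

```python
# Illustrative only: the [(beg, end, "word"), ...] shape that
# self.asr.ts_words(res) is expected to return; values are invented.
tsw = [
    (0.00, 0.32, "And"),    # (start_seconds, end_seconds, word)
    (0.32, 0.61, "so"),
    (0.61, 1.02, "it"),
    (1.02, 1.48, "begins"),
]
# Times are relative to the current audio buffer; buffer_time_offset is
# added when the words are inserted into the hypothesis buffer.
print(tsw[0][2], tsw[-1][1])  # And 1.48
```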
```diff
@@ -245,7 +245,7 @@ class OnlineASRProcessor:
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-
+
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
```
```diff
@@ -275,6 +275,11 @@ class OnlineASRProcessor:
         """trims the hypothesis and audio buffer at "time" """
         logger.debug(f"chunking at {time:2.2f}s")
 
+        logger.debug(
+            f"len of audio buffer before chunking is: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}s"
+        )
+
+
         self.transcript_buffer.pop_commited(time)
         cut_seconds = time - self.buffer_time_offset
         self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
```
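The slicing below the new debug line converts a stream timestamp into a sample count. A quick worked example at the 16 kHz `SAMPLING_RATE` this file uses, with invented times:

```python
# Worked example of the chunk_at arithmetic above; times are invented.
SAMPLING_RATE = 16000
time = 5.0                # trim the buffers at stream time 5.0 s
buffer_time_offset = 2.0  # the audio buffer currently starts at 2.0 s
cut_seconds = time - buffer_time_offset         # 3.0 s to drop
audio_buffer = [0.0] * (4 * SAMPLING_RATE)      # stand-in for 4 s of audio
audio_buffer = audio_buffer[int(cut_seconds * SAMPLING_RATE):]
print(len(audio_buffer) / SAMPLING_RATE)        # 1.0 s of audio remains
```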
```diff
@@ -316,14 +321,14 @@ class OnlineASRProcessor:
         Returns: the same format as self.process_iter()
         """
         o = self.transcript_buffer.complete()
-        f = self.to_flush(o)
+        f = self.concatenate_tsw(o)
         logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}")
         self.buffer_time_offset += len(self.audio_buffer) / 16000
         return f
 
-    def to_flush(
+    def concatenate_tsw(
         self,
-        sents,
+        tsw,
         sep=None,
         offset=0,
     ):
```
```diff
@@ -332,13 +337,14 @@ class OnlineASRProcessor:
         # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
         if sep is None:
             sep = self.asr.sep
-        t = sep.join(s[2] for s in sents)
-        if len(sents) == 0:
+
+        t = sep.join(s[2] for s in tsw)
+        if len(tsw) == 0:
             b = None
             e = None
         else:
-            b = offset + sents[0][0]
-            e = offset + sents[-1][1]
+            b = offset + tsw[0][0]
+            e = offset + tsw[-1][1]
         return (b, e, t)
 
 
```
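Taken together, the renamed helper just joins timestamped words into a single `(beg, end, text)` tuple. A standalone sketch of the same logic, with a plain `" "` separator standing in for `self.asr.sep`:

```python
# Standalone copy of the concatenate_tsw logic from the diff above;
# sep defaults to " " here instead of self.asr.sep.
def concatenate_tsw(tsw, sep=" ", offset=0):
    t = sep.join(s[2] for s in tsw)
    if len(tsw) == 0:
        b = None
        e = None
    else:
        b = offset + tsw[0][0]
        e = offset + tsw[-1][1]
    return (b, e, t)

words = [(0.0, 0.4, "hello"), (0.5, 1.1, "world")]
print(concatenate_tsw(words))               # (0.0, 1.1, 'hello world')
print(concatenate_tsw(words, offset=10.0))  # (10.0, 11.1, 'hello world')
print(concatenate_tsw([]))                  # (None, None, '')
```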
whisper_fastapi_online_server.py
CHANGED
```diff
@@ -125,13 +125,13 @@ async def websocket_endpoint(websocket: WebSocket):
             transcription = online.process_iter()[2]
             full_transcription += transcription
             if args.vac:
-                buffer = online.online.to_flush(
+                buffer = online.online.concatenate_tsw(
                     online.online.transcript_buffer.buffer
                 )[
                     2
                 ]  # We need to access the underlying online object to get the buffer
             else:
-                buffer = online.to_flush(online.transcript_buffer.buffer)[2]
+                buffer = online.concatenate_tsw(online.transcript_buffer.buffer)[2]
             if (
                 buffer in full_transcription
             ):  # With VAC, the buffer is not updated until the next chunk is processed
```
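As the trailing comments note, with `--vac` enabled the processor is a wrapper, so the hypothesis buffer lives one level deeper. A hedged sketch of the selection logic; the `.online` attribute is taken from the diff, while the helper name is hypothetical:

```python
# Hypothetical helper mirroring the branch above; `online` is assumed to be
# either an OnlineASRProcessor or a VAC wrapper exposing one as `.online`.
def current_buffer_text(online, vac_enabled):
    processor = online.online if vac_enabled else online
    # Preview the still-uncommitted hypothesis text.
    return processor.concatenate_tsw(processor.transcript_buffer.buffer)[2]
```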