Commit a4ee6eb
Parent(s): 02f90cf

sentence work again!

src/whisper_streaming/online_asr.py  (+125 -77)  CHANGED
@@ -17,8 +17,10 @@ class HypothesisBuffer:
         self.logfile = logfile
 
     def insert(self, new, offset):
-
-
+        """
+        compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
+        The new tail is added to self.new
+        """
 
         new = [(a + offset, b + offset, t) for a, b, t in new]
         self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1]
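The docstring added here spells out the contract of insert(): hypotheses are shifted by offset into absolute time, and only the tail that starts after (roughly) last_commited_time is kept in self.new. A minimal standalone sketch of that filtering step, using the same (beg, end, "word") tuples and 0.1 s tolerance as the lines above (the function name is illustrative, not part of the module):

# Standalone sketch of the filtering described in the new docstring.
# Words are (beg, end, "word") tuples, as in the diff above.
def filter_new_hypothesis(new, offset, last_commited_time, tolerance=0.1):
    # shift the hypothesis into absolute time, then keep only the tail
    # that starts after the last committed word (minus a small tolerance)
    shifted = [(a + offset, b + offset, t) for a, b, t in new]
    return [(a, b, t) for a, b, t in shifted if a > last_commited_time - tolerance]

# e.g. with last_commited_time = 2.0, words ending before ~1.9 s are dropped:
# filter_new_hypothesis([(0.5, 1.0, "hello"), (2.0, 2.4, "world")], 0.0, 2.0)
# -> [(2.0, 2.4, "world")]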
@@ -77,6 +79,9 @@ class HypothesisBuffer:
         return self.buffer
 
 
+
+
+
 class OnlineASRProcessor:
 
     SAMPLING_RATE = 16000
@@ -128,7 +133,9 @@ class OnlineASRProcessor:
         if offset is not None:
             self.buffer_time_offset = offset
         self.transcript_buffer.last_commited_time = self.buffer_time_offset
-        self.commited = []
+        self.final_transcript = []
+        self.commited_not_final = []
+
 
     def insert_audio_chunk(self, audio):
         self.audio_buffer = np.append(self.audio_buffer, audio)
@@ -136,23 +143,42 @@ class OnlineASRProcessor:
     def prompt(self):
         """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
         "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+        """
+
+        if len(self.final_transcript) == 0:
+            prompt=""
+
+        if len(self.final_transcript) == 1:
+            prompt = self.final_transcript[0][2][-200:]
+
+        else:
+            prompt = self.concatenate_tsw(self.final_transcript)[2][-200:]
+            # TODO: this is not ideal as we concatenate each time the whole transcript
+
+        # k = max(0, len(self.final_transcript) - 1)
+        # while k > 1 and self.final_transcript[k - 1][1] > self.buffer_time_offset:
+        #     k -= 1
+
+        # p = self.final_transcript[:k]
+
+
+        # p = [t for _, _, t in p]
+        # prompt = []
+        # l = 0
+        # while p and l < 200:  # 200 characters prompt size
+        #     x = p.pop(-1)
+        #     l += len(x) + 1
+        #     prompt.append(x)
+
+        non_prompt = self.concatenate_tsw(self.commited_not_final)[2]
+
+        logger.debug(f"PROMPT(previous): {prompt[:20]}...{prompt[-20:]} (length={len(prompt)}chars)")
+        logger.debug(f"CONTEXT: {non_prompt}")
+
+        return prompt, non_prompt
+
 
     def process_iter(self):
         """Runs on the current audio buffer.
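Per the docstring, the prompt is a 200-character suffix of text that is already final (scrolled out of the audio buffer), while the context is the committed-but-not-final text that will be transcribed again. A rough sketch of the same split on plain (beg, end, text) tuples; the helper name is illustrative, and a plain join stands in for the module's concatenate_tsw():

# Illustrative only: mimics the prompt/context split described above.
def split_prompt_and_context(final_transcript, commited_not_final, sep=" "):
    full_final = sep.join(t for _, _, t in final_transcript)
    prompt = full_final[-200:]                                 # 200-character suffix of final text
    context = sep.join(t for _, _, t in commited_not_final)    # part that is re-transcribed
    return prompt, context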
@@ -161,93 +187,111 @@ class OnlineASRProcessor:
         """
 
         prompt, non_prompt = self.prompt()
-
-        logger.debug(f"CONTEXT: {non_prompt}")
+
         logger.debug(
             f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
         )
-        res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
-
+        ## Transcribe and format the result to [(beg,end,"word1"), ...]
+        res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
         tsw = self.asr.ts_words(res)
-
+
+
+        # insert into HypothesisBuffer, and get back the commited words
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
-
-
-
-
-        logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
-        ## The rest is incomplete
-        the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
-        logger.debug(f"INCOMPLETE: {the_rest[2]}")
 
-        # there is a newly confirmed text
-
-        self.chunk_completed_sentence(self.commited)
-
-
-        # TODO: only complete sentences should go to completed
-
-
-
-                " Even if no sentence was found!"
-            )
-
-
-
-
-        # alternative: on any word
-        # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
-        # let's find commited word that is less
-        # k = len(self.commited)-1
-        # while k>0 and self.commited[k][1] > l:
-        #     k -= 1
-        # t = self.commited[k][1]
-        # self.chunk_at(t)
-
-        return completed
-
-        if self.commited == []:
-            return
-        raw_text = self.asr.sep.join([s[2] for s in self.commited])
-        logger.debug(f"COMPLETED SENTENCE: {raw_text}")
-        sents = self.words_to_sentences(self.commited)
-
-        if len(sents) < 2:
-            logger.debug(f"[Sentence-segmentation] no sentence segmented.")
-            return
-
-
-
-        # we will continue with audio processing at this timestamp
-        chunk_at = sents[-2][1]
-
-        self.chunk_at(chunk_at)
+        commited_tsw = self.transcript_buffer.flush()
+
+        if len(commited_tsw) == 0:
+            return (None, None, "")
+
+        self.commited_not_final.extend(commited_tsw)
+
+        # Define `completed` and `the_rest` based on the buffer_trimming_way
+        # completed will be returned at the end of the function.
+        # completed is a transcribed text with (beg,end,"sentence ...") format.
+
+        if self.buffer_trimming_way == "sentence":
+
+            sentences = self.words_to_sentences(self.commited_not_final)
+
+            if len(sentences) < 2:
+                logger.debug(f"[Sentence-segmentation] no full sentence segmented, do not commit anything.")
+                completed = []
+
+            else:
+                identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sentences])
+                logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
+
+                # assume last sentence is incomplete, which is not always true
+
+                # we will continue with audio processing at this timestamp
+                chunk_at = sentences[-2][1]
+
+                self.chunk_at(chunk_at)
+                # TODO: here paragraph breaks can be added
+                self.commited_not_final = sentences[-1:]
+
+                completed= sentences[:-1]
+
+        else:
+
+            # break audio buffer anyway if it is too long
+
+            if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec :
+
+                if self.buffer_trimming_way == "sentence":
+                    logger.warning(f"Chunck segment after {self.buffer_trimming_sec} seconds!"
+                                   " Even if no sentence was found!"
+                                   )
+
+                completed = self.concatenate_tsw(self.commited_not_final)
+                self.commited_not_final = []
+                self.chunk_completed_segment(res)
+                # TODO: I don't know if res is the correct variable to pass here
+            else:
+                completed = []
+
+        if len(completed) == 0:
+            return (None, None, "")
+        else:
+            self.final_transcript.extend(completed)  # add whole time stamped sentences / or words to commited list
+
+        completed_text_segment= self.concatenate_tsw(completed)
+
+        the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
+        commited_but_not_final = self.concatenate_tsw(self.commited_not_final)
+        logger.debug(f"\n COMPLETE NOW: {completed_text_segment[2]}\n"
+                     f" COMMITTED (but not Final): {commited_but_not_final[2]}\n"
+                     f" INCOMPLETE: {the_rest[2]}"
+                     )
+
+        return completed_text_segment
 
     def chunk_completed_segment(self, res):
-        if self.commited == []:
+        if self.final_transcript == []:
             return
 
         ends = self.asr.segments_end_ts(res)
 
-        t = self.commited[-1][1]
+        t = self.final_transcript[-1][1]
 
         if len(ends) <= 1:
             logger.debug(f"--- not enough segments to chunk (<=1 words)")
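The rewritten process_iter() flushes the hypothesis buffer into commited_not_final and, in "sentence" trimming mode, finalises every fully segmented sentence while keeping the (possibly incomplete) last sentence pending, trimming the audio buffer at the end of the second-to-last sentence. A sketch of just that decision, assuming a words_to_sentences-style splitter that returns (beg, end, "sentence") tuples (the function and its return shape are illustrative):

# Sketch of the "sentence" trimming decision added to process_iter().
# words_to_sentences stands for any splitter returning (beg, end, "sentence") tuples.
def commit_full_sentences(commited_not_final, words_to_sentences):
    sentences = words_to_sentences(commited_not_final)
    if len(sentences) < 2:
        # nothing can be finalised yet; keep waiting for a complete sentence
        return [], commited_not_final, None
    # assume the last sentence may still be incomplete: keep it pending,
    # finalise the rest, and continue audio processing after the end
    # timestamp of the second-to-last sentence
    completed = sentences[:-1]
    pending = sentences[-1:]
    chunk_at = sentences[-2][1]
    return completed, pending, chunk_at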
@@ -287,9 +331,11 @@ class OnlineASRProcessor:
         Returns: [(beg,end,"sentence 1"),...]
         """
 
+
         cwords = [w for w in words]
         t = self.asr.sep.join(o[2] for o in cwords)
         logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
+
         s = self.tokenize([t])
         out = []
         while s:
@@ -302,11 +348,13 @@ class OnlineASRProcessor:
                 w = w.strip()
                 if beg is None and sent.startswith(w):
                     beg = b
-
+                if end is None and sent == w:
                     end = e
+                if beg is not None and end is not None:
                     out.append((beg, end, fsent))
                     break
                 sent = sent[len(w) :].strip()
+
         return out
 
     def finish(self):
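words_to_sentences() walks the tokenized sentences and the timestamped words in parallel: beg comes from the first word of a sentence, end from the word that exhausts it, and the new guard only emits a sentence once both have been found. A self-contained sketch of that alignment idea (tokenize is any callable that maps [text] to a list of sentence strings; the names are illustrative):

# Standalone sketch of aligning sentence text back to word timestamps,
# following the beg/end logic in words_to_sentences above.
def align_sentences(words, tokenize, sep=" "):
    text = sep.join(w for _, _, w in words)
    out = []
    remaining = list(words)
    for sent in tokenize([text]):
        target, beg, end = sent.strip(), None, None
        while remaining:
            b, e, w = remaining.pop(0)
            w = w.strip()
            if beg is None and target.startswith(w):
                beg = b                      # first word of the sentence
            if end is None and target == w:
                end = e                      # word that exhausts the sentence
            if beg is not None and end is not None:
                out.append((beg, end, sent.strip()))
                break
            target = target[len(w):].strip()
    return out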
@@ -330,7 +378,7 @@ class OnlineASRProcessor:
         # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
         if sep is None:
             sep = self.asr.sep
-
+
         t = sep.join(s[2] for s in tsw)
         if len(tsw) == 0:
             b = None
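Taken together, the commit makes process_iter() return whole committed sentences (or (None, None, "")) instead of raw committed words in "sentence" trimming mode. A hedged usage sketch of how the processor is typically driven; the import path follows the file location, while the constructor arguments (asr backend, tokenizer, buffer_trimming) are assumptions not shown in this diff:

# Hedged usage sketch: constructor arguments are assumed, method names come
# from the file changed in this commit.
from whisper_streaming.online_asr import OnlineASRProcessor

def run_stream(asr, tokenize, chunks):
    online = OnlineASRProcessor(asr, tokenize, buffer_trimming=("sentence", 15))
    for chunk in chunks:                  # chunk: 16 kHz float32 audio as a numpy array
        online.insert_audio_chunk(chunk)
        beg, end, text = online.process_iter()
        if text:                          # only whole sentences are committed in "sentence" mode
            print(f"{beg:.2f}-{end:.2f} {text}")
    print(online.finish())                # flush whatever is still in the buffer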