SilasKieser commited on
Commit
a4ee6eb
·
1 Parent(s): 02f90cf

Sentence trimming mode works again!

Browse files
Files changed (1) hide show
  1. src/whisper_streaming/online_asr.py +125 -77
src/whisper_streaming/online_asr.py CHANGED
@@ -17,8 +17,10 @@ class HypothesisBuffer:
17
  self.logfile = logfile
18
 
19
  def insert(self, new, offset):
20
- # compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
21
- # the new tail is added to self.new
 
 
22
 
23
  new = [(a + offset, b + offset, t) for a, b, t in new]
24
  self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1]
@@ -77,6 +79,9 @@ class HypothesisBuffer:
77
  return self.buffer
78
 
79
 
 
 
 
80
  class OnlineASRProcessor:
81
 
82
  SAMPLING_RATE = 16000
@@ -128,7 +133,9 @@ class OnlineASRProcessor:
128
  if offset is not None:
129
  self.buffer_time_offset = offset
130
  self.transcript_buffer.last_commited_time = self.buffer_time_offset
131
- self.commited = []
 
 
132
 
133
  def insert_audio_chunk(self, audio):
134
  self.audio_buffer = np.append(self.audio_buffer, audio)
@@ -136,23 +143,42 @@ class OnlineASRProcessor:
136
  def prompt(self):
137
  """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
138
  "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
139
- """
140
- k = max(0, len(self.commited) - 1)
141
- while k > 0 and self.commited[k - 1][1] > self.buffer_time_offset:
142
- k -= 1
143
-
144
- p = self.commited[:k]
145
- p = [t for _, _, t in p]
146
- prompt = []
147
- l = 0
148
- while p and l < 200: # 200 characters prompt size
149
- x = p.pop(-1)
150
- l += len(x) + 1
151
- prompt.append(x)
152
- non_prompt = self.commited[k:]
153
- return self.asr.sep.join(prompt[::-1]), self.asr.sep.join(
154
- t for _, _, t in non_prompt
155
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  def process_iter(self):
158
  """Runs on the current audio buffer.
@@ -161,93 +187,111 @@ class OnlineASRProcessor:
161
  """
162
 
163
  prompt, non_prompt = self.prompt()
164
- logger.debug(f"PROMPT(previous): {prompt}")
165
- logger.debug(f"CONTEXT: {non_prompt}")
166
  logger.debug(
167
  f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
168
  )
169
- res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
170
 
171
- # transform to [(beg,end,"word1"), ...]
 
172
  tsw = self.asr.ts_words(res)
173
- # insert into HypothesisBuffer
 
 
174
  self.transcript_buffer.insert(tsw, self.buffer_time_offset)
175
- o = self.transcript_buffer.flush()
176
- # Completed words
177
- self.commited.extend(o)
178
- completed = self.concatenate_tsw(o) # This will be returned at the end of the function
179
- logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
180
- ## The rest is incomplete
181
- the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
182
- logger.debug(f"INCOMPLETE: {the_rest[2]}")
183
 
184
- # there is a newly confirmed text
185
 
186
- if self.buffer_trimming_way == "sentence":
187
 
188
- self.chunk_completed_sentence(self.commited)
189
 
190
-
 
 
191
 
192
- # TODO: new words in `completed` should not be reterned unless they form a sentence
193
- # TODO: only complete sentences should go to completed
194
 
 
 
195
 
196
- if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec :
 
 
197
 
198
- if self.buffer_trimming_way == "sentence":
199
- logger.warning(f"Chunck segment after {self.buffer_trimming_sec} seconds!"
200
- " Even if no sentence was found!"
201
- )
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- self.chunk_completed_segment(res)
205
-
206
-
207
-
208
- # alternative: on any word
209
- # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
210
- # let's find commited word that is less
211
- # k = len(self.commited)-1
212
- # while k>0 and self.commited[k][1] > l:
213
- # k -= 1
214
- # t = self.commited[k][1]
215
- # self.chunk_at(t)
216
 
217
-
218
- return completed
219
 
220
- def chunk_completed_sentence(self):
221
- if self.commited == []:
222
- return
223
- raw_text = self.asr.sep.join([s[2] for s in self.commited])
224
- logger.debug(f"COMPLETED SENTENCE: {raw_text}")
225
- sents = self.words_to_sentences(self.commited)
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
- if len(sents) < 2:
229
- logger.debug(f"[Sentence-segmentation] no sentence segmented.")
230
- return
231
 
232
 
 
 
 
 
 
233
 
234
- identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sents])
235
- logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
236
-
 
 
 
 
 
237
 
238
- # we will continue with audio processing at this timestamp
239
- chunk_at = sents[-2][1]
240
 
 
241
 
242
- self.chunk_at(chunk_at)
243
 
244
  def chunk_completed_segment(self, res):
245
- if self.commited == []:
246
  return
247
 
248
  ends = self.asr.segments_end_ts(res)
249
 
250
- t = self.commited[-1][1]
251
 
252
  if len(ends) <= 1:
253
  logger.debug(f"--- not enough segments to chunk (<=1 words)")
@@ -287,9 +331,11 @@ class OnlineASRProcessor:
287
  Returns: [(beg,end,"sentence 1"),...]
288
  """
289
 
 
290
  cwords = [w for w in words]
291
  t = self.asr.sep.join(o[2] for o in cwords)
292
  logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
 
293
  s = self.tokenize([t])
294
  out = []
295
  while s:
@@ -302,11 +348,13 @@ class OnlineASRProcessor:
302
  w = w.strip()
303
  if beg is None and sent.startswith(w):
304
  beg = b
305
- elif end is None and sent == w:
306
  end = e
 
307
  out.append((beg, end, fsent))
308
  break
309
  sent = sent[len(w) :].strip()
 
310
  return out
311
 
312
  def finish(self):
@@ -330,7 +378,7 @@ class OnlineASRProcessor:
330
  # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
331
  if sep is None:
332
  sep = self.asr.sep
333
-
334
  t = sep.join(s[2] for s in tsw)
335
  if len(tsw) == 0:
336
  b = None
 
17
  self.logfile = logfile
18
 
19
  def insert(self, new, offset):
20
+ """
21
 + Compare self.commited_in_buffer and new. Insert only the words in new that extend commited_in_buffer, i.e. words that are roughly behind last_commited_time and new in content.
22
+ The new tail is added to self.new
23
+ """
24
 
25
  new = [(a + offset, b + offset, t) for a, b, t in new]
26
  self.new = [(a, b, t) for a, b, t in new if a > self.last_commited_time - 0.1]
 
79
  return self.buffer
80
 
81
 
82
+
83
+
84
+
85
  class OnlineASRProcessor:
86
 
87
  SAMPLING_RATE = 16000
 
133
  if offset is not None:
134
  self.buffer_time_offset = offset
135
  self.transcript_buffer.last_commited_time = self.buffer_time_offset
136
+ self.final_transcript = []
137
+ self.commited_not_final = []
138
+
139
 
140
  def insert_audio_chunk(self, audio):
141
  self.audio_buffer = np.append(self.audio_buffer, audio)
 
143
  def prompt(self):
144
  """Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
145
  "context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
146
+
147
+
148
+ """
149
+
150
+ if len(self.final_transcript) == 0:
151
+ prompt=""
152
+
153
+ if len(self.final_transcript) == 1:
154
+ prompt = self.final_transcript[0][2][-200:]
155
+
156
+ else:
157
+ prompt = self.concatenate_tsw(self.final_transcript)[2][-200:]
158
 + # TODO: this is not ideal, as we re-concatenate the whole transcript on every call
159
+
160
+ # k = max(0, len(self.final_transcript) - 1)
161
+ # while k > 1 and self.final_transcript[k - 1][1] > self.buffer_time_offset:
162
+ # k -= 1
163
+
164
+ # p = self.final_transcript[:k]
165
+
166
+
167
+ # p = [t for _, _, t in p]
168
+ # prompt = []
169
+ # l = 0
170
+ # while p and l < 200: # 200 characters prompt size
171
+ # x = p.pop(-1)
172
+ # l += len(x) + 1
173
+ # prompt.append(x)
174
+
175
+ non_prompt = self.concatenate_tsw(self.commited_not_final)[2]
176
+
177
+ logger.debug(f"PROMPT(previous): {prompt[:20]}...{prompt[-20:]} (length={len(prompt)}chars)")
178
+ logger.debug(f"CONTEXT: {non_prompt}")
179
+
180
+ return prompt, non_prompt
181
+
182
 
183
  def process_iter(self):
184
  """Runs on the current audio buffer.
 
187
  """
188
 
189
  prompt, non_prompt = self.prompt()
190
+
 
191
  logger.debug(
192
  f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
193
  )
 
194
 
195
+ ## Transcribe and format the result to [(beg,end,"word1"), ...]
196
+ res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
197
  tsw = self.asr.ts_words(res)
198
+
199
+
200
+ # insert into HypothesisBuffer, and get back the commited words
201
  self.transcript_buffer.insert(tsw, self.buffer_time_offset)
202
+ commited_tsw = self.transcript_buffer.flush()
203
+
204
+ if len(commited_tsw) == 0:
205
+ return (None, None, "")
 
 
 
 
206
 
 
207
 
208
+ self.commited_not_final.extend(commited_tsw)
209
 
 
210
 
211
+ # Define `completed` and `the_rest` based on the buffer_trimming_way
212
+ # completed will be returned at the end of the function.
213
+ # completed is a transcribed text with (beg,end,"sentence ...") format.
214
 
215
+ if self.buffer_trimming_way == "sentence":
 
216
 
217
+ sentences = self.words_to_sentences(self.commited_not_final)
218
+
219
 
220
+
221
+ if len(sentences) < 2:
222
+ logger.debug(f"[Sentence-segmentation] no full sentence segmented, do not commit anything.")
223
 
224
+ completed = []
225
+
 
 
226
 
227
+ else:
228
+ identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sentences])
229
+ logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
230
+
231
 + # Assume the last sentence is incomplete; this is not always true.
232
+
233
+ # we will continue with audio processing at this timestamp
234
+ chunk_at = sentences[-2][1]
235
+
236
+ self.chunk_at(chunk_at)
237
+ # TODO: here paragraph breaks can be added
238
+ self.commited_not_final = sentences[-1:]
239
+
240
+ completed= sentences[:-1]
241
+
242
 
243
+ else:
 
 
 
 
 
 
 
 
 
 
 
244
 
 
 
245
 
246
+ # break audio buffer anyway if it is too long
 
 
 
 
 
247
 
248
+ if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec :
249
+
250
+ if self.buffer_trimming_way == "sentence":
251
+ logger.warning(f"Chunck segment after {self.buffer_trimming_sec} seconds!"
252
+ " Even if no sentence was found!"
253
+ )
254
+
255
+
256
+
257
+
258
+
259
+ completed = self.concatenate_tsw(self.commited_not_final)
260
+ self.commited_not_final = []
261
+ self.chunk_completed_segment(res)
262
+ # TODO: I don't know if res is the correct variable to pass here
263
+ else:
264
+ completed = []
265
+
266
 
 
 
 
267
 
268
 
269
+ if len(completed) == 0:
270
+ return (None, None, "")
271
+ else:
272
+ self.final_transcript.extend(completed) # add whole time stamped sentences / or words to commited list
273
+
274
 
275
+ completed_text_segment= self.concatenate_tsw(completed)
276
+
277
+ the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
278
+ commited_but_not_final = self.concatenate_tsw(self.commited_not_final)
279
+ logger.debug(f"\n COMPLETE NOW: {completed_text_segment[2]}\n"
280
+ f" COMMITTED (but not Final): {commited_but_not_final[2]}\n"
281
+ f" INCOMPLETE: {the_rest[2]}"
282
+ )
283
 
 
 
284
 
285
+ return completed_text_segment
286
 
 
287
 
288
  def chunk_completed_segment(self, res):
289
+ if self.final_transcript == []:
290
  return
291
 
292
  ends = self.asr.segments_end_ts(res)
293
 
294
+ t = self.final_transcript[-1][1]
295
 
296
  if len(ends) <= 1:
297
  logger.debug(f"--- not enough segments to chunk (<=1 words)")
 
331
  Returns: [(beg,end,"sentence 1"),...]
332
  """
333
 
334
+
335
  cwords = [w for w in words]
336
  t = self.asr.sep.join(o[2] for o in cwords)
337
  logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
338
+
339
  s = self.tokenize([t])
340
  out = []
341
  while s:
 
348
  w = w.strip()
349
  if beg is None and sent.startswith(w):
350
  beg = b
351
+ if end is None and sent == w:
352
  end = e
353
+ if beg is not None and end is not None:
354
  out.append((beg, end, fsent))
355
  break
356
  sent = sent[len(w) :].strip()
357
+
358
  return out
359
 
360
  def finish(self):
 
378
  # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
379
  if sep is None:
380
  sep = self.asr.sep
381
+
382
  t = sep.join(s[2] for s in tsw)
383
  if len(tsw) == 0:
384
  b = None