qfuxa commited on
Commit
1cb2d95
·
2 Parent(s): beb3190 944f370

Merge remote-tracking branch 'contrib/fix-sentencesegmenter'

Browse files
.gitignore CHANGED
@@ -127,3 +127,6 @@ dmypy.json
127
 
128
  # Pyre type checker
129
  .pyre/
 
 
 
 
127
 
128
  # Pyre type checker
129
  .pyre/
130
+
131
+ *.wav
132
+ run_*.sh
src/whisper_streaming/online_asr.py CHANGED
@@ -87,11 +87,20 @@ class OnlineASRProcessor:
87
  buffer_trimming=("segment", 15),
88
  logfile=sys.stderr,
89
  ):
90
- """asr: WhisperASR object
91
- tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
92
- ("segment", 15)
93
- buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
94
- logfile: where to store the log.
 
 
 
 
 
 
 
 
 
95
  """
96
  self.asr = asr
97
  self.tokenize = tokenize_method
@@ -142,7 +151,7 @@ class OnlineASRProcessor:
142
  """
143
 
144
  prompt, non_prompt = self.prompt()
145
- logger.debug(f"PROMPT: {prompt}")
146
  logger.debug(f"CONTEXT: {non_prompt}")
147
  logger.debug(
148
  f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
@@ -199,22 +208,27 @@ class OnlineASRProcessor:
199
 
200
  def chunk_completed_sentence(self):
201
  if self.commited == []:
202
- return
203
-
204
- import pdb; pdb.set_trace()
205
  raw_text = self.asr.sep.join([s[2] for s in self.commited])
206
  logger.debug(f"COMPLETED SENTENCE: {raw_text}")
207
  sents = self.words_to_sentences(self.commited)
208
- for s in sents:
209
- logger.debug(f"\t\tSENT: {s}")
 
210
  if len(sents) < 2:
 
211
  return
212
- while len(sents) > 2:
213
- sents.pop(0)
 
 
 
 
 
214
  # we will continue with audio processing at this timestamp
215
  chunk_at = sents[-2][1]
216
 
217
- logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
218
  self.chunk_at(chunk_at)
219
 
220
  def chunk_completed_segment(self, res):
@@ -253,7 +267,8 @@ class OnlineASRProcessor:
253
 
254
  cwords = [w for w in words]
255
  t = self.asr.sep.join(o[2] for o in cwords)
256
- s = self.tokenize(t)
 
257
  out = []
258
  while s:
259
  beg = None
@@ -278,7 +293,7 @@ class OnlineASRProcessor:
278
  """
279
  o = self.transcript_buffer.complete()
280
  f = self.to_flush(o)
281
- logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}")
282
  self.buffer_time_offset += len(self.audio_buffer) / 16000
283
  return f
284
 
 
87
  buffer_trimming=("segment", 15),
88
  logfile=sys.stderr,
89
  ):
90
+ """
91
+ Initialize OnlineASRProcessor.
92
+
93
+ Args:
94
+ asr: WhisperASR object
95
+ tokenize_method: Sentence tokenizer function for the target language.
96
+ Must be a function that takes a list of text as input like MosesSentenceSplitter.
97
+ Can be None if using "segment" buffer trimming option.
98
+ buffer_trimming: Tuple of (option, seconds) where:
99
+ - option: Either "sentence" or "segment"
100
+ - seconds: Number of seconds threshold for buffer trimming
101
+ Default is ("segment", 15)
102
+ logfile: File to store logs
103
+
104
  """
105
  self.asr = asr
106
  self.tokenize = tokenize_method
 
151
  """
152
 
153
  prompt, non_prompt = self.prompt()
154
+ logger.debug(f"PROMPT(previous): {prompt}")
155
  logger.debug(f"CONTEXT: {non_prompt}")
156
  logger.debug(
157
  f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
 
208
 
209
  def chunk_completed_sentence(self):
210
  if self.commited == []:
211
+ return
 
 
212
  raw_text = self.asr.sep.join([s[2] for s in self.commited])
213
  logger.debug(f"COMPLETED SENTENCE: {raw_text}")
214
  sents = self.words_to_sentences(self.commited)
215
+
216
+
217
+
218
  if len(sents) < 2:
219
+ logger.debug(f"[Sentence-segmentation] no sentence segmented.")
220
  return
221
+
222
+
223
+
224
+ identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sents])
225
+ logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
226
+
227
+
228
  # we will continue with audio processing at this timestamp
229
  chunk_at = sents[-2][1]
230
 
231
+ logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}")
232
  self.chunk_at(chunk_at)
233
 
234
  def chunk_completed_segment(self, res):
 
267
 
268
  cwords = [w for w in words]
269
  t = self.asr.sep.join(o[2] for o in cwords)
270
+ logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
271
+ s = self.tokenize([t])
272
  out = []
273
  while s:
274
  beg = None
 
293
  """
294
  o = self.transcript_buffer.complete()
295
  f = self.to_flush(o)
296
+ logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2][0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}")
297
  self.buffer_time_offset += len(self.audio_buffer) / 16000
298
  return f
299
 
src/whisper_streaming/whisper_online.py CHANGED
@@ -49,9 +49,9 @@ def create_tokenizer(lan):
49
  lan
50
  in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split()
51
  ):
52
- from mosestokenizer import MosesTokenizer
53
 
54
- return MosesTokenizer(lan)
55
 
56
  # the following languages are in Whisper, but not in wtpsplit:
57
  if (
@@ -204,6 +204,7 @@ def backend_factory(args):
204
 
205
  # Create the tokenizer
206
  if args.buffer_trimming == "sentence":
 
207
  tokenizer = create_tokenizer(tgt_language)
208
  else:
209
  tokenizer = None
@@ -235,10 +236,12 @@ def asr_factory(args, logfile=sys.stderr):
235
  online = online_factory(args, asr, tokenizer, logfile=logfile)
236
  return asr, online
237
 
238
- def set_logging(args, logger, other="_server"):
239
  logging.basicConfig(format="%(levelname)s\t%(message)s") # format='%(name)s
240
  logger.setLevel(args.log_level)
241
- logging.getLogger("whisper_online" + other).setLevel(args.log_level)
 
 
242
 
243
 
244
  # logging.getLogger("whisper_online_server").setLevel(args.log_level)
@@ -275,7 +278,7 @@ if __name__ == "__main__":
275
  args = parser.parse_args()
276
 
277
  # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
278
- logfile = sys.stderr
279
 
280
  if args.offline and args.comp_unaware:
281
  logger.error(
@@ -287,7 +290,7 @@ if __name__ == "__main__":
287
  # logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
288
  # level=getattr(logging, args.log_level))
289
 
290
- set_logging(args, logger)
291
 
292
  audio_path = args.audio_path
293
 
@@ -320,15 +323,18 @@ if __name__ == "__main__":
320
  if now is None:
321
  now = time.time() - start
322
  if o[0] is not None:
323
- print(
324
- "%1.4f %1.0f %1.0f %s" % (now * 1000, o[0] * 1000, o[1] * 1000, o[2]),
325
- file=logfile,
326
- flush=True,
327
- )
328
- print(
329
- "%1.4f %1.0f %1.0f %s" % (now * 1000, o[0] * 1000, o[1] * 1000, o[2]),
330
- flush=True,
331
  )
 
 
 
 
 
 
 
332
  else:
333
  # No text, so no output
334
  pass
 
49
  lan
50
  in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split()
51
  ):
52
+ from mosestokenizer import MosesSentenceSplitter
53
 
54
+ return MosesSentenceSplitter(lan)
55
 
56
  # the following languages are in Whisper, but not in wtpsplit:
57
  if (
 
204
 
205
  # Create the tokenizer
206
  if args.buffer_trimming == "sentence":
207
+
208
  tokenizer = create_tokenizer(tgt_language)
209
  else:
210
  tokenizer = None
 
236
  online = online_factory(args, asr, tokenizer, logfile=logfile)
237
  return asr, online
238
 
239
+ def set_logging(args, logger, others=[]):
240
  logging.basicConfig(format="%(levelname)s\t%(message)s") # format='%(name)s
241
  logger.setLevel(args.log_level)
242
+
243
+ for other in others:
244
+ logging.getLogger(other).setLevel(args.log_level)
245
 
246
 
247
  # logging.getLogger("whisper_online_server").setLevel(args.log_level)
 
278
  args = parser.parse_args()
279
 
280
  # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
281
+ logfile = None # sys.stderr
282
 
283
  if args.offline and args.comp_unaware:
284
  logger.error(
 
290
  # logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
291
  # level=getattr(logging, args.log_level))
292
 
293
+ set_logging(args, logger,others=["src.whisper_streaming.online_asr"])
294
 
295
  audio_path = args.audio_path
296
 
 
323
  if now is None:
324
  now = time.time() - start
325
  if o[0] is not None:
326
+ log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
327
+
328
+ logger.debug(
329
+ log_string
 
 
 
 
330
  )
331
+
332
+ if logfile is not None:
333
+ print(
334
+ log_string,
335
+ file=logfile,
336
+ flush=True,
337
+ )
338
  else:
339
  # No text, so no output
340
  pass