qfuxa committed
Commit 33573ca · 1 Parent(s): 1ebc0b3

Improve sentence tokenization handling: support tokenizers like MosesSentenceSplitter that expect list input

Files changed (1):
  1. src/whisper_streaming/online_asr.py (+18 −4)
src/whisper_streaming/online_asr.py CHANGED
@@ -263,11 +263,26 @@ class OnlineASRProcessor:
 
     def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
         """
-        Converts a list of tokens to a list of Sentence objects by using the provided
+        Converts a list of tokens to a list of Sentence objects using the provided
         sentence tokenizer.
         """
+        if not tokens:
+            return []
+
         full_text = " ".join(token.text for token in tokens)
+
+        if self.tokenize:
+            try:
+                sentence_texts = self.tokenize(full_text)
+            except Exception as e:
+                # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
+                try:
+                    sentence_texts = self.tokenize([full_text])
+                except Exception as e2:
+                    raise ValueError("Tokenization failed") from e2
+        else:
+            sentence_texts = [full_text]
+
         sentences: List[Sentence] = []
         token_index = 0
         for sent_text in sentence_texts:
@@ -276,7 +291,7 @@
                 continue
             sent_tokens = []
             accumulated = ""
-            # Accumulate tokens until roughly matching the sentence text.
+            # Accumulate tokens until roughly matching the length of the sentence text.
             while token_index < len(tokens) and len(accumulated) < len(sent_text):
                 token = tokens[token_index]
                 accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
@@ -290,7 +305,6 @@
             )
             sentences.append(sentence)
         return sentences
-
    def finish(self) -> Transcript:
        """
        Flush the remaining transcript when processing ends.
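
For context, here is a minimal, self-contained sketch of the fallback pattern this commit introduces. ListOnlySplitter is a hypothetical stand-in for a splitter such as mosestokenizer's MosesSentenceSplitter, which is called with a list of text lines rather than a bare string; split_sentences mirrors the try/except logic from the diff above.

from typing import Callable, List


class ListOnlySplitter:
    """Hypothetical stand-in for a splitter that rejects bare-string input."""

    def __call__(self, lines: List[str]) -> List[str]:
        if isinstance(lines, str):
            raise TypeError("expected a list of lines, got str")
        sentences: List[str] = []
        for line in lines:
            # Naive period-based split, just to return something sentence-like.
            sentences.extend(s.strip() + "." for s in line.split(".") if s.strip())
        return sentences


def split_sentences(tokenize: Callable, full_text: str) -> List[str]:
    """Mirror of the commit's fallback: call with a string first, then retry
    with a one-element list for tokenizers that require list input."""
    try:
        return tokenize(full_text)
    except Exception:
        try:
            return tokenize([full_text])
        except Exception as e2:
            raise ValueError("Tokenization failed") from e2


if __name__ == "__main__":
    text = "Hello world. This is streaming ASR."
    print(split_sentences(ListOnlySplitter(), text))
    # -> ['Hello world.', 'This is streaming ASR.']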
 
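And a toy walk-through of the (unchanged) accumulation loop, which greedily assigns tokens to each sentence until the accumulated text roughly matches the sentence length. Token is an illustrative stand-in for the repo's ASRToken, and the output tuples stand in for its Sentence objects; field names here are assumptions.

from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class Token:
    text: str
    start: float
    end: float


def words_to_sentences(tokens: List[Token],
                       sentence_texts: List[str]) -> List[Tuple[float, float, str]]:
    sentences: List[Tuple[float, float, str]] = []
    token_index = 0
    for sent_text in sentence_texts:
        sent_text = sent_text.strip()
        if not sent_text:
            continue
        sent_tokens = []
        accumulated = ""
        # Accumulate tokens until roughly matching the length of the sentence text.
        while token_index < len(tokens) and len(accumulated) < len(sent_text):
            token = tokens[token_index]
            accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
            sent_tokens.append(token)
            token_index += 1
        if sent_tokens:
            # (start of first token, end of last token, accumulated text)
            sentences.append((sent_tokens[0].start, sent_tokens[-1].end, accumulated))
    return sentences


tokens = [Token("Hello", 0.0, 0.4), Token("world.", 0.5, 0.9), Token("Bye.", 1.2, 1.5)]
print(words_to_sentences(tokens, ["Hello world.", "Bye."]))
# -> [(0.0, 0.9, 'Hello world.'), (1.2, 1.5, 'Bye.')]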