Improve sentence tokenization handling - MosesSentenceSplitter now works with list input
src/whisper_streaming/online_asr.py
@@ -263,11 +263,26 @@ class OnlineASRProcessor:
 
     def words_to_sentences(self, tokens: List[ASRToken]) -> List[Sentence]:
         """
-        Converts a list of tokens to a list of Sentence objects
+        Converts a list of tokens to a list of Sentence objects using the provided
         sentence tokenizer.
         """
+        if not tokens:
+            return []
+
         full_text = " ".join(token.text for token in tokens)
-        sentence_texts = self.tokenize(full_text) if self.tokenize else [full_text]
+
+        if self.tokenize:
+            try:
+                sentence_texts = self.tokenize(full_text)
+            except Exception as e:
+                # Some tokenizers (e.g., MosesSentenceSplitter) expect a list input.
+                try:
+                    sentence_texts = self.tokenize([full_text])
+                except Exception as e2:
+                    raise ValueError("Tokenization failed") from e2
+        else:
+            sentence_texts = [full_text]
+
         sentences: List[Sentence] = []
         token_index = 0
         for sent_text in sentence_texts:
@@ -276,7 +291,7 @@ class OnlineASRProcessor:
                 continue
             sent_tokens = []
             accumulated = ""
-            # Accumulate tokens until roughly matching the sentence text.
+            # Accumulate tokens until roughly matching the length of the sentence text.
             while token_index < len(tokens) and len(accumulated) < len(sent_text):
                 token = tokens[token_index]
                 accumulated = (accumulated + " " + token.text).strip() if accumulated else token.text
@@ -290,7 +305,6 @@ class OnlineASRProcessor:
             )
             sentences.append(sentence)
         return sentences
-
     def finish(self) -> Transcript:
         """
         Flush the remaining transcript when processing ends.
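
Note on the fallback: most sentence tokenizers accept a plain string, but mosestokenizer's MosesSentenceSplitter is called with a list of lines (roughly MosesSentenceSplitter("en")(["Hello world. How are you?"])), so the first tokenize(full_text) attempt raises and the retry with [full_text] succeeds. Below is a minimal sketch of the retry pattern with two hypothetical stand-in tokenizers; plain_split and moses_like are illustrations only, not part of this repo or of mosestokenizer.

from typing import Callable, List

def plain_split(text: str) -> List[str]:
    # Stand-in for a string-based tokenizer: accepts a bare str.
    return [s.strip() + "." for s in text.split(".") if s.strip()]

def moses_like(lines: List[str]) -> List[str]:
    # Stand-in for a list-based tokenizer such as MosesSentenceSplitter:
    # rejects anything that is not a list of lines.
    if not isinstance(lines, list):
        raise TypeError("expected a list of lines")
    return [sent for line in lines for sent in plain_split(line)]

def split_sentences(tokenize: Callable, full_text: str) -> List[str]:
    # Same shape as the new words_to_sentences logic: try str input first,
    # then fall back to wrapping the text in a list.
    try:
        return tokenize(full_text)
    except Exception:
        try:
            return tokenize([full_text])
        except Exception as e2:
            raise ValueError("Tokenization failed") from e2

text = "Hello world. How are you."
print(split_sentences(plain_split, text))  # str input accepted directly
print(split_sentences(moses_like, text))   # first call raises, list retry succeeds

With the real MosesSentenceSplitter, the second call is the one that succeeds, which is the behavior the commit title refers to.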
|