Dominik Macháček committed on
Commit
2625be1
·
1 Parent(s): 260b1f8

Ukrainian tokenizer support

Browse files
Files changed (2) hide show
  1. whisper_online.py +17 -9
  2. whisper_online_server.py +4 -1
whisper_online.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
4
  import librosa
5
  from functools import lru_cache
6
  import time
7
- from mosestokenizer import MosesTokenizer
8
 
9
 
10
  @lru_cache
@@ -207,14 +207,12 @@ class OnlineASRProcessor:
207
 
208
  SAMPLING_RATE = 16000
209
 
210
- def __init__(self, language, asr):
211
- """language: lang. code that MosesTokenizer uses for sentence segmentation
212
- asr: WhisperASR object
213
- chunk: number of seconds for intended size of audio interval that is inserted and looped
214
  """
215
- self.language = language
216
  self.asr = asr
217
- self.tokenizer = MosesTokenizer(self.language)
218
 
219
  self.init()
220
 
@@ -369,7 +367,7 @@ class OnlineASRProcessor:
369
  self.last_chunked_at = time
370
 
371
  def words_to_sentences(self, words):
372
- """Uses mosestokenizer for sentence segmentation of words.
373
  Returns: [(beg,end,"sentence 1"),...]
374
  """
375
 
@@ -419,6 +417,15 @@ class OnlineASRProcessor:
419
  return (b,e,t)
420
 
421
 
 
 
 
 
 
 
 
 
 
422
 
423
  ## main:
424
 
@@ -482,8 +489,9 @@ if __name__ == "__main__":
482
  print("setting VAD filter",file=sys.stderr)
483
  asr.use_vad()
484
 
 
485
  min_chunk = args.min_chunk_size
486
- online = OnlineASRProcessor(tgt_language,asr)
487
 
488
 
489
  # load the audio into the LRU cache before we start the timer
 
4
  import librosa
5
  from functools import lru_cache
6
  import time
7
+
8
 
9
 
10
  @lru_cache
 
207
 
208
  SAMPLING_RATE = 16000
209
 
210
def __init__(self, asr, tokenizer):
    """Set up the online processor with a backend and a sentence splitter.

    asr: WhisperASR object used for transcription.
    tokenizer: sentence tokenizer for the target language; must provide a
        *split* method compatible with MosesTokenizer's.
    """
    self.asr = asr
    self.tokenizer = tokenizer
    # reset all rolling state (audio buffer, hypothesis buffers, offsets)
    self.init()
218
 
 
367
  self.last_chunked_at = time
368
 
369
  def words_to_sentences(self, words):
370
+ """Uses self.tokenizer for sentence segmentation of words.
371
  Returns: [(beg,end,"sentence 1"),...]
372
  """
373
 
 
417
  return (b,e,t)
418
 
419
 
420
def create_tokenizer(lan):
    """Return a sentence tokenizer for the language code *lan*.

    The returned object exposes ``split(text)`` yielding a list of
    sentence strings. Ukrainian (``"uk"``) is handled by ``tokenize_uk``
    because MosesTokenizer does not support it; every other language
    falls back to MosesTokenizer.

    Imports are deliberately deferred into each branch so that an
    unused tokenizer package remains an optional dependency.
    """
    if lan == "uk":
        import tokenize_uk
        class UkrainianTokenizer:
            # Adapter giving tokenize_uk the MosesTokenizer-style
            # split() interface expected by OnlineASRProcessor.
            def split(self, text):
                return tokenize_uk.tokenize_sents(text)
        return UkrainianTokenizer()
    # Default: Moses sentence splitter for all other language codes.
    from mosestokenizer import MosesTokenizer
    return MosesTokenizer(lan)
429
 
430
  ## main:
431
 
 
489
  print("setting VAD filter",file=sys.stderr)
490
  asr.use_vad()
491
 
492
+
493
  min_chunk = args.min_chunk_size
494
+ online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))
495
 
496
 
497
  # load the audio into the LRU cache before we start the timer
whisper_online_server.py CHANGED
@@ -48,6 +48,9 @@ asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, mode
48
 
49
  if args.task == "translate":
50
  asr.set_translate_task()
 
 
 
51
 
52
  e = time.time()
53
  print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
@@ -58,7 +61,7 @@ if args.vad:
58
 
59
 
60
  min_chunk = args.min_chunk_size
61
- online = OnlineASRProcessor(language,asr)
62
 
63
 
64
 
 
48
 
49
  if args.task == "translate":
50
  asr.set_translate_task()
51
+ tgt_language = "en"
52
+ else:
53
+ tgt_language = language
54
 
55
  e = time.time()
56
  print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
 
61
 
62
 
63
  min_chunk = args.min_chunk_size
64
+ online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))
65
 
66
 
67