Dominik Macháček committed on
Commit
ef08538
·
1 Parent(s): 99aef35

buffer trimming options + recommended default

Browse files
Files changed (1) hide show
  1. whisper_online.py +19 -33
whisper_online.py CHANGED
@@ -212,7 +212,7 @@ class OnlineASRProcessor:
212
 
213
  SAMPLING_RATE = 16000
214
 
215
- def __init__(self, asr, tokenizer, logfile=sys.stderr):
216
  """asr: WhisperASR object
217
  tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
218
  logfile: where to store the log.
@@ -223,6 +223,8 @@ class OnlineASRProcessor:
223
 
224
  self.init()
225
 
 
 
226
  def init(self):
227
  """run this when starting or restarting processing"""
228
  self.audio_buffer = np.array([],dtype=np.float32)
@@ -278,36 +280,18 @@ class OnlineASRProcessor:
278
  print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
279
 
280
  # there is a newly confirmed text
281
- if o:
282
- # we trim all the completed sentences from the audio buffer
283
- self.chunk_completed_sentence()
284
-
285
- # ...segments could be considered
286
- #self.chunk_completed_segment(res)
287
-
288
- #
289
- # self.silence_iters = 0
290
-
291
- # this was an attempt to trim silence/non-linguistic noise detected by the fact that Whisper doesn't transcribe anything for 3-times in a row.
292
- # It seemed not working better, or needs to be debugged.
293
-
294
- # elif self.transcript_buffer.complete():
295
- # self.silence_iters = 0
296
- # elif not self.transcript_buffer.complete():
297
- # # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
298
- # self.silence_iters += 1
299
- # if self.silence_iters >= 3:
300
- # n = self.last_chunked_at
301
- ## self.chunk_completed_sentence()
302
- ## if n == self.last_chunked_at:
303
- # self.chunk_at(self.last_chunked_at+self.chunk)
304
- # print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile)
305
- ## self.silence_iters = 0
306
-
307
-
308
- # if the audio buffer is longer than 30s, trim it...
309
- if len(self.audio_buffer)/self.SAMPLING_RATE > 30:
310
- # ...on the last completed segment (labeled by Whisper)
311
  self.chunk_completed_segment(res)
312
 
313
  # alternative: on any word
@@ -317,7 +301,7 @@ class OnlineASRProcessor:
317
  #while k>0 and self.commited[k][1] > l:
318
  # k -= 1
319
  #t = self.commited[k][1]
320
- print(f"chunking because of len",file=self.logfile)
321
  #self.chunk_at(t)
322
 
323
  print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
@@ -477,6 +461,8 @@ if __name__ == "__main__":
477
  parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
478
  parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
479
  parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
 
 
480
  args = parser.parse_args()
481
 
482
  # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
@@ -521,7 +507,7 @@ if __name__ == "__main__":
521
 
522
 
523
  min_chunk = args.min_chunk_size
524
- online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile)
525
 
526
 
527
  # load the audio into the LRU cache before we start the timer
 
212
 
213
  SAMPLING_RATE = 16000
214
 
215
+ def __init__(self, asr, tokenizer=None, logfile=sys.stderr, buffer_trimming=("segment", 15)):
216
  """asr: WhisperASR object
217
  tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
218
  logfile: where to store the log.
 
223
 
224
  self.init()
225
 
226
+ self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
227
+
228
  def init(self):
229
  """run this when starting or restarting processing"""
230
  self.audio_buffer = np.array([],dtype=np.float32)
 
280
  print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
281
 
282
  # there is a newly confirmed text
283
+
284
+ if o and self.buffer_trimming_way == "sentence": # trim the completed sentences
285
+ if len(self.audio_buffer)/self.SAMPLING_RATE > self.buffer_trimming_sec: # longer than this
286
+ self.chunk_completed_sentence()
287
+
288
+
289
+ if self.buffer_trimming_way == "segment":
290
+ s = self.buffer_trimming_sec # trim the completed segments longer than s,
291
+ else:
292
+ s = 30 # if the audio buffer is longer than 30s, trim it
293
+
294
+ if len(self.audio_buffer)/self.SAMPLING_RATE > s:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  self.chunk_completed_segment(res)
296
 
297
  # alternative: on any word
 
301
  #while k>0 and self.commited[k][1] > l:
302
  # k -= 1
303
  #t = self.commited[k][1]
304
+ print(f"chunking segment",file=self.logfile)
305
  #self.chunk_at(t)
306
 
307
  print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
 
461
  parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
462
  parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
463
  parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
464
+ parser.add_argument('--buffer_trimming', type=str, default="sentence", choices=["sentence", "segment"],help='Buffer trimming strategy')
465
+ parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming lenght threshold in seconds. If buffer length longer, trimming sentence/segment is triggered.')
466
  args = parser.parse_args()
467
 
468
  # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
 
507
 
508
 
509
  min_chunk = args.min_chunk_size
510
+ online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
511
 
512
 
513
  # load the audio into the LRU cache before we start the timer