bluegiraffe-sc committed on
Commit 6e6b619 · 1 Parent(s): c0dd2e2

add option to save log to file
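
The diff below threads a new `output` argument (default `sys.stderr`) through `OnlineASRProcessor` and `HypothesisBuffer` and redirects the debug `print` calls to it, so the log can go to any writable file-like object. A minimal usage sketch, assuming `asr` and `tokenizer` are constructed as in the script's `__main__` block; the log file path is an arbitrary example, not something this commit adds:

from whisper_online import OnlineASRProcessor

# Sketch only: `asr` and `tokenizer` are assumed to be built as in the
# script's __main__ block; "online.log" is a hypothetical example path.
with open("online.log", "w") as logfile:
    online = OnlineASRProcessor(asr, tokenizer, output=logfile)
    # feed audio and call online.process_iter() as usual; the PROMPT/CONTEXT/
    # COMPLETE/INCOMPLETE log lines now go to online.log instead of the terminal

online = OnlineASRProcessor(asr, tokenizer)  # default unchanged: logs to sys.stderr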

Files changed (1)
  1. whisper_online.py +29 -39
whisper_online.py CHANGED
@@ -46,10 +46,6 @@ class ASRBase:
         raise NotImplemented("must be implemented in the child class")
 
 
-## requires imports:
-# import whisper
-# import whisper_timestamped
-
 class WhisperTimestampedASR(ASRBase):
     """Uses whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but slower than faster-whisper.
     On the other hand, the installation for GPU could be easier.
@@ -64,7 +60,7 @@ class WhisperTimestampedASR(ASRBase):
 
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
         if model_dir is not None:
-            print("ignoring model_dir, not implemented",file=sys.stderr)
+            print("ignoring model_dir, not implemented",file=self.output)
         return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
@@ -89,9 +85,6 @@ class WhisperTimestampedASR(ASRBase):
 
 class FasterWhisperASR(ASRBase):
     """Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version.
-
-    Requires imports, if used:
-        import faster_whisper
     """
 
     sep = ""
@@ -101,11 +94,8 @@ class FasterWhisperASR(ASRBase):
         import faster_whisper
 
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
-        #from faster_whisper import WhisperModel
-
-
         if model_dir is not None:
-            print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=sys.stderr)
+            print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.output)
             model_size_or_path = model_dir
         elif modelsize is not None:
             model_size_or_path = modelsize
@@ -153,7 +143,8 @@ class FasterWhisperASR(ASRBase):
 
 class HypothesisBuffer:
 
-    def __init__(self):
+    def __init__(self, output=sys.stderr):
+        """output: where to store the log. Leave it unchanged to print to terminal."""
         self.commited_in_buffer = []
         self.buffer = []
         self.new = []
@@ -161,6 +152,8 @@ class HypothesisBuffer:
         self.last_commited_time = 0
         self.last_commited_word = None
 
+        self.output = output
+
     def insert(self, new, offset):
         # compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
         # the new tail is added to self.new
@@ -179,9 +172,9 @@ class HypothesisBuffer:
             c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1])
             tail = " ".join(self.new[j-1][2] for j in range(1,i+1))
             if c == tail:
-                print("removing last",i,"words:",file=sys.stderr)
+                print("removing last",i,"words:",file=self.output)
                 for j in range(i):
-                    print("\t",self.new.pop(0),file=sys.stderr)
+                    print("\t",self.new.pop(0),file=self.output)
                 break
 
     def flush(self):
@@ -218,12 +211,14 @@ class OnlineASRProcessor:
 
     SAMPLING_RATE = 16000
 
-    def __init__(self, asr, tokenizer):
+    def __init__(self, asr, tokenizer, output=sys.stderr):
         """asr: WhisperASR object
         tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
+        output: where to store the log. Leave it unchanged to print to terminal.
         """
         self.asr = asr
         self.tokenizer = tokenizer
+        self.output = output
 
         self.init()
 
@@ -232,7 +227,7 @@ class OnlineASRProcessor:
         self.audio_buffer = np.array([],dtype=np.float32)
         self.buffer_time_offset = 0
 
-        self.transcript_buffer = HypothesisBuffer()
+        self.transcript_buffer = HypothesisBuffer(output=self.output)
         self.commited = []
         self.last_chunked_at = 0
 
@@ -263,13 +258,13 @@ class OnlineASRProcessor:
     def process_iter(self):
         """Runs on the current audio buffer.
         Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, "").
-        The non-emty text is confirmed (commited) partial transcript.
+        The non-emty text is confirmed (committed) partial transcript.
         """
 
         prompt, non_prompt = self.prompt()
-        print("PROMPT:", prompt, file=sys.stderr)
-        print("CONTEXT:", non_prompt, file=sys.stderr)
-        print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=sys.stderr)
+        print("PROMPT:", prompt, file=self.output)
+        print("CONTEXT:", non_prompt, file=self.output)
+        print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.output)
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
         # transform to [(beg,end,"word1"), ...]
@@ -278,8 +273,8 @@ class OnlineASRProcessor:
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         self.commited.extend(o)
-        print(">>>>COMPLETE NOW:",self.to_flush(o),file=sys.stderr,flush=True)
-        print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=sys.stderr,flush=True)
+        print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.output,flush=True)
+        print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.output,flush=True)
 
         # there is a newly confirmed text
         if o:
@@ -298,14 +293,14 @@ class OnlineASRProcessor:
 #        elif self.transcript_buffer.complete():
 #            self.silence_iters = 0
 #        elif not self.transcript_buffer.complete():
-#            # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=sys.stderr,flush=True)
+#            # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.output,flush=True)
 #            self.silence_iters += 1
 #            if self.silence_iters >= 3:
 #                n = self.last_chunked_at
 ##                self.chunk_completed_sentence()
 ##                if n == self.last_chunked_at:
 #                self.chunk_at(self.last_chunked_at+self.chunk)
-#                print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=sys.stderr)
+#                print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.output)
 ##                self.silence_iters = 0
 
 
@@ -321,18 +316,18 @@ class OnlineASRProcessor:
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1]
-            print(f"chunking because of len",file=sys.stderr)
+            print(f"chunking because of len",file=self.output)
             #self.chunk_at(t)
 
-        print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=sys.stderr)
+        print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.output)
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
         if self.commited == []: return
-        print(self.commited,file=sys.stderr)
+        print(self.commited,file=self.output)
         sents = self.words_to_sentences(self.commited)
         for s in sents:
-            print("\t\tSENT:",s,file=sys.stderr)
+            print("\t\tSENT:",s,file=self.output)
         if len(sents) < 2:
             return
         while len(sents) > 2:
@@ -340,7 +335,7 @@ class OnlineASRProcessor:
 
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
-        print(f"--- sentence chunked at {chunk_at:2.2f}",file=sys.stderr)
+        print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.output)
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -357,12 +352,12 @@ class OnlineASRProcessor:
             ends.pop(-1)
             e = ends[-2]+self.buffer_time_offset
             if e <= t:
-                print(f"--- segment chunked at {e:2.2f}",file=sys.stderr)
+                print(f"--- segment chunked at {e:2.2f}",file=self.output)
                 self.chunk_at(e)
             else:
-                print(f"--- last segment not within commited area",file=sys.stderr)
+                print(f"--- last segment not within commited area",file=self.output)
         else:
-            print(f"--- not enough segments to chunk",file=sys.stderr)
+            print(f"--- not enough segments to chunk",file=self.output)
 
 
 
@@ -408,7 +403,7 @@ class OnlineASRProcessor:
         """
         o = self.transcript_buffer.complete()
         f = self.to_flush(o)
-        print("last, noncommited:",f,file=sys.stderr)
+        print("last, noncommited:",f,file=self.output)
         return f
 
 
@@ -473,15 +468,10 @@ if __name__ == "__main__":
 
     t = time.time()
     print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
-    #asr = WhisperASR(lan=language, modelsize=size)
 
     if args.backend == "faster-whisper":
-        #from faster_whisper import WhisperModel
         asr_cls = FasterWhisperASR
     else:
-        #import whisper
-        #import whisper_timestamped
-        # from whisper_timestamped_model import WhisperTimestampedASR
         asr_cls = WhisperTimestampedASR
 
     asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)