bluegiraffe-sc committed
Commit 18c1434 · Parent(s): f97a253

backend import in child load_model method and expose logfile arg

Files changed (1): whisper_online.py (+29 -38)
whisper_online.py CHANGED
@@ -30,12 +30,8 @@ class ASRBase:
         self.transcribe_kargs = {}
         self.original_language = lan
 
-        self.import_backend()
         self.model = self.load_model(modelsize, cache_dir, model_dir)
 
-    def import_backend(self):
-        raise NotImplemented("must be implemented in the child class")
-
     def load_model(self, modelsize, cache_dir):
         raise NotImplemented("must be implemented in the child class")
 
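With import_backend gone, the base-class contract is smaller: a subclass overrides only load_model and performs its own backend import there. A hedged sketch of what a hypothetical third backend would look like under the new contract (the class and package names are invented for illustration):

    class SomeOtherBackendASR(ASRBase):
        sep = " "

        def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
            # the backend import now lives inside load_model, so importing
            # whisper_online itself no longer pulls in every backend package
            import some_backend  # hypothetical package name
            return some_backend.load(modelsize, cache_dir=cache_dir)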
@@ -52,15 +48,13 @@ class WhisperTimestampedASR(ASRBase):
     """
 
     sep = " "
-
-    def import_backend(self):
-        global whisper, whisper_timestamped
-        import whisper
-        import whisper_timestamped
 
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
+        global whisper_timestamped  # has to be global as it is used at each `transcribe` call
+        import whisper
+        import whisper_timestamped
         if model_dir is not None:
-            print("ignoring model_dir, not implemented",file=self.output)
+            print("ignoring model_dir, not implemented",file=self.logfile)
         return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
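The global statement in the added lines is what makes the deferred import usable later: without it, `import whisper_timestamped` would bind the module name only in load_model's local scope, and the module-level reference made at each transcribe call (as the in-diff comment notes) would raise NameError. The same pattern in isolation (a self-contained illustration; json stands in for the expensive backend package):

    def load():
        global heavy_module          # bind the name at module scope, not function scope
        import json as heavy_module  # stand-in for a costly backend import

    def use():
        # resolves via the module-global name, so this works only after load()
        return heavy_module.dumps({"ok": True})

    load()
    print(use())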
@@ -89,13 +83,10 @@ class FasterWhisperASR(ASRBase):
 
     sep = ""
 
-    def import_backend(self):
-        global faster_whisper
-        import faster_whisper
-
     def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
+        from faster_whisper import WhisperModel
         if model_dir is not None:
-            print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.output)
+            print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.logfile)
             model_size_or_path = model_dir
         elif modelsize is not None:
             model_size_or_path = modelsize
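Both backends now print to self.logfile, which is presumably set by ASRBase.__init__ from a matching argument outside these hunks. Since each backend package is imported lazily inside its own load_model, only the backend a caller actually selects needs to be installed. A hypothetical selection helper (not part of this file) built on that property:

    def make_asr(backend, lan, modelsize):
        # only the chosen class runs its import, so the other backend
        # package may be absent without breaking anything
        if backend == "faster-whisper":
            return FasterWhisperASR(lan, modelsize)
        elif backend == "whisper_timestamped":
            return WhisperTimestampedASR(lan, modelsize)
        raise ValueError(f"unknown backend {backend!r}")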
@@ -143,7 +134,7 @@ class FasterWhisperASR(ASRBase):
 
 class HypothesisBuffer:
 
-    def __init__(self, output=sys.stderr):
+    def __init__(self, logfile=sys.stderr):
         """output: where to store the log. Leave it unchanged to print to terminal."""
         self.commited_in_buffer = []
         self.buffer = []
@@ -152,7 +143,7 @@ class HypothesisBuffer:
         self.last_commited_time = 0
         self.last_commited_word = None
 
-        self.output = output
+        self.logfile = logfile
 
     def insert(self, new, offset):
         # compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
@@ -172,9 +163,9 @@ class HypothesisBuffer:
                         c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1])
                         tail = " ".join(self.new[j-1][2] for j in range(1,i+1))
                         if c == tail:
-                            print("removing last",i,"words:",file=self.output)
+                            print("removing last",i,"words:",file=self.logfile)
                             for j in range(i):
-                                print("\t",self.new.pop(0),file=self.output)
+                                print("\t",self.new.pop(0),file=self.logfile)
                             break
 
     def flush(self):
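The HypothesisBuffer hunks only thread the renamed logfile handle through the existing prints; any writable file-like object works. For example (hypothetical usage):

    import io
    log = io.StringIO()                 # capture the log in memory instead of stderr
    hb = HypothesisBuffer(logfile=log)  # default remains sys.stderr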
@@ -211,14 +202,14 @@ class OnlineASRProcessor:
 
     SAMPLING_RATE = 16000
 
-    def __init__(self, asr, tokenizer, output=sys.stderr):
+    def __init__(self, asr, tokenizer, logfile=sys.stderr):
         """asr: WhisperASR object
         tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
         output: where to store the log. Leave it unchanged to print to terminal.
         """
         self.asr = asr
         self.tokenizer = tokenizer
-        self.output = output
+        self.logfile = logfile
 
         self.init()
 
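Because this renames a public keyword argument, callers that passed the old name must be updated; positional callers are unaffected (note the docstring's `output:` line is left untouched by this commit). A before/after sketch, assuming a caller that used the keyword:

    # before this commit
    online = OnlineASRProcessor(asr, tokenizer, output=open("asr.log", "w"))
    # after this commit
    online = OnlineASRProcessor(asr, tokenizer, logfile=open("asr.log", "w"))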
@@ -227,7 +218,7 @@ class OnlineASRProcessor:
         self.audio_buffer = np.array([],dtype=np.float32)
         self.buffer_time_offset = 0
 
-        self.transcript_buffer = HypothesisBuffer(output=self.output)
+        self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
         self.commited = []
         self.last_chunked_at = 0
 
@@ -262,9 +253,9 @@ class OnlineASRProcessor:
         """
 
         prompt, non_prompt = self.prompt()
-        print("PROMPT:", prompt, file=self.output)
-        print("CONTEXT:", non_prompt, file=self.output)
-        print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.output)
+        print("PROMPT:", prompt, file=self.logfile)
+        print("CONTEXT:", non_prompt, file=self.logfile)
+        print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.logfile)
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
 
         # transform to [(beg,end,"word1"), ...]
@@ -273,8 +264,8 @@ class OnlineASRProcessor:
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         self.commited.extend(o)
-        print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.output,flush=True)
-        print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.output,flush=True)
+        print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.logfile,flush=True)
+        print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
 
         # there is a newly confirmed text
         if o:
@@ -293,14 +284,14 @@ class OnlineASRProcessor:
 #        elif self.transcript_buffer.complete():
 #            self.silence_iters = 0
 #        elif not self.transcript_buffer.complete():
-#        #    print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.output,flush=True)
+#        #    print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
 #            self.silence_iters += 1
 #            if self.silence_iters >= 3:
 #                n = self.last_chunked_at
 ##                self.chunk_completed_sentence()
 ##                if n == self.last_chunked_at:
 #                self.chunk_at(self.last_chunked_at+self.chunk)
-#                print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.output)
+#                print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile)
 ##                self.silence_iters = 0
 
 
@@ -316,18 +307,18 @@ class OnlineASRProcessor:
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1]
-            print(f"chunking because of len",file=self.output)
+            print(f"chunking because of len",file=self.logfile)
             #self.chunk_at(t)
 
-        print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.output)
+        print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
         if self.commited == []: return
-        print(self.commited,file=self.output)
+        print(self.commited,file=self.logfile)
         sents = self.words_to_sentences(self.commited)
         for s in sents:
-            print("\t\tSENT:",s,file=self.output)
+            print("\t\tSENT:",s,file=self.logfile)
         if len(sents) < 2:
             return
         while len(sents) > 2:
@@ -335,7 +326,7 @@ class OnlineASRProcessor:
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-        print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.output)
+        print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.logfile)
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -352,12 +343,12 @@ class OnlineASRProcessor:
                 ends.pop(-1)
                 e = ends[-2]+self.buffer_time_offset
             if e <= t:
-                print(f"--- segment chunked at {e:2.2f}",file=self.output)
+                print(f"--- segment chunked at {e:2.2f}",file=self.logfile)
                 self.chunk_at(e)
             else:
-                print(f"--- last segment not within commited area",file=self.output)
+                print(f"--- last segment not within commited area",file=self.logfile)
         else:
-            print(f"--- not enough segments to chunk",file=self.output)
+            print(f"--- not enough segments to chunk",file=self.logfile)
 
 
 
@@ -403,7 +394,7 @@ class OnlineASRProcessor:
         """
         o = self.transcript_buffer.complete()
         f = self.to_flush(o)
-        print("last, noncommited:",f,file=self.output)
+        print("last, noncommited:",f,file=self.logfile)
         return f
 
 
 
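Taken together, a caller can now redirect all of the processor's logging in one place. A minimal end-to-end sketch, assuming ASRBase.__init__ takes the language and model size as its body above suggests, and with a made-up stand-in for the sentence tokenizer:

    from whisper_online import FasterWhisperASR, OnlineASRProcessor

    class OneSentenceTokenizer:
        # stand-in for a MosesTokenizer-like object with a split() method
        def split(self, text):
            return [text]

    asr = FasterWhisperASR("en", "large-v2")   # triggers the lazy faster_whisper import
    with open("asr.log", "w") as log:
        online = OnlineASRProcessor(asr, OneSentenceTokenizer(), logfile=log)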