SilasKieser committed
Commit 372bf52 · 1 Parent(s): b18cbba

keep a test script in base directory
src/__init__.py ADDED
File without changes
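
The empty `src/__init__.py` marks `src` as a regular Python package, which is what lets the relative imports introduced in the next file resolve. A minimal sketch of the intended usage from the repository root, assuming `src/whisper_streaming/` also carries an `__init__.py` (not shown in this diff); `asr_factory` and `add_shared_args` are names the test script below actually uses:

# With the package markers in place, the modules are addressed through the package:
from src.whisper_streaming.whisper_online import asr_factory, add_shared_args

# Executing src/whisper_streaming/whisper_online.py as a loose script would now fail:
#   ImportError: attempted relative import with no known parent package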
src/whisper_streaming/whisper_online.py CHANGED
@@ -5,23 +5,12 @@ import librosa
 from functools import lru_cache
 import time
 import logging
-from backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
-from online_asr import OnlineASRProcessor, VACOnlineASRProcessor
+from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
+from .online_asr import OnlineASRProcessor, VACOnlineASRProcessor
 
 logger = logging.getLogger(__name__)
 
 
-@lru_cache(10**6)
-def load_audio(fname):
-    a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
-    return a
-
-
-def load_audio_chunk(fname, beg, end):
-    audio = load_audio(fname)
-    beg_s = int(beg * 16000)
-    end_s = int(end * 16000)
-    return audio[beg_s:end_s]
 
 WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
     ","
@@ -244,163 +233,3 @@ def set_logging(args, logger, others=[]):
         logging.getLogger(other).setLevel(args.log_level)
 
 
-    # logging.getLogger("whisper_online_server").setLevel(args.log_level)
-
-
-if __name__ == "__main__":
-
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--audio_path",
-        type=str,
-        default='samples_jfk.wav',
-        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.",
-    )
-    add_shared_args(parser)
-    parser.add_argument(
-        "--start_at",
-        type=float,
-        default=0.0,
-        help="Start processing audio at this time.",
-    )
-    parser.add_argument(
-        "--offline", action="store_true", default=False, help="Offline mode."
-    )
-    parser.add_argument(
-        "--comp_unaware",
-        action="store_true",
-        default=False,
-        help="Computationally unaware simulation.",
-    )
-
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = None # sys.stderr
-
-    if args.offline and args.comp_unaware:
-        logger.error(
-            "No or one option from --offline and --comp_unaware are available, not both. Exiting."
-        )
-        sys.exit(1)
-
-    # if args.log_level:
-    #     logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
-    #                         level=getattr(logging, args.log_level))
-
-    set_logging(args, logger,others=["online_asr"])
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path)) / SAMPLING_RATE
-    logger.info("Audio duration is: %2.2f seconds" % duration)
-
-    asr, online = asr_factory(args, logfile=logfile)
-    if args.vac:
-        min_chunk = args.vac_chunk_size
-    else:
-        min_chunk = args.min_chunk_size
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path, 0, 1)
-
-    # warm up the ASR because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time() - beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        #   - emission time from beginning of processing, in milliseconds
-        #   - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time() - start
-        if o[0] is not None:
-            log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
-
-            logger.debug(
-                log_string
-            )
-
-            if logfile is not None:
-                print(
-                    log_string,
-                    file=logfile,
-                    flush=True,
-                )
-        else:
-            # No text, so no output
-            pass
-
-    if args.offline: ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError as e:
-            logger.error(f"assertion error: {repr(e)}")
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware: # computational unaware mode
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path, beg, end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError as e:
-                logger.error(f"assertion error: {repr(e)}")
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            logger.debug(f"## last processed {end:.2f}s")
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else: # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end + min_chunk:
-                time.sleep(min_chunk + end - now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path, beg, end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError as e:
-                logger.error(f"assertion error: {e}")
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            logger.debug(
-                f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}"
-            )
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)
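
The online branch above paces itself against the wall clock: it sleeps until at least `min_chunk` seconds of unprocessed audio have accumulated past `end`, then feeds everything up to the current moment to the processor, so chunks grow whenever transcription lags behind real time. A toy model of just that pacing logic (a sketch; the function name and numbers are mine, not part of the commit):

import time

def paced_ends(min_chunk, proc_times):
    # Successive `end` timestamps the loop would process, given
    # per-iteration transcription durations (seconds).
    start = time.time()
    end = 0.0
    ends = []
    for dt in proc_times:
        now = time.time() - start
        if now < end + min_chunk:
            time.sleep(min_chunk + end - now)  # wait until a full new chunk exists
        end = time.time() - start              # consume audio up to "now"
        ends.append(round(end, 2))
        time.sleep(dt)                         # stand-in for online.process_iter()
    return ends

print(paced_ends(1.0, [0.1, 1.5, 0.1]))  # ~[1.0, 2.0, 3.5]: the last chunk grows to 1.5 s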
 
whisper_noserver_test.py ADDED
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+import sys
+import numpy as np
+import librosa
+from functools import lru_cache
+import time
+import logging
+
+logger = logging.getLogger(__name__)
+
+from src.whisper_streaming.whisper_online import *
+
+@lru_cache(10**6)
+def load_audio(fname):
+    a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
+    return a
+
+
+def load_audio_chunk(fname, beg, end):
+    audio = load_audio(fname)
+    beg_s = int(beg * 16000)
+    end_s = int(end * 16000)
+    return audio[beg_s:end_s]
+
+if __name__ == "__main__":
+
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--audio_path",
+        type=str,
+        default='samples_jfk.wav',
+        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.",
+    )
+    add_shared_args(parser)
+    parser.add_argument(
+        "--start_at",
+        type=float,
+        default=0.0,
+        help="Start processing audio at this time.",
+    )
+    parser.add_argument(
+        "--offline", action="store_true", default=False, help="Offline mode."
+    )
+    parser.add_argument(
+        "--comp_unaware",
+        action="store_true",
+        default=False,
+        help="Computationally unaware simulation.",
+    )
+
+    args = parser.parse_args()
+
+    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
+    logfile = None # sys.stderr
+
+    if args.offline and args.comp_unaware:
+        logger.error(
+            "No or one option from --offline and --comp_unaware are available, not both. Exiting."
+        )
+        sys.exit(1)
+
+    # if args.log_level:
+    #     logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
+    #                         level=getattr(logging, args.log_level))
+
+    set_logging(args, logger,others=["src.whisper_streaming.online_asr"])
+
+    audio_path = args.audio_path
+
+    SAMPLING_RATE = 16000
+    duration = len(load_audio(audio_path)) / SAMPLING_RATE
+    logger.info("Audio duration is: %2.2f seconds" % duration)
+
+    asr, online = asr_factory(args, logfile=logfile)
+    if args.vac:
+        min_chunk = args.vac_chunk_size
+    else:
+        min_chunk = args.min_chunk_size
+
+    # load the audio into the LRU cache before we start the timer
+    a = load_audio_chunk(audio_path, 0, 1)
+
+    # warm up the ASR because the very first transcribe takes much more time than the other
+    asr.transcribe(a)
+
+    beg = args.start_at
+    start = time.time() - beg
+
+    def output_transcript(o, now=None):
+        # output format in stdout is like:
+        # 4186.3606 0 1720 Takhle to je
+        # - the first three words are:
+        #   - emission time from beginning of processing, in milliseconds
+        #   - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
+        # - the next words: segment transcript
+        if now is None:
+            now = time.time() - start
+        if o[0] is not None:
+            log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
+
+            logger.debug(
+                log_string
+            )
+
+            if logfile is not None:
+                print(
+                    log_string,
+                    file=logfile,
+                    flush=True,
+                )
+        else:
+            # No text, so no output
+            pass
+
+    if args.offline: ## offline mode processing (for testing/debugging)
+        a = load_audio(audio_path)
+        online.insert_audio_chunk(a)
+        try:
+            o = online.process_iter()
+        except AssertionError as e:
+            logger.error(f"assertion error: {repr(e)}")
+        else:
+            output_transcript(o)
+        now = None
+    elif args.comp_unaware: # computational unaware mode
+        end = beg + min_chunk
+        while True:
+            a = load_audio_chunk(audio_path, beg, end)
+            online.insert_audio_chunk(a)
+            try:
+                o = online.process_iter()
+            except AssertionError as e:
+                logger.error(f"assertion error: {repr(e)}")
+                pass
+            else:
+                output_transcript(o, now=end)
+
+            logger.debug(f"## last processed {end:.2f}s")
+
+            if end >= duration:
+                break
+
+            beg = end
+
+            if end + min_chunk > duration:
+                end = duration
+            else:
+                end += min_chunk
+        now = duration
+
+    else: # online = simultaneous mode
+        end = 0
+        while True:
+            now = time.time() - start
+            if now < end + min_chunk:
+                time.sleep(min_chunk + end - now)
+            end = time.time() - start
+            a = load_audio_chunk(audio_path, beg, end)
+            beg = end
+            online.insert_audio_chunk(a)
+
+            try:
+                o = online.process_iter()
+            except AssertionError as e:
+                logger.error(f"assertion error: {e}")
+                pass
+            else:
+                output_transcript(o)
+            now = time.time() - start
+            logger.debug(
+                f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}"
+            )
+
+            if end >= duration:
+                break
+        now = None
+
+    o = online.finish()
+    output_transcript(o, now=now)
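
For reference, the `log_string` assembled in `output_transcript()` produces lines of the following shape. A self-contained reconstruction with illustrative values (the 4186 ms emission time and the sample text come from the comment inside the script; the timings here are simulated):

import time

o = (0.0, 1.720, "Takhle to je")   # (beg_s, end_s, text) as returned by process_iter()
start = time.time() - 4.186        # pretend processing began 4.186 s ago
now = time.time() - start
print(f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}")
# -> 4186, 0-1720 (+2s): Takhle to je
# i.e. emitted ~4186 ms into processing, segment placed at 0-1720 ms by Whisper,
# trailing the segment's end by roughly 2.5 s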