oplatek committed
Commit 13fd21a · 1 Parent(s): 8849581

polishing code and note about installing deps for VAD

README.md CHANGED
@@ -33,6 +33,8 @@ Please, cite us. [Bibtex citation](http://www.afnlp.org/conferences/ijcnlp2023/p
 
 1) ``pip install librosa`` -- audio processing library
 
+Note: the VAD additionally requires `pip install torch torchaudio`.
+
 2) Whisper backend.
 
 Two alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
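The added README note only names torch and torchaudio; below is a minimal sketch of the kind of torch-based VAD those packages enable, assuming the Silero VAD model loaded via torch.hub (the model choice and helper names are an assumption, not stated in this commit).

```python
# Sketch only: assumes the VAD is Silero VAD pulled from torch.hub, which is
# why `pip install torch torchaudio` is needed. Not taken from this commit.
import torch

model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
get_speech_timestamps, _, read_audio, *_ = utils

wav = read_audio("cs-maji-2.16k.wav", sampling_rate=16000)   # demo file used by the server script
speech = get_speech_timestamps(wav, model, sampling_rate=16000)
print(speech)   # [{'start': ..., 'end': ...}, ...] in samples
```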
whisper_online.py CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import librosa
+import librosa
 from functools import lru_cache
 import time
 import datetime
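The hunk above only touches the `import librosa` line; for context, here is a hedged sketch of how librosa is commonly used to load audio for Whisper backends (16 kHz mono float32). The helper name and file path are illustrative, not from this commit.

```python
import librosa
import numpy as np

def load_audio_16k(path: str) -> np.ndarray:
    # Whisper-style backends expect 16 kHz mono float32 samples.
    audio, _sr = librosa.load(path, sr=16000, dtype=np.float32)
    return audio

# e.g. audio = load_audio_16k("cs-maji-2.16k.wav")
```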
whisper_online_server.py CHANGED
@@ -30,11 +30,12 @@ print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",
 if args.backend == "faster-whisper":
     from faster_whisper import WhisperModel
     asr_cls = FasterWhisperASR
-else:
+elif args.backend == "whisper_timestamped":
     import whisper
-    import whisper_timestamped
-    # from whisper_timestamped_model import WhisperTimestampedASR
+    from whisper_online import WhisperTimestampedASR
     asr_cls = WhisperTimestampedASR
+else:
+    raise ValueError(f"Unknown {args.backend=}")
 
 asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
 
@@ -44,25 +45,23 @@ if args.task == "translate":
 else:
     tgt_language = language
 
-e = time.time()
-print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
+print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)
 
 if args.vad:
     print("setting VAD filter",file=sys.stderr)
     asr.use_vad()
 
 
-min_chunk = args.min_chunk_size
-
 if args.buffer_trimming == "sentence":
     tokenizer = create_tokenizer(tgt_language)
 else:
     tokenizer = None
 if not args.vac:
+    from whisper_online import OnlineASRProcessor
     online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 else:
-    from whisper_online_vac import *
-    online = VACOnlineASRProcessor(min_chunk, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
+    from whisper_online_vac import VACOnlineASRProcessor
+    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
 demo_audio_path = "cs-maji-2.16k.wav"
@@ -219,7 +218,7 @@ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
     conn, addr = s.accept()
     logging.info('INFO: Connected to client on {}'.format(addr))
     connection = Connection(conn)
-    proc = ServerProcessor(connection, online, min_chunk)
+    proc = ServerProcessor(connection, online, args.min_chunk_size)
     proc.process()
     conn.close()
     logging.info('INFO: Connection to client closed')
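The hunks above read several attributes from `args` (backend, task, vad, vac, min_chunk_size, buffer_trimming, buffer_trimming_sec, model_cache_dir, model_dir). A hedged sketch of an argparse setup that would provide them follows; the exact flag spellings, choices, and defaults are assumptions, only the attribute names come from the diff.

```python
import argparse

parser = argparse.ArgumentParser()
# attribute names match those used in whisper_online_server.py; flags and defaults are illustrative
parser.add_argument("--backend", choices=["faster-whisper", "whisper_timestamped"], default="faster-whisper")
parser.add_argument("--task", choices=["transcribe", "translate"], default="transcribe")
parser.add_argument("--vad", action="store_true", help="enable the VAD filter on the ASR backend")
parser.add_argument("--vac", action="store_true", help="use VACOnlineASRProcessor instead of OnlineASRProcessor")
parser.add_argument("--min-chunk-size", type=float, default=1.0)   # -> args.min_chunk_size
parser.add_argument("--buffer-trimming", choices=["sentence", "segment"], default="segment")
parser.add_argument("--buffer-trimming-sec", type=float, default=15.0)
parser.add_argument("--model-cache-dir", default=None)
parser.add_argument("--model-dir", default=None)
args = parser.parse_args()
```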
whisper_online_vac.py CHANGED
@@ -165,9 +165,9 @@ if __name__ == "__main__":
 
     if end >= duration:
         break
-
+
     beg = end
-
+
     if end + min_chunk > duration:
         end = duration
     else:
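The whitespace-only hunk above sits in the chunked simulation loop of `whisper_online_vac.py`'s `__main__` section. Below is a hedged reconstruction of how such a loop typically advances `beg`/`end` by `min_chunk` seconds; only those variable names come from the diff, the processor calls are illustrative.

```python
beg = 0.0
end = min_chunk
while True:
    # feed the next [beg, end) window of 16 kHz samples to the online processor
    online.insert_audio_chunk(audio[int(beg * 16000):int(end * 16000)])
    print(online.process_iter())

    if end >= duration:
        break
    beg = end
    if end + min_chunk > duration:
        end = duration
    else:
        end += min_chunk
```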