polishing code and note about installing deps for VAD
- README.md +2 -0
- whisper_online.py +1 -1
- whisper_online_server.py +9 -10
- whisper_online_vac.py +2 -2
README.md
CHANGED

@@ -33,6 +33,8 @@ Please, cite us. [Bibtex citation](http://www.afnlp.org/conferences/ijcnlp2023/p
 
 1) ``pip install librosa`` -- audio processing library
 
+Note: for the VAD I need to `pip install torch torchaudio`.
+
 2) Whisper backend.
 
 Two alternative backends are integrated. The most recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.
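The added README note is the substance of this change: the voice-activity detection path pulls in Torch. As a rough illustration of why, here is a minimal sketch assuming the VAD in question is Silero VAD loaded through `torch.hub`; the `snakers4/silero-vad` repo name and the audio file name are assumptions for illustration, not taken from this commit.

# Sketch only: assumes the VAD is Silero VAD fetched via torch.hub,
# which is what makes `pip install torch torchaudio` necessary
# (read_audio uses torchaudio under the hood).
import torch

model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
get_speech_timestamps, _, read_audio, _, _ = utils

wav = read_audio("cs-maji-2.16k.wav", sampling_rate=16000)        # any 16 kHz mono file
speech = get_speech_timestamps(wav, model, sampling_rate=16000)   # [{'start': ..., 'end': ...}, ...] in samples
print(speech[:3])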
whisper_online.py
CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import librosa
+import librosa
 from functools import lru_cache
 import time
 import datetime
whisper_online_server.py
CHANGED

@@ -30,11 +30,12 @@ print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",
 if args.backend == "faster-whisper":
     from faster_whisper import WhisperModel
     asr_cls = FasterWhisperASR
-else:
+elif args.backend == "whisper_timestamped":
     import whisper
-    import whisper_timestamped
-    # from whisper_timestamped_model import WhisperTimestampedASR
+    from whisper_online import WhisperTimestampedASR
     asr_cls = WhisperTimestampedASR
+else:
+    raise ValueError(f"Unknown {args.backend=}")
 
 asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
 

@@ -44,25 +45,23 @@ if args.task == "translate":
 else:
     tgt_language = language
 
-e = time.time()
-print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
+print(f"done. It took {round(time.time()-t,2)} seconds.",file=sys.stderr)
 
 if args.vad:
     print("setting VAD filter",file=sys.stderr)
     asr.use_vad()
 
 
-min_chunk = args.min_chunk_size
-
 if args.buffer_trimming == "sentence":
     tokenizer = create_tokenizer(tgt_language)
 else:
     tokenizer = None
 if not args.vac:
+    from whisper_online import OnlineASRProcessor
     online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 else:
-    from whisper_online_vac import
-    online = VACOnlineASRProcessor(
+    from whisper_online_vac import VACOnlineASRProcessor
+    online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
 demo_audio_path = "cs-maji-2.16k.wav"

@@ -219,7 +218,7 @@ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         conn, addr = s.accept()
         logging.info('INFO: Connected to client on {}'.format(addr))
         connection = Connection(conn)
-        proc = ServerProcessor(connection, online, min_chunk)
+        proc = ServerProcessor(connection, online, args.min_chunk_size)
         proc.process()
         conn.close()
         logging.info('INFO: Connection to client closed')
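The server hunks above only change which processor is constructed (plain OnlineASRProcessor, or VACOnlineASRProcessor when --vac is set) and thread args.min_chunk_size through to ServerProcessor; the streaming interface stays the same either way. For orientation, here is a minimal sketch of driving that interface directly, following the usage pattern whisper_online.py documents; the silent chunk_source generator and the language/model-size choices are placeholders, not part of this commit.

# Sketch of the shared online-processing interface.
import numpy as np
from whisper_online import FasterWhisperASR, OnlineASRProcessor

def chunk_source():
    """Stand-in audio source: three seconds of silence in 1 s chunks at 16 kHz."""
    for _ in range(3):
        yield np.zeros(16000, dtype=np.float32)

asr = FasterWhisperASR("en", "large-v2")   # source language and model size are placeholders
online = OnlineASRProcessor(asr)

for chunk in chunk_source():
    online.insert_audio_chunk(chunk)       # append newly received audio to the internal buffer
    print(online.process_iter())           # (beg_ts, end_ts, text) for newly committed text

print(online.finish())                     # flush whatever is still buffered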
whisper_online_vac.py
CHANGED

@@ -165,9 +165,9 @@ if __name__ == "__main__":
 
         if end >= duration:
             break
-
+
         beg = end
-
+
         if end + min_chunk > duration:
             end = duration
         else:
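The whisper_online_vac.py hunk is whitespace-only, but it sits inside the file-simulation loop that advances `beg`/`end` by at least `min_chunk` seconds per iteration. Below is a self-contained sketch of that chunking pattern; the `simulate` wrapper and the print-only processing step are illustrative stand-ins for the script's actual transcription call, and the `end += min_chunk` branch is assumed from context.

# Walk through a recording of `duration` seconds in steps of at least
# `min_chunk` seconds; process(beg, end) stands in for feeding audio[beg:end]
# to the online processor.
def simulate(duration: float, min_chunk: float, process) -> None:
    beg = 0.0
    end = min_chunk
    while True:
        process(beg, end)

        if end >= duration:
            break

        beg = end

        if end + min_chunk > duration:
            end = duration       # last, shorter-than-min_chunk piece
        else:
            end += min_chunk

simulate(3.5, 1.0, lambda b, e: print(f"chunk {b:.1f}-{e:.1f} s"))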