Dominik Macháček committed
Commit 1f2352f · 1 Parent(s): bfbe83d

README typo and one more simulation option is not shared

Files changed:
- README.md (+6, -9)
- whisper_online.py (+1, -2)
README.md CHANGED

@@ -68,9 +68,8 @@ In case of installation issues of opus-fast-mosestokenizer, especially on Window
 ### Real-time simulation from audio file
 
 ```
-usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}] [--model_cache_dir MODEL_CACHE_DIR]
-                         [--
-                         [--buffer_trimming {sentence,segment}] [--buffer_trimming_sec BUFFER_TRIMMING_SEC] [--offline] [--comp_unaware]
+usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}] [--model_cache_dir MODEL_CACHE_DIR] [--model_dir MODEL_DIR] [--lan LAN] [--task {transcribe,translate}]
+                         [--backend {faster-whisper,whisper_timestamped}] [--vad] [--buffer_trimming {sentence,segment}] [--buffer_trimming_sec BUFFER_TRIMMING_SEC] [--start_at START_AT] [--offline] [--comp_unaware]
                          audio_path
 
 positional arguments:
@@ -79,8 +78,7 @@ positional arguments:
 options:
   -h, --help            show this help message and exit
   --min-chunk-size MIN_CHUNK_SIZE
-                        Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was
-                        received by this time.
+                        Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.
   --model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}
                         Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.
   --model_cache_dir MODEL_CACHE_DIR
@@ -91,15 +89,14 @@ options:
                         Language code for transcription, e.g. en,de,cs.
   --task {transcribe,translate}
                         Transcribe or translate.
-  --start_at START_AT   Start processing audio at this time.
   --backend {faster-whisper,whisper_timestamped}
                         Load only this backend for Whisper processing.
   --vad                 Use VAD = voice activity detection, with the default parameters.
   --buffer_trimming {sentence,segment}
-                        Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter
-                        must be installed for "sentence" option.
+                        Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.
   --buffer_trimming_sec BUFFER_TRIMMING_SEC
                         Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.
+  --start_at START_AT   Start processing audio at this time.
   --offline             Offline mode.
   --comp_unaware        Computationally unaware simulation.
 ```
@@ -120,7 +117,7 @@ Simulation modes:
 
 - `--start_at START_AT`: Start processing audio at this time. The first update receives the whole audio by `START_AT`. It is useful for debugging, e.g. when we observe a bug in a specific time in audio file, and want to reproduce it quickly, without long waiting.
 
-- `--
+- `--offline` option: It processes the whole audio file at once, in offline mode. We implement it to find out the lowest possible WER on given audio file.
 
 
 
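As a usage illustration of the options documented above: the snippet below drives the file-based simulation once with shared options only, and once with the simulation-only options (`--start_at`, `--comp_unaware`). It is a minimal sketch, not part of this commit; the audio path `samples/sample.wav` is a hypothetical placeholder and the flag values are example values only.

```python
import subprocess
import sys

# Hypothetical 16 kHz mono WAV used as the simulated live stream.
AUDIO = "samples/sample.wav"

# Real-time simulation using only the options shared via add_shared_args().
subprocess.run(
    [sys.executable, "whisper_online.py", AUDIO,
     "--model", "large-v2", "--lan", "en", "--task", "transcribe",
     "--backend", "faster-whisper", "--vad", "--buffer_trimming", "segment"],
    check=True,
)

# Simulation-only options: start 120 s into the file and run the
# computationally unaware simulation.
subprocess.run(
    [sys.executable, "whisper_online.py", AUDIO,
     "--start_at", "120.0", "--comp_unaware"],
    check=True,
)
```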
whisper_online.py CHANGED

@@ -453,7 +453,6 @@ def add_shared_args(parser):
     parser.add_argument('--model_dir', type=str, default=None, help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.")
     parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
     parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"],help="Transcribe or translate.")
-    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
     parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"],help='Load only this backend for Whisper processing.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
     parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"],help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.')
@@ -467,9 +466,9 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
     add_shared_args(parser)
+    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
     parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
     parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
-
 
     args = parser.parse_args()
 
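The pattern behind this change: options that every entry point understands stay in `add_shared_args`, while options that only make sense when simulating from a file (`--start_at`, together with the already local `--offline` and `--comp_unaware`) are registered by the `__main__` block of `whisper_online.py`. A self-contained sketch of that split follows; the argument definitions are taken from this diff, but the abridged body of `add_shared_args` and the final `print` are illustrative stand-ins.

```python
import argparse

def add_shared_args(parser):
    """Options shared by every entry point (abridged; see the full function in whisper_online.py)."""
    parser.add_argument('--lan', '--language', type=str, default='en',
                        help="Language code for transcription, e.g. en,de,cs.")
    parser.add_argument('--task', type=str, default='transcribe',
                        choices=["transcribe", "translate"], help="Transcribe or translate.")
    parser.add_argument('--vad', action="store_true", default=False,
                        help='Use VAD = voice activity detection, with the default parameters.')
    # --start_at is deliberately NOT added here anymore: it only applies to the
    # file-based simulation, so sharing it with other entry points was misleading.

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('audio_path', type=str,
                        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
    add_shared_args(parser)
    # Simulation-only options live next to the simulation code:
    parser.add_argument('--start_at', type=float, default=0.0,
                        help='Start processing audio at this time.')
    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
    parser.add_argument('--comp_unaware', action="store_true", default=False,
                        help='Computationally unaware simulation.')
    args = parser.parse_args()
    print(args)  # placeholder: the real script runs the streaming simulation here
```

Keeping simulation-only flags out of the shared helper means other consumers of `add_shared_args` (e.g. a server entry point) no longer advertise an option that has no effect for them.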