danhtran2mind SWivid committed on
Commit
4e95a39
·
1 Parent(s): c879353

Update WAV File Naming and Dependencies 📝🔊 (#1091)

Browse files

* Update infer_cli.py

* Update pyproject.toml

* formalized

---------

Co-authored-by: SWivid <[email protected]>

Files changed (2) hide show
  1. pyproject.toml +1 -0
  2. src/f5_tts/infer/infer_cli.py +14 -0
pyproject.toml CHANGED
@@ -38,6 +38,7 @@ dependencies = [
38
  "tqdm>=4.65.0",
39
  "transformers",
40
  "transformers_stream_generator",
 
41
  "vocos",
42
  "wandb",
43
  "x_transformers>=1.31.14",
 
38
  "tqdm>=4.65.0",
39
  "transformers",
40
  "transformers_stream_generator",
41
+ "unidecode",
42
  "vocos",
43
  "wandb",
44
  "x_transformers>=1.31.14",
src/f5_tts/infer/infer_cli.py CHANGED
@@ -12,6 +12,7 @@ import tomli
12
  from cached_path import cached_path
13
  from hydra.utils import get_class
14
  from omegaconf import OmegaConf
 
15
 
16
  from f5_tts.infer.utils_infer import (
17
  cfg_strength,
@@ -112,6 +113,11 @@ parser.add_argument(
112
  action="store_true",
113
  help="To save each audio chunks during inference",
114
  )
 
 
 
 
 
115
  parser.add_argument(
116
  "--remove_silence",
117
  action="store_true",
@@ -197,6 +203,12 @@ output_file = args.output_file or config.get(
197
  )
198
 
199
  save_chunk = args.save_chunk or config.get("save_chunk", False)
 
 
 
 
 
 
200
  remove_silence = args.remove_silence or config.get("remove_silence", False)
201
  load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
202
 
@@ -344,6 +356,8 @@ def main():
344
  if save_chunk:
345
  if len(gen_text_) > 200:
346
  gen_text_ = gen_text_[:200] + " ... "
 
 
347
  sf.write(
348
  os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
349
  audio_segment,
 
12
  from cached_path import cached_path
13
  from hydra.utils import get_class
14
  from omegaconf import OmegaConf
15
+ from unidecode import unidecode
16
 
17
  from f5_tts.infer.utils_infer import (
18
  cfg_strength,
 
113
  action="store_true",
114
  help="To save each audio chunks during inference",
115
  )
116
+ parser.add_argument(
117
+ "--no_legacy_text",
118
+ action="store_false",
119
+ help="Not to use lossy ASCII transliterations of unicode text in saved file names.",
120
+ )
121
  parser.add_argument(
122
  "--remove_silence",
123
  action="store_true",
 
203
  )
204
 
205
  save_chunk = args.save_chunk or config.get("save_chunk", False)
206
+ use_legacy_text = args.no_legacy_text or config.get("no_legacy_text", False) # no_legacy_text is a store_false arg
207
+ if save_chunk and use_legacy_text:
208
+ print(
209
+ "\nWarning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.\n"
210
+ )
211
+
212
  remove_silence = args.remove_silence or config.get("remove_silence", False)
213
  load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
214
 
 
356
  if save_chunk:
357
  if len(gen_text_) > 200:
358
  gen_text_ = gen_text_[:200] + " ... "
359
+ if use_legacy_text:
360
+ gen_text_ = unidecode(gen_text_)
361
  sf.write(
362
  os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
363
  audio_segment,