fix #524 #529
Browse files
- src/f5_tts/infer/SHARED.md +17 -0
- src/f5_tts/infer/infer_cli.py +33 -1
src/f5_tts/infer/SHARED.md
CHANGED
@@ -26,6 +26,8 @@
|
|
26 |
- [F5-TTS Italian @ finetune @ it](#f5-tts-italian--finetune--it)
|
27 |
- [Japanese](#japanese)
|
28 |
- [F5-TTS Japanese @ pretrain/finetune @ ja](#f5-tts-japanese--pretrainfinetune--ja)
|
|
|
|
|
29 |
- [Mandarin](#mandarin)
|
30 |
- [Spanish](#spanish)
|
31 |
- [F5-TTS Spanish @ pretrain/finetune @ es](#f5-tts-spanish--pretrainfinetune--es)
|
@@ -108,6 +110,21 @@ MODEL_CKPT: hf://Jmica/F5TTS/JA_8500000/model_8499660.pt
|
|
108 |
VOCAB_FILE: hf://Jmica/F5TTS/JA_8500000/vocab_updated.txt
|
109 |
```
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
## Mandarin
|
113 |
|
|
|
26 |
- [F5-TTS Italian @ finetune @ it](#f5-tts-italian--finetune--it)
|
27 |
- [Japanese](#japanese)
|
28 |
- [F5-TTS Japanese @ pretrain/finetune @ ja](#f5-tts-japanese--pretrainfinetune--ja)
|
29 |
+
- [Hindi](#hindi)
|
30 |
+
- [F5-TTS Small @ pretrain @ hi](#f5-tts-small--pretrain--hi)
|
31 |
- [Mandarin](#mandarin)
|
32 |
- [Spanish](#spanish)
|
33 |
- [F5-TTS Spanish @ pretrain/finetune @ es](#f5-tts-spanish--pretrainfinetune--es)
|
|
|
110 |
VOCAB_FILE: hf://Jmica/F5TTS/JA_8500000/vocab_updated.txt
|
111 |
```
|
112 |
|
113 |
+
## Hindi
|
114 |
+
|
115 |
+
#### F5-TTS Small @ pretrain @ hi
|
116 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
117 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
118 |
+
|F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
|
119 |
+
|
120 |
+
```bash
|
121 |
+
MODEL_CKPT: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
|
122 |
+
VOCAB_FILE: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
|
123 |
+
```
|
124 |
+
|
125 |
+
Authors: SPRING Lab, Indian Institute of Technology, Madras
|
126 |
+
<br>
|
127 |
+
Website: https://asr.iitm.ac.in/
|
128 |
|
129 |
## Mandarin
|
130 |
|
src/f5_tts/infer/infer_cli.py
CHANGED
@@ -71,6 +71,11 @@ parser.add_argument(
|
|
71 |
type=str,
|
72 |
help="Filename of output file..",
|
73 |
)
|
|
|
|
|
|
|
|
|
|
|
74 |
parser.add_argument(
|
75 |
"--remove_silence",
|
76 |
help="Remove silence.",
|
@@ -87,6 +92,12 @@ parser.add_argument(
|
|
87 |
default=1.0,
|
88 |
help="Adjust the speed of the audio generation (default: 1.0)",
|
89 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
args = parser.parse_args()
|
91 |
|
92 |
config = tomli.load(open(args.config, "rb"))
|
@@ -95,6 +106,7 @@ ref_audio = args.ref_audio if args.ref_audio else config["ref_audio"]
|
|
95 |
ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
|
96 |
gen_text = args.gen_text if args.gen_text else config["gen_text"]
|
97 |
gen_file = args.gen_file if args.gen_file else config["gen_file"]
|
|
|
98 |
|
99 |
# patches for pip pkg user
|
100 |
if "infer/examples/" in ref_audio:
|
@@ -116,6 +128,7 @@ ckpt_file = args.ckpt_file if args.ckpt_file else ""
|
|
116 |
vocab_file = args.vocab_file if args.vocab_file else ""
|
117 |
remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
|
118 |
speed = args.speed
|
|
|
119 |
|
120 |
wave_path = Path(output_dir) / output_file
|
121 |
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
|
@@ -200,7 +213,14 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
|
|
200 |
ref_text = voices[voice]["ref_text"]
|
201 |
print(f"Voice: {voice}")
|
202 |
audio, final_sample_rate, spectragram = infer_process(
|
203 |
-
ref_audio,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
)
|
205 |
generated_audio_segments.append(audio)
|
206 |
|
@@ -216,6 +236,18 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
|
|
216 |
if remove_silence:
|
217 |
remove_silence_for_generated_wav(f.name)
|
218 |
print(f.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
|
221 |
def main():
|
|
|
71 |
type=str,
|
72 |
help="Filename of output file..",
|
73 |
)
|
74 |
+
parser.add_argument(
|
75 |
+
"--save_chunk",
|
76 |
+
action="store_true",
|
77 |
+
help="Save chunk audio if your text is too long.",
|
78 |
+
)
|
79 |
parser.add_argument(
|
80 |
"--remove_silence",
|
81 |
help="Remove silence.",
|
|
|
92 |
default=1.0,
|
93 |
help="Adjust the speed of the audio generation (default: 1.0)",
|
94 |
)
|
95 |
+
parser.add_argument(
|
96 |
+
"--nfe_step",
|
97 |
+
type=int,
|
98 |
+
default=32,
|
99 |
+
help="Set the number of denoising steps (default: 32)",
|
100 |
+
)
|
101 |
args = parser.parse_args()
|
102 |
|
103 |
config = tomli.load(open(args.config, "rb"))
|
|
|
106 |
ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
|
107 |
gen_text = args.gen_text if args.gen_text else config["gen_text"]
|
108 |
gen_file = args.gen_file if args.gen_file else config["gen_file"]
|
109 |
+
save_chunk = args.save_chunk if args.save_chunk else False
|
110 |
|
111 |
# patches for pip pkg user
|
112 |
if "infer/examples/" in ref_audio:
|
|
|
128 |
vocab_file = args.vocab_file if args.vocab_file else ""
|
129 |
remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
|
130 |
speed = args.speed
|
131 |
+
nfe_step = args.nfe_step
|
132 |
|
133 |
wave_path = Path(output_dir) / output_file
|
134 |
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
|
|
|
213 |
ref_text = voices[voice]["ref_text"]
|
214 |
print(f"Voice: {voice}")
|
215 |
audio, final_sample_rate, spectragram = infer_process(
|
216 |
+
ref_audio,
|
217 |
+
ref_text,
|
218 |
+
gen_text,
|
219 |
+
model_obj,
|
220 |
+
vocoder,
|
221 |
+
mel_spec_type=mel_spec_type,
|
222 |
+
speed=speed,
|
223 |
+
nfe_step=nfe_step,
|
224 |
)
|
225 |
generated_audio_segments.append(audio)
|
226 |
|
|
|
236 |
if remove_silence:
|
237 |
remove_silence_for_generated_wav(f.name)
|
238 |
print(f.name)
|
239 |
+
# Ensure the gen_text chunk directory exists
|
240 |
+
|
241 |
+
if save_chunk:
|
242 |
+
gen_text_chunk_dir = os.path.join(output_dir, "chunks")
|
243 |
+
if not os.path.exists(gen_text_chunk_dir): # create the directory if it does not exist yet
|
244 |
+
os.makedirs(gen_text_chunk_dir)
|
245 |
+
|
246 |
+
# Save individual chunks as separate files
|
247 |
+
for idx, segment in enumerate(generated_audio_segments):
|
248 |
+
gen_text_chunk_path = os.path.join(output_dir, gen_text_chunk_dir, f"chunk_{idx}.wav")
|
249 |
+
sf.write(gen_text_chunk_path, segment, final_sample_rate)
|
250 |
+
print(f"Saved gen_text chunk {idx} at {gen_text_chunk_path}")
|
251 |
|
252 |
|
253 |
def main():
|