zkniu commited on
Commit
aee0200
·
1 Parent(s): 22c95cd

fix #524 #529

Browse files
src/f5_tts/infer/SHARED.md CHANGED
@@ -26,6 +26,8 @@
26
  - [F5-TTS Italian @ finetune @ it](#f5-tts-italian--finetune--it)
27
  - [Japanese](#japanese)
28
  - [F5-TTS Japanese @ pretrain/finetune @ ja](#f5-tts-japanese--pretrainfinetune--ja)
 
 
29
  - [Mandarin](#mandarin)
30
  - [Spanish](#spanish)
31
  - [F5-TTS Spanish @ pretrain/finetune @ es](#f5-tts-spanish--pretrainfinetune--es)
@@ -108,6 +110,21 @@ MODEL_CKPT: hf://Jmica/F5TTS/JA_8500000/model_8499660.pt
108
  VOCAB_FILE: hf://Jmica/F5TTS/JA_8500000/vocab_updated.txt
109
  ```
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  ## Mandarin
113
 
 
26
  - [F5-TTS Italian @ finetune @ it](#f5-tts-italian--finetune--it)
27
  - [Japanese](#japanese)
28
  - [F5-TTS Japanese @ pretrain/finetune @ ja](#f5-tts-japanese--pretrainfinetune--ja)
29
+ - [Hindi](#hindi)
30
+ - [F5-TTS Small @ pretrain @ hi](#f5-tts-small--pretrain--hi)
31
  - [Mandarin](#mandarin)
32
  - [Spanish](#spanish)
33
  - [F5-TTS Spanish @ pretrain/finetune @ es](#f5-tts-spanish--pretrainfinetune--es)
 
110
  VOCAB_FILE: hf://Jmica/F5TTS/JA_8500000/vocab_updated.txt
111
  ```
112
 
113
+ ## Hindi
114
+
115
+ #### F5-TTS Small @ pretrain @ hi
116
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
117
+ |:---:|:------------:|:-----------:|:-------------:|
118
+ |F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
119
+
120
+ ```bash
121
+ MODEL_CKPT: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
122
+ VOCAB_FILE: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
123
+ ```
124
+
125
+ Authors: SPRING Lab, Indian Institute of Technology, Madras
126
+ <br>
127
+ Website: https://asr.iitm.ac.in/
128
 
129
  ## Mandarin
130
 
src/f5_tts/infer/infer_cli.py CHANGED
@@ -71,6 +71,11 @@ parser.add_argument(
71
  type=str,
72
  help="Filename of output file.",
73
  )
 
 
 
 
 
74
  parser.add_argument(
75
  "--remove_silence",
76
  help="Remove silence.",
@@ -87,6 +92,12 @@ parser.add_argument(
87
  default=1.0,
88
  help="Adjust the speed of the audio generation (default: 1.0)",
89
  )
 
 
 
 
 
 
90
  args = parser.parse_args()
91
 
92
  config = tomli.load(open(args.config, "rb"))
@@ -95,6 +106,7 @@ ref_audio = args.ref_audio if args.ref_audio else config["ref_audio"]
95
  ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
96
  gen_text = args.gen_text if args.gen_text else config["gen_text"]
97
  gen_file = args.gen_file if args.gen_file else config["gen_file"]
 
98
 
99
  # patches for pip pkg user
100
  if "infer/examples/" in ref_audio:
@@ -116,6 +128,7 @@ ckpt_file = args.ckpt_file if args.ckpt_file else ""
116
  vocab_file = args.vocab_file if args.vocab_file else ""
117
  remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
118
  speed = args.speed
 
119
 
120
  wave_path = Path(output_dir) / output_file
121
  # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
@@ -200,7 +213,14 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
200
  ref_text = voices[voice]["ref_text"]
201
  print(f"Voice: {voice}")
202
  audio, final_sample_rate, spectragram = infer_process(
203
- ref_audio, ref_text, gen_text, model_obj, vocoder, mel_spec_type=mel_spec_type, speed=speed
 
 
 
 
 
 
 
204
  )
205
  generated_audio_segments.append(audio)
206
 
@@ -216,6 +236,18 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
216
  if remove_silence:
217
  remove_silence_for_generated_wav(f.name)
218
  print(f.name)
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
 
221
  def main():
 
71
  type=str,
72
  help="Filename of output file.",
73
  )
74
+ parser.add_argument(
75
+ "--save_chunk",
76
+ action="store_true",
77
+ help="Save chunk audio if your text is too long.",
78
+ )
79
  parser.add_argument(
80
  "--remove_silence",
81
  help="Remove silence.",
 
92
  default=1.0,
93
  help="Adjust the speed of the audio generation (default: 1.0)",
94
  )
95
+ parser.add_argument(
96
+ "--nfe_step",
97
+ type=int,
98
+ default=32,
99
+ help="Set the number of denoising steps (default: 32)",
100
+ )
101
  args = parser.parse_args()
102
 
103
  config = tomli.load(open(args.config, "rb"))
 
106
  ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
107
  gen_text = args.gen_text if args.gen_text else config["gen_text"]
108
  gen_file = args.gen_file if args.gen_file else config["gen_file"]
109
+ save_chunk = args.save_chunk if args.save_chunk else False
110
 
111
  # patches for pip pkg user
112
  if "infer/examples/" in ref_audio:
 
128
  vocab_file = args.vocab_file if args.vocab_file else ""
129
  remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
130
  speed = args.speed
131
+ nfe_step = args.nfe_step
132
 
133
  wave_path = Path(output_dir) / output_file
134
  # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
 
213
  ref_text = voices[voice]["ref_text"]
214
  print(f"Voice: {voice}")
215
  audio, final_sample_rate, spectragram = infer_process(
216
+ ref_audio,
217
+ ref_text,
218
+ gen_text,
219
+ model_obj,
220
+ vocoder,
221
+ mel_spec_type=mel_spec_type,
222
+ speed=speed,
223
+ nfe_step=nfe_step,
224
  )
225
  generated_audio_segments.append(audio)
226
 
 
236
  if remove_silence:
237
  remove_silence_for_generated_wav(f.name)
238
  print(f.name)
239
+ # Ensure the gen_text chunk directory exists
240
+
241
+ if save_chunk:
242
+ gen_text_chunk_dir = os.path.join(output_dir, "chunks")
243
+ if not os.path.exists(gen_text_chunk_dir):  # create the directory if it does not exist
244
+ os.makedirs(gen_text_chunk_dir)
245
+
246
+ # Save individual chunks as separate files
247
+ for idx, segment in enumerate(generated_audio_segments):
248
+ gen_text_chunk_path = os.path.join(gen_text_chunk_dir, f"chunk_{idx}.wav")
249
+ sf.write(gen_text_chunk_path, segment, final_sample_rate)
250
+ print(f"Saved gen_text chunk {idx} at {gen_text_chunk_path}")
251
 
252
 
253
  def main():