Chi Kim committed on
Commit
d393827
·
1 Parent(s): 408075f

Inference commandline interface.

Browse files
Files changed (3) hide show
  1. inference-cli.py +391 -0
  2. inference-cli.toml +8 -0
  3. requirements.txt +2 -0
inference-cli.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ import torchaudio
5
+ import numpy as np
6
+ import tempfile
7
+ from einops import rearrange
8
+ from vocos import Vocos
9
+ from pydub import AudioSegment, silence
10
+ from model import CFM, UNetT, DiT, MMDiT
11
+ from cached_path import cached_path
12
+ from model.utils import (
13
+ load_checkpoint,
14
+ get_tokenizer,
15
+ convert_char_to_pinyin,
16
+ save_spectrogram,
17
+ )
18
+ from transformers import pipeline
19
+ import librosa
20
+ import click
21
+ import soundfile as sf
22
+ import tomllib
23
+ import argparse
24
+ import tqdm
25
+ from pathlib import Path
26
+
27
def _parse_bool(value):
    """Interpret a command-line string as a boolean.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised
            true/false spelling.
    """
    normalized = value.strip().lower()
    if normalized in {"1", "true", "yes", "y", "on"}:
        return True
    if normalized in {"0", "false", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"expected true or false, got {value!r}")


# Command-line options. Every option except --config defaults to None so
# that "not given" can be told apart from an explicit empty/false value.
parser = argparse.ArgumentParser(
    prog="python3 inference-cli.py",
    description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
    epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
    "-c",
    "--config",
    help="Configuration file. Default=inference-cli.toml",
    default="inference-cli.toml",
)
parser.add_argument(
    "-m",
    "--model",
    help="F5-TTS | E2-TTS",
)
parser.add_argument(
    "-r",
    "--reference",
    type=str,
    help="Reference audio file < 15 seconds.",
)
parser.add_argument(
    "-s",
    "--subtitle",
    type=str,
    help="Subtitle for the reference audio.",
)
parser.add_argument(
    "-t",
    "--text",
    type=str,
    help="Text to generate.",
)
parser.add_argument(
    "-o",
    "--output_dir",
    type=str,
    help="Path to output folder.",
)
parser.add_argument(
    "--remove_silence",
    type=_parse_bool,
    default=None,
    help="Remove silence (true | false).",
)
args = parser.parse_args()

# Close the config file deterministically instead of leaking the handle.
with open(args.config, "rb") as config_file:
    config = tomllib.load(config_file)


def _setting(cli_value, key):
    """Return the command-line value when one was given, else config[key].

    The check is `is None` rather than truthiness so that explicit values
    such as `-s ""` or `--remove_silence false` override the config file.
    """
    return config[key] if cli_value is None else cli_value


ref_audio = _setting(args.reference, "reference")
ref_text = _setting(args.subtitle, "subtitle")
gen_text = _setting(args.text, "text")
output_dir = _setting(args.output_dir, "output_dir")
exp_name = _setting(args.model, "model")
remove_silence = _setting(args.remove_silence, "remove_silence")
wave_path = Path(output_dir) / "out.wav"
spectrogram_path = Path(output_dir) / "out.png"
83
+
84
# Words and phrases at which an over-long run of words may be cut when a
# sentence has to be split across several batches.
SPLIT_WORDS = [
    "but", "however", "nevertheless", "yet", "still",
    "therefore", "thus", "hence", "consequently",
    "moreover", "furthermore", "additionally",
    "meanwhile", "alternatively", "otherwise",
    "namely", "specifically", "for example", "such as",
    "in fact", "indeed", "notably",
    "in contrast", "on the other hand", "conversely",
    "in conclusion", "to summarize", "finally",
]
94
+
95
# Pick the best available torch backend: CUDA, then Apple MPS, then CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

print(f"Using {device} device")

# Cache for the speech-recognition pipeline; built on first use only.
_asr_pipeline = None


def pipe(*args, **kwargs):
    """Run the speech-recognition pipeline, creating it on first use.

    The pipeline is only needed when no reference subtitle is supplied, so
    it is built lazily instead of at import time. All arguments are passed
    straight through to the underlying pipeline object and its result is
    returned unchanged.
    """
    global _asr_pipeline
    if _asr_pipeline is None:
        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large-v3-turbo",
            # NOTE(review): float16 is kept for cuda/mps as before; float32 is
            # used on CPU, where half precision is commonly unsupported or
            # slow — confirm against the torch versions you target.
            torch_dtype=torch.float32 if device == "cpu" else torch.float16,
            device=device,
        )
    return _asr_pipeline(*args, **kwargs)
109
+
110
# --------------------- Settings -------------------- #

# Mel-spectrogram parameters handed to CFM in load_model().
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256

# Reference audio quieter than this RMS is scaled up before inference.
target_rms = 0.1

# Sampling parameters passed to the model's sample() call.
nfe_step = 32  # 16, 32
cfg_strength = 2.0
ode_method = "euler"
sway_sampling_coef = -1.0

# Divides the estimated length of the generated part in infer_batch().
speed = 1.0

# None or float (duration in seconds), e.g. fix_duration = 27
fix_duration = None
123
+
124
def load_model(exp_name, model_cls, model_cfg, ckpt_step):
    """Build a CFM model around `model_cls` and load a checkpoint into it.

    Args:
        exp_name: Sub-folder of the `SWivid/F5-TTS` hub repository that
            holds the checkpoint (e.g. "F5TTS_Base").
        model_cls: Backbone class instantiated as the CFM `transformer`.
        model_cfg: Keyword arguments for `model_cls`; `text_num_embeds` and
            `mel_dim` are added here.
        ckpt_step: Step number used in the checkpoint file name.

    Returns:
        The value returned by `load_checkpoint(...)` with `use_ema=True`.
    """
    # Resolved through cached_path; a local alternative would be
    # f"ckpts/{exp_name}/model_{ckpt_step}.pt" (.pt | .safetensors).
    ckpt_path = str(
        cached_path(f"hf://SWivid/F5-TTS/{exp_name}/model_{ckpt_step}.safetensors")
    )
    vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
    model = CFM(
        transformer=model_cls(
            **model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
        ),
        mel_spec_kwargs=dict(
            target_sample_rate=target_sample_rate,
            n_mel_channels=n_mel_channels,
            hop_length=hop_length,
        ),
        odeint_kwargs=dict(
            method=ode_method,
        ),
        vocab_char_map=vocab_char_map,
    ).to(device)

    return load_checkpoint(model, ckpt_path, device, use_ema=True)
146
+
147
+
148
+ # load models
149
# load models
F5TTS_model_cfg = dict(
    dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
)
E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)

# Only the model selected for this run is fetched and moved to the device;
# the other name stays defined (as None) so later lookups still resolve.
F5TTS_ema_model = (
    load_model("F5TTS_Base", DiT, F5TTS_model_cfg, 1200000)
    if exp_name == "F5-TTS"
    else None
)
E2TTS_ema_model = (
    load_model("E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000)
    if exp_name == "E2-TTS"
    else None
)
160
+
161
def split_text_into_batches(text, max_chars=200, split_words=None):
    """Split `text` into batches of at most `max_chars` UTF-8 bytes.

    Splitting is tried in this order: sentence-ending punctuation, ASCII
    colons, commas, and finally whitespace-separated words (preferring to
    cut just before one of `split_words`). Colons used as split points are
    dropped and commas are re-joined as ASCII ",".

    Args:
        text: Text to split.
        max_chars: Size limit per batch, measured in UTF-8 bytes.
        split_words: Words/phrases at which a run of words may be cut.
            Defaults to the module-level SPLIT_WORDS, looked up at call time.

    Returns:
        A list of batches. Text already within the limit is returned as a
        single-element list unchanged; otherwise empty batches are dropped.
        A single word longer than `max_chars` is still emitted whole.
    """
    if split_words is None:
        split_words = SPLIT_WORDS

    def byte_len(value):
        return len(value.encode("utf-8"))

    if not text or byte_len(text) <= max_chars:
        return [text]

    # NOTE(review): the fullwidth forms (U+3002, U+FF01, U+FF1F, U+FF0C) are
    # restored on the assumption that the duplicated ASCII marks in the
    # rendered source were originally fullwidth — confirm against the repo.
    sentence_end = "\u3002.!\uff01?\uff1f"
    if text[-1] not in sentence_end:
        text += "."

    # The capturing group keeps each terminator so it can be re-attached.
    pieces = re.split("([" + sentence_end + "])", text)
    sentences = [body + mark for body, mark in zip(pieces[0::2], pieces[1::2])]

    def split_by_words(fragment):
        """Pack whitespace-separated words into chunks within the limit."""
        chunks = []
        current = ""
        for word in fragment.split():
            if byte_len(current) + byte_len(word) + 1 <= max_chars:
                current += word + " "
                continue
            if current:
                # Prefer to cut just before a split word so that it starts
                # the next chunk.
                for marker in split_words:
                    cut = current.rfind(" " + marker + " ")
                    if cut != -1:
                        chunks.append(current[:cut].strip())
                        current = current[cut:].strip() + " "
                        break
                else:
                    # No split word found: flush the chunk as it stands.
                    chunks.append(current.strip())
                    current = ""
            current += word + " "
        if current:
            chunks.append(current.strip())
        return chunks

    def split_by_commas(fragment):
        """Pack comma-separated parts; fall back to words when needed."""
        parts = re.split("[,\uff0c]", fragment)
        if len(parts) == 1:
            return split_by_words(fragment)

        packed = []
        current = ""
        for part in parts:
            # `current` ends with a comma that is stripped on output, so
            # this sum is exactly the byte size of the emitted chunk.
            if byte_len(current) + byte_len(part) <= max_chars:
                current += part + ","
            else:
                if current:
                    packed.append(current.rstrip(","))
                current = part + ","
        if current:
            packed.append(current.rstrip(","))

        # A single comma part can itself exceed the limit; split it by words.
        result = []
        for chunk in packed:
            if byte_len(chunk) > max_chars:
                result.extend(split_by_words(chunk))
            else:
                result.append(chunk)
        return result

    batches = []
    current_batch = ""
    for sentence in sentences:
        if byte_len(current_batch) + byte_len(sentence) <= max_chars:
            current_batch += sentence
            continue

        # Adding this sentence would exceed the limit: flush what we have.
        if current_batch:
            batches.append(current_batch)
            current_batch = ""

        if byte_len(sentence) <= max_chars:
            current_batch = sentence
            continue

        # The sentence alone is too long: try colons first, then commas.
        colon_parts = sentence.split(":")
        if len(colon_parts) > 1:
            for part in colon_parts:
                if byte_len(part) <= max_chars:
                    batches.append(part)
                else:
                    batches.extend(split_by_commas(part))
        else:
            batches.extend(split_by_commas(sentence))

    if current_batch:
        batches.append(current_batch)

    # Splitting on adjacent separators can yield empty pieces; drop them.
    return [batch for batch in batches if batch.strip()]
256
+
257
def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence):
    """Generate speech for each text batch and write the combined result.

    The combined waveform is written to `wave_path` and the combined mel
    spectrogram to `spectrogram_path` (both module-level paths).

    Args:
        ref_audio: Path of the reference audio file.
        ref_text: Transcript of the reference audio.
        gen_text_batches: Text batches to synthesise, in order.
        exp_name: "F5-TTS" or "E2-TTS".
        remove_silence: When true, silence is stripped from the saved wav.

    Raises:
        ValueError: if `exp_name` is unknown or there is nothing to generate.
    """
    if exp_name == "F5-TTS":
        ema_model = F5TTS_ema_model
    elif exp_name == "E2-TTS":
        ema_model = E2TTS_ema_model
    else:
        raise ValueError(
            f"Unknown model {exp_name!r}; expected 'F5-TTS' or 'E2-TTS'."
        )
    if not gen_text_batches:
        raise ValueError("No text batches to generate.")

    audio, sr = torchaudio.load(ref_audio)
    if audio.shape[0] > 1:
        # Down-mix multi-channel audio to mono.
        audio = torch.mean(audio, dim=0, keepdim=True)

    rms = torch.sqrt(torch.mean(torch.square(audio)))
    # Quiet references are boosted to target_rms; the `0 <` guard avoids
    # dividing by zero on digitally silent input.
    boosted = bool(0 < rms < target_rms)
    if boosted:
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)
    audio = audio.to(device)
    ref_audio_len = audio.shape[-1] // hop_length

    # Separate the reference text from the generated text with one space when
    # it ends in a single-byte character. Done once, outside the loop, so the
    # reference text does not grow by a space per batch.
    if ref_text and len(ref_text[-1].encode("utf-8")) == 1:
        ref_text = ref_text + " "

    # Character class, so each pause mark is counted individually.
    # NOTE(review): fullwidth forms are assumed to be the intended set
    # (the rendered source shows ASCII look-alikes) — confirm against the repo.
    pause_punc = re.compile("[\u3002\uff0c\u3001\uff1b\uff1a\uff1f\uff01]")

    def weighted_len(value):
        # UTF-8 byte length plus 3 per pause mark.
        return len(value.encode("utf-8")) + 3 * len(pause_punc.findall(value))

    ref_text_len = max(weighted_len(ref_text), 1)  # never divide by zero

    # Load the vocoder once rather than once per batch.
    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")

    generated_waves = []
    spectrograms = []

    for gen_text in tqdm.tqdm(gen_text_batches):
        # Prepare the text
        final_text_list = convert_char_to_pinyin([ref_text + gen_text])

        # Total duration = reference frames + generated frames, the latter
        # scaled from the reference by the ratio of weighted text lengths.
        gen_text_len = weighted_len(gen_text)
        duration = ref_audio_len + int(
            ref_audio_len / ref_text_len * gen_text_len / speed
        )

        # inference
        with torch.inference_mode():
            generated, _ = ema_model.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
            )

        # Drop the leading frames that correspond to the reference audio.
        generated = generated[:, ref_audio_len:, :]
        generated_mel_spec = rearrange(generated, "1 n d -> 1 d n")

        generated_wave = vocos.decode(generated_mel_spec.cpu())
        if boosted:
            # Undo the boost applied to the reference audio.
            generated_wave = generated_wave * rms / target_rms

        # wav -> numpy
        generated_waves.append(generated_wave.squeeze().cpu().numpy())
        spectrograms.append(generated_mel_spec[0].cpu().numpy())

    # Combine all generated waves
    final_wave = np.concatenate(generated_waves)

    # Always save the audio; silence removal is an optional second pass.
    Path(wave_path).parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(wave_path), final_wave, target_sample_rate)

    if remove_silence:
        aseg = AudioSegment.from_file(str(wave_path))
        non_silent_segs = silence.split_on_silence(
            aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
        )
        # Keep the untrimmed file if nothing non-silent was detected.
        if non_silent_segs:
            non_silent_wave = AudioSegment.silent(duration=0)
            for non_silent_seg in non_silent_segs:
                non_silent_wave += non_silent_seg
            non_silent_wave.export(str(wave_path), format="wav")
    print(wave_path)

    # Create a combined spectrogram
    combined_spectrogram = np.concatenate(spectrograms, axis=1)
    save_spectrogram(combined_spectrogram, spectrogram_path)
    print(spectrogram_path)
337
+
338
+
339
def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_split_words):
    """Prepare the reference audio and text, then run batched inference.

    Args:
        ref_audio_orig: Path of the reference audio file.
        ref_text: Transcript of the reference audio; when blank, the audio
            is transcribed with `pipe`.
        gen_text: Text to synthesise.
        exp_name: Model name forwarded to infer_batch.
        remove_silence: Forwarded to infer_batch.
        custom_split_words: Comma-separated split words; when blank, the
            module-level SPLIT_WORDS is used.

    Returns:
        Whatever infer_batch returns.
    """
    # Passed explicitly to the splitter instead of rebinding the global, so
    # the custom list is actually the one used.
    split_words = [word.strip() for word in custom_split_words.split(",") if word.strip()]
    if not split_words:
        split_words = SPLIT_WORDS

    print(gen_text)

    print("Converting audio...")
    aseg = AudioSegment.from_file(ref_audio_orig)

    non_silent_segs = silence.split_on_silence(
        aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
    )
    # Keep the original clip if nothing non-silent was detected.
    if non_silent_segs:
        non_silent_wave = AudioSegment.silent(duration=0)
        for non_silent_seg in non_silent_segs:
            non_silent_wave += non_silent_seg
        aseg = non_silent_wave

    audio_duration = len(aseg)
    if audio_duration > 15000:
        print("Audio is over 15s, clipping to only first 15s.")
        aseg = aseg[:15000]

    # Reserve a temp file name, close the handle, then write to it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        ref_audio = f.name
    try:
        aseg.export(ref_audio, format="wav")

        if not ref_text.strip():
            print("No reference text provided, transcribing reference audio...")
            ref_text = pipe(
                ref_audio,
                chunk_length_s=30,
                batch_size=128,
                generate_kwargs={"task": "transcribe"},
                return_timestamps=False,
            )["text"].strip()
            print("Finished transcription")
        else:
            print("Using custom reference text...")

        # Split the input text into batches. Pure-ASCII text (byte length
        # equals character count) gets a larger budget than text containing
        # multi-byte characters.
        ref_text_bytes = len(ref_text.encode('utf-8'))
        if ref_text_bytes == len(ref_text) and len(gen_text.encode('utf-8')) == len(gen_text):
            max_chars = 400 - ref_text_bytes
        else:
            max_chars = 300 - ref_text_bytes
        gen_text_batches = split_text_into_batches(
            gen_text, max_chars=max_chars, split_words=split_words
        )
        print('ref_text', ref_text)
        for i, batch_text in enumerate(gen_text_batches):
            print(f'gen_text {i}', batch_text)

        print(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
        return infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence)
    finally:
        # The converted reference clip is only needed during inference.
        try:
            os.remove(ref_audio)
        except OSError:
            pass
389
+
390
+
391
# Script entry: run inference with the settings resolved from the command
# line and the config file.
infer(ref_audio, ref_text, gen_text, exp_name, remove_silence, ",".join(SPLIT_WORDS))
inference-cli.toml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # F5-TTS | E2-TTS
2
+ model = "F5-TTS"
3
+ reference = "tests/ref_audio/test_en_1_ref_short.wav"
4
+ # Leave as an empty string "" to transcribe the reference audio automatically.
5
+ subtitle = "Some call me nature, others call me mother nature."
6
+ text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
7
+ remove_silence = true
8
+ output_dir = "tests"
requirements.txt CHANGED
@@ -21,3 +21,5 @@ wandb
21
  x_transformers>=1.31.14
22
  zhconv
23
  zhon
 
 
 
21
  x_transformers>=1.31.14
22
  zhconv
23
  zhon
24
+ pydub
25
+ cached_path