Spaces: Running on Zero

merge master

- .gitattributes +1 -0
- .gitignore +2 -0
- README.md +6 -8
- app.py +98 -150
- requirements.txt +4 -3
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__
+.gradio
README.md CHANGED
@@ -1,14 +1,12 @@
 ---
-title: Vietnamese
-emoji:
-colorFrom:
+title: Vietnamese TTS
+emoji: 🗣️
+colorFrom: red
 colorTo: red
 sdk: gradio
-sdk_version: 5.0
+sdk_version: 5.15.0
 app_file: app.py
-pinned:
-license: cc-by-4.0
-short_description: 'Vietnamese tts trained on public data using f5-tts '
+pinned: true
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,154 +1,102 @@
+import spaces
 import gradio as gr
-import
-from
-)
-            step=1,
-            value=0,
-        )
-        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-        with gr.Row():
-            width = gr.Slider(
-                label="Width",
-                minimum=256,
-                maximum=MAX_IMAGE_SIZE,
-                step=32,
-                value=1024,  # Replace with defaults that work for your model
-            )
-            height = gr.Slider(
-                label="Height",
-                minimum=256,
-                maximum=MAX_IMAGE_SIZE,
-                step=32,
-                value=1024,  # Replace with defaults that work for your model
-            )
-        with gr.Row():
-            guidance_scale = gr.Slider(
-                label="Guidance scale",
-                minimum=0.0,
-                maximum=10.0,
-                step=0.1,
-                value=0.0,  # Replace with defaults that work for your model
-            )
-            num_inference_steps = gr.Slider(
-                label="Number of inference steps",
-                minimum=1,
-                maximum=50,
-                step=1,
-                value=2,  # Replace with defaults that work for your model
-            )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-    )
+from cached_path import cached_path
+import tempfile
+
+from f5_tts.model import DiT
+from f5_tts.infer.utils_infer import (
+    preprocess_ref_audio_text,
+    load_vocoder,
+    load_model,
+    infer_process,
+    save_spectrogram,
+)
+
+
+vocoder = load_vocoder()
+model = load_model(
+    DiT,
+    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
+    ckpt_path=str(
+        cached_path("hf://whatvn/F5-TTS-vietnamese-tts/model.tensors")
+    ),
+    vocab_file=str(cached_path("hf://whatvn/F5-TTS-vietnamese-tts/vocab.txt")),
+)
+
+
+@spaces.GPU
+def infer(ref_audio_orig: str, gen_text: str, speed: float = 1.0):
+    if ref_audio_orig is None:
+        raise gr.Error("Reference audio is required.")
+
+    if gen_text is None or gen_text.strip() == "":
+        raise gr.Error("Text to generate is required.")
+
+    try:
+        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
+        final_wave, final_sample_rate, combined_spectrogram = infer_process(
+            ref_audio,
+            ref_text,
+            gen_text,
+            model,
+            vocoder,
+            cross_fade_duration=0.15,
+            nfe_step=32,
+            speed=speed,
+        )
+
+        with tempfile.NamedTemporaryFile(
+            suffix=".png", delete=False
+        ) as tmp_spectrogram:
+            spectrogram_path = tmp_spectrogram.name
+            save_spectrogram(combined_spectrogram, spectrogram_path)
+
+        return (final_sample_rate, final_wave), spectrogram_path
+    except Exception as e:
+        raise gr.Error(f"An error occurred during inference: {e}")
+
+
+iface = gr.Interface(
+    title="Vietnamese TTS",
+    description="Vietnamese TTS model trained with public data (around 200 hours of Vietnamese speech) using the [F5-TTS](https://github.com/SWivid/F5-TTS) model",
+    fn=infer,
+    inputs=[
+        gr.components.Audio(type="filepath", label="Reference Audio"),
+        gr.components.Textbox(label="Text to Generate", lines=3),
+        gr.components.Slider(
+            label="Speed",
+            minimum=0.3,
+            maximum=2.0,
+            value=1.0,
+            step=0.1,
+            info="Adjust the speed of the audio.",
+        ),
+    ],
+    outputs=[
+        gr.components.Audio(type="numpy", label="Synthesized Audio"),
+        gr.components.Image(type="filepath", label="Spectrogram"),
+    ],
+    submit_btn="Synthesize",
+    clear_btn=None,
+    flagging_mode="never",
+    examples=[
+        [
+            "examples/01.wav",
+            "Kiểm soát cảm xúc thực chất là một quá trình đánh giá lại bản thân, để tìm thấy tự do, thoát khỏi sự cuốn hút của chính bản ngã.",
+            0.8,
+        ],
+        [
+            "examples/02.wav",
+            "Ngoài ra, nội dung ở bên kênh đấy tôi sẽ cố gắng là không nói bậy nhá.",
+            1.0,
+        ],
+        [
+            "examples/01.wav",
+            "Cho tôi năm trăm triệu tôi sẽ gạch tên Pew và con tôi ra khỏi danh sách bạn bè, thực tế còn chịu tham gia một trận bốc xing để kết thúc tình nghĩa.",
+            0.8,
+        ],
+    ],
+)
 
 if __name__ == "__main__":
-
+    iface.queue().launch()
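Since the new `app.py` is a plain `gr.Interface`, the Space can also be driven programmatically. Below is a minimal sketch using `gradio_client`; the Space id `whatvn/vietnamese-tts` is a placeholder (substitute the real `user/space-name`), and it assumes the default `/predict` endpoint that `gr.Interface` exposes.

```python
from gradio_client import Client, handle_file

# Hypothetical Space id; replace with the actual one.
client = Client("whatvn/vietnamese-tts")

# Arguments match the Interface inputs: reference audio, text, speed.
audio_path, spectrogram_path = client.predict(
    handle_file("reference.wav"),                 # Reference Audio
    "Xin chào, đây là giọng nói thử nghiệm.",     # Text to Generate (Vietnamese sample)
    1.0,                                          # Speed
    api_name="/predict",                          # default endpoint for gr.Interface
)
print(audio_path, spectrogram_path)
```

The client downloads the synthesized audio and the spectrogram image to local temp files and returns their paths.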
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 accelerate
-diffusers
-invisible_watermark
 torch
+torchaudio
+soundfile
 transformers
-
+f5_tts @ git+https://github.com/SWivid/F5-TTS.git
+bitsandbytes>0.37.0