Hưng committed on
Commit
4abde05
·
2 Parent(s): 2acc727 79fa297

merge master

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. .gitignore +2 -0
  3. README.md +6 -8
  4. app.py +98 -150
  5. requirements.txt +4 -3
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ .gradio
README.md CHANGED
@@ -1,14 +1,12 @@
1
  ---
2
- title: Vietnamese Tts
3
- emoji: 🖼
4
- colorFrom: purple
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.0.1
8
  app_file: app.py
9
- pinned: false
10
- license: cc-by-4.0
11
- short_description: 'Vietnamese tts trained on public data using f5-tts '
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Vietnamese TTS
3
+ emoji: 🗣️
4
+ colorFrom: red
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 5.15.0
8
  app_file: app.py
9
+ pinned: true
 
 
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,154 +1,102 @@
 
1
  import gradio as gr
2
- import numpy as np
3
- import random
4
-
5
- # import spaces #[uncomment to use ZeroGPU]
6
- from diffusers import DiffusionPipeline
7
- import torch
8
-
9
- device = "cuda" if torch.cuda.is_available() else "cpu"
10
- model_repo_id = "stabilityai/sdxl-turbo" # Replace to the model you would like to use
11
-
12
- if torch.cuda.is_available():
13
- torch_dtype = torch.float16
14
- else:
15
- torch_dtype = torch.float32
16
-
17
- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
18
- pipe = pipe.to(device)
19
-
20
- MAX_SEED = np.iinfo(np.int32).max
21
- MAX_IMAGE_SIZE = 1024
22
-
23
-
24
- # @spaces.GPU #[uncomment to use ZeroGPU]
25
- def infer(
26
- prompt,
27
- negative_prompt,
28
- seed,
29
- randomize_seed,
30
- width,
31
- height,
32
- guidance_scale,
33
- num_inference_steps,
34
- progress=gr.Progress(track_tqdm=True),
35
- ):
36
- if randomize_seed:
37
- seed = random.randint(0, MAX_SEED)
38
-
39
- generator = torch.Generator().manual_seed(seed)
40
-
41
- image = pipe(
42
- prompt=prompt,
43
- negative_prompt=negative_prompt,
44
- guidance_scale=guidance_scale,
45
- num_inference_steps=num_inference_steps,
46
- width=width,
47
- height=height,
48
- generator=generator,
49
- ).images[0]
50
-
51
- return image, seed
52
-
53
-
54
- examples = [
55
- "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
56
- "An astronaut riding a green horse",
57
- "A delicious ceviche cheesecake slice",
58
- ]
59
-
60
- css = """
61
- #col-container {
62
- margin: 0 auto;
63
- max-width: 640px;
64
- }
65
- """
66
-
67
- with gr.Blocks(css=css) as demo:
68
- with gr.Column(elem_id="col-container"):
69
- gr.Markdown(" # Text-to-Image Gradio Template")
70
-
71
- with gr.Row():
72
- prompt = gr.Text(
73
- label="Prompt",
74
- show_label=False,
75
- max_lines=1,
76
- placeholder="Enter your prompt",
77
- container=False,
78
- )
79
-
80
- run_button = gr.Button("Run", scale=0, variant="primary")
81
-
82
- result = gr.Image(label="Result", show_label=False)
83
-
84
- with gr.Accordion("Advanced Settings", open=False):
85
- negative_prompt = gr.Text(
86
- label="Negative prompt",
87
- max_lines=1,
88
- placeholder="Enter a negative prompt",
89
- visible=False,
90
- )
91
-
92
- seed = gr.Slider(
93
- label="Seed",
94
- minimum=0,
95
- maximum=MAX_SEED,
96
- step=1,
97
- value=0,
98
- )
99
-
100
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
101
-
102
- with gr.Row():
103
- width = gr.Slider(
104
- label="Width",
105
- minimum=256,
106
- maximum=MAX_IMAGE_SIZE,
107
- step=32,
108
- value=1024, # Replace with defaults that work for your model
109
- )
110
-
111
- height = gr.Slider(
112
- label="Height",
113
- minimum=256,
114
- maximum=MAX_IMAGE_SIZE,
115
- step=32,
116
- value=1024, # Replace with defaults that work for your model
117
- )
118
-
119
- with gr.Row():
120
- guidance_scale = gr.Slider(
121
- label="Guidance scale",
122
- minimum=0.0,
123
- maximum=10.0,
124
- step=0.1,
125
- value=0.0, # Replace with defaults that work for your model
126
- )
127
-
128
- num_inference_steps = gr.Slider(
129
- label="Number of inference steps",
130
- minimum=1,
131
- maximum=50,
132
- step=1,
133
- value=2, # Replace with defaults that work for your model
134
- )
135
-
136
- gr.Examples(examples=examples, inputs=[prompt])
137
- gr.on(
138
- triggers=[run_button.click, prompt.submit],
139
- fn=infer,
140
- inputs=[
141
- prompt,
142
- negative_prompt,
143
- seed,
144
- randomize_seed,
145
- width,
146
- height,
147
- guidance_scale,
148
- num_inference_steps,
149
  ],
150
- outputs=[result, seed],
151
- )
152
 
153
  if __name__ == "__main__":
154
- demo.launch()
 
1
+ import spaces
2
  import gradio as gr
3
+ from cached_path import cached_path
4
+ import tempfile
5
+
6
+ from f5_tts.model import DiT
7
+ from f5_tts.infer.utils_infer import (
8
+ preprocess_ref_audio_text,
9
+ load_vocoder,
10
+ load_model,
11
+ infer_process,
12
+ save_spectrogram,
13
+ )
14
+
15
+
16
+ vocoder = load_vocoder()
17
+ model = load_model(
18
+ DiT,
19
+ dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
20
+ ckpt_path=str(
21
+ cached_path("hf://whatvn/F5-TTS-vietnamese-tts/model.tensors")
22
+ ),
23
+ vocab_file=str(cached_path("hf://whatvn/F5-TTS-vietnamese-tts/vocab.txt")),
24
+ )
25
+
26
+
27
+ @spaces.GPU
28
+ def infer(ref_audio_orig: str, gen_text: str, speed: float = 1.0):
29
+ if ref_audio_orig is None:
30
+ raise gr.Error("Reference audio is required.")
31
+
32
+ if gen_text is None or gen_text.strip() == "":
33
+ raise gr.Error("Text to generate is required.")
34
+
35
+ try:
36
+ ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
37
+ final_wave, final_sample_rate, combined_spectrogram = infer_process(
38
+ ref_audio,
39
+ ref_text,
40
+ gen_text,
41
+ model,
42
+ vocoder,
43
+ cross_fade_duration=0.15,
44
+ nfe_step=32,
45
+ speed=speed,
46
+ )
47
+
48
+ with tempfile.NamedTemporaryFile(
49
+ suffix=".png", delete=False
50
+ ) as tmp_spectrogram:
51
+ spectrogram_path = tmp_spectrogram.name
52
+ save_spectrogram(combined_spectrogram, spectrogram_path)
53
+
54
+ return (final_sample_rate, final_wave), spectrogram_path
55
+ except Exception as e:
56
+ raise gr.Error(f"An error occurred during inference: {e}")
57
+
58
+
59
+ iface = gr.Interface(
60
+ title="Vietnamese TTS",
61
+ description="Vietnamese TTS model trained with public data (around 200 hours Vietnamese voice) using [F5-TTS](https://github.com/SWivid/F5-TTS) model",
62
+ fn=infer,
63
+ inputs=[
64
+ gr.components.Audio(type="filepath", label="Reference Audio"),
65
+ gr.components.Textbox(label="Text to Generate", lines=3),
66
+ gr.components.Slider(
67
+ label="Speed",
68
+ minimum=0.3,
69
+ maximum=2.0,
70
+ value=1.0,
71
+ step=0.1,
72
+ info="Adjust the speed of the audio.",
73
+ ),
74
+ ],
75
+ outputs=[
76
+ gr.components.Audio(type="numpy", label="Synthesized Audio"),
77
+ gr.components.Image(type="filepath", label="Spectrogram"),
78
+ ],
79
+ submit_btn="Synthesize",
80
+ clear_btn=None,
81
+ flagging_mode="never",
82
+ examples=[
83
+ [
84
+ "examples/01.wav",
85
+ "Kiểm soát cảm xúc thực chất là một quá trình đánh giá lại bản thân, để tìm thấy tự do, thoát khỏi sự cuốn hút của chính bản ngã.",
86
+ 0.8,
87
+ ],
88
+ [
89
+ "examples/02.wav",
90
+ "Ngoài ra, nội dung ở bên kênh đấy tôi sẽ cố gắng là không nói bậy nhá.",
91
+ 1.0,
92
+ ],
93
+ [
94
+ "examples/01.wav",
95
+ "Cho tôi năm trăm triệu tôi sẽ gạch tên Pew và con tôi ra khỏi danh sách bạn bè, thực tế còn chịu tham gia một trận bốc xing để kết thúc tình nghĩa.",
96
+ 0.8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  ],
98
+ ],
99
+ )
100
 
101
  if __name__ == "__main__":
102
+ iface.queue().launch()
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  accelerate
2
- diffusers
3
- invisible_watermark
4
  torch
 
 
5
  transformers
6
- xformers
 
 
1
  accelerate
 
 
2
  torch
3
+ torchaudio
4
+ soundfile
5
  transformers
6
+ f5_tts @ git+https://github.com/SWivid/F5-TTS.git
7
+ bitsandbytes>0.37.0