Hưng committed on
Commit
79fa297
·
0 Parent(s):
Files changed (5) hide show
  1. .gitattributes +36 -0
  2. .gitignore +2 -0
  3. README.md +12 -0
  4. app.py +102 -0
  5. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ .gradio
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vietnamese TTS
3
+ emoji: 🗣️
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.15.0
8
+ app_file: app.py
9
+ pinned: true
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from cached_path import cached_path
4
+ import tempfile
5
+
6
+ from f5_tts.model import DiT
7
+ from f5_tts.infer.utils_infer import (
8
+ preprocess_ref_audio_text,
9
+ load_vocoder,
10
+ load_model,
11
+ infer_process,
12
+ save_spectrogram,
13
+ )
14
+
15
+
16
# Load the vocoder and the TTS model once at import time so that every
# request handled by this module reuses the same objects.
vocoder = load_vocoder()

# Checkpoint and vocabulary are referenced by hf:// URLs and passed through
# cached_path (presumably fetched once and then served from a local cache --
# confirm against the cached_path documentation).
_checkpoint_file = str(
    cached_path("hf://whatvn/F5-TTS-vietnamese-tts/model.tensors")
)
_vocab_file = str(cached_path("hf://whatvn/F5-TTS-vietnamese-tts/vocab.txt"))

# Keyword arguments describing the DiT architecture handed to load_model.
_dit_config = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}

model = load_model(
    DiT,
    _dit_config,
    ckpt_path=_checkpoint_file,
    vocab_file=_vocab_file,
)
25
+
26
+
27
@spaces.GPU
def infer(ref_audio_orig: str, gen_text: str, speed: float = 1.0):
    """Synthesize ``gen_text`` in the voice of the reference audio clip.

    Args:
        ref_audio_orig: File path of the reference audio clip.
        gen_text: Text to synthesize; must contain non-whitespace characters.
        speed: Speed value forwarded unchanged to ``infer_process``.

    Returns:
        A pair ``((sample_rate, waveform), spectrogram_path)``.
        ``spectrogram_path`` names a temporary ``.png`` file that is created
        with ``delete=False`` and is therefore not removed by this function.

    Raises:
        gr.Error: If the reference audio or the text is missing, or if any
            step of the inference fails (the original exception is chained
            as ``__cause__``).
    """
    # Truthiness checks reject both None and an empty value.
    if not ref_audio_orig:
        raise gr.Error("Reference audio is required.")

    if not gen_text or not gen_text.strip():
        raise gr.Error("Text to generate is required.")

    try:
        # The reference transcript is passed as an empty string; presumably
        # preprocess_ref_audio_text then derives it from the audio itself --
        # confirm against the F5-TTS helper.
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
        final_wave, final_sample_rate, combined_spectrogram = infer_process(
            ref_audio,
            ref_text,
            gen_text,
            model,
            vocoder,
            cross_fade_duration=0.15,
            nfe_step=32,
            speed=speed,
        )

        # Reserve a persistent temp file name; the handle is closed before
        # the spectrogram is written to that path.
        with tempfile.NamedTemporaryFile(
            suffix=".png", delete=False
        ) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        save_spectrogram(combined_spectrogram, spectrogram_path)

        return (final_sample_rate, final_wave), spectrogram_path
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached for server-side debugging.
        raise gr.Error(f"An error occurred during inference: {e}") from e
57
+
58
+
59
# Input widgets, in the positional order expected by infer():
# reference audio path, text to generate, speed.
_input_components = [
    gr.components.Audio(type="filepath", label="Reference Audio"),
    gr.components.Textbox(label="Text to Generate", lines=3),
    gr.components.Slider(
        label="Speed",
        minimum=0.3,
        maximum=2.0,
        value=1.0,
        step=0.1,
        info="Adjust the speed of the audio.",
    ),
]

# Output widgets, matching the two values returned by infer().
_output_components = [
    gr.components.Audio(type="numpy", label="Synthesized Audio"),
    gr.components.Image(type="filepath", label="Spectrogram"),
]

# Example rows: [reference audio path, text to generate, speed].
_example_rows = [
    [
        "examples/01.wav",
        "Kiểm soát cảm xúc thực chất là một quá trình đánh giá lại bản thân, để tìm thấy tự do, thoát khỏi sự cuốn hút của chính bản ngã.",
        0.8,
    ],
    [
        "examples/02.wav",
        "Ngoài ra, nội dung ở bên kênh đấy tôi sẽ cố gắng là không nói bậy nhá.",
        1.0,
    ],
    [
        "examples/01.wav",
        "Cho tôi năm trăm triệu tôi sẽ gạch tên Pew và con tôi ra khỏi danh sách bạn bè, thực tế còn chịu tham gia một trận bốc xing để kết thúc tình nghĩa.",
        0.8,
    ],
]

# Gradio interface wiring the widgets above to infer().
iface = gr.Interface(
    fn=infer,
    title="Vietnamese TTS",
    description="Vietnamese TTS model trained with public data (around 200 hours Vietnamese voice) using [F5-TTS](https://github.com/SWivid/F5-TTS) model",
    inputs=_input_components,
    outputs=_output_components,
    examples=_example_rows,
    submit_btn="Synthesize",
    clear_btn=None,
    flagging_mode="never",
)

if __name__ == "__main__":
    # Call queue() first, then launch the returned app.
    queued_app = iface.queue()
    queued_app.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ soundfile
4
+ transformers
5
+ f5_tts @ git+https://github.com/SWivid/F5-TTS.git
6
+ bitsandbytes>0.37.0