Hassan-16 committed on
Commit
9541cea
·
verified ·
1 Parent(s): 7b8cd2c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -0
app.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from kokoro import KModel, KPipeline
2
+ import gradio as gr
3
+ import os
4
+ import random
5
+ import torch
6
+ import logging
7
+ import soundfile as sf
8
+
9
+ # Optional: import Resemblyzer for voice cloning (install via pip install resemblyzer)
10
+ try:
11
+ from resemblyzer import VoiceEncoder, preprocess_wav
12
+ encoder = VoiceEncoder()
13
+ except ImportError:
14
+ encoder = None
15
+
16
# ---------------------------------------------------------------------------
# Paths and sample text
# ---------------------------------------------------------------------------
# NOTE(review): these are absolute Windows paths; presumably specific to the
# author's machine -- confirm before running elsewhere.
VOICE_DIR = r"D:\New folder (2)\model\voices"
OUTPUT_DIR = r"D:\New folder (2)\output_audio"
TEXT = "Hello, this is a test of the Kokoro TTS system."

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Hardware selection
# ---------------------------------------------------------------------------
CUDA_AVAILABLE = torch.cuda.is_available()
if CUDA_AVAILABLE:
    device = "cuda"
else:
    device = "cpu"
logger.info(f"Using hardware: {device}")

# ---------------------------------------------------------------------------
# Models: one KModel per device, keyed by "is this the GPU copy?".
# The CPU copy (key False) always exists; the GPU copy (key True) only when
# CUDA is available.
# ---------------------------------------------------------------------------
models = {False: KModel("hexgrad/Kokoro-82M").to("cpu").eval()}
if CUDA_AVAILABLE:
    models[True] = KModel("hexgrad/Kokoro-82M").to("cuda").eval()

# ---------------------------------------------------------------------------
# Pipelines: 'a' = American English, 'b' = British English.
# Both start out bound to the CPU model.
# ---------------------------------------------------------------------------
pipelines = {
    lang_code: KPipeline(model=models[False], lang_code=lang_code, device='cpu')
    for lang_code in ('a', 'b')
}
38
+
39
# Override the lexicon entry for the word "kokoro" in both the American ('a')
# and British ('b') pipelines.
# The phoneme strings are spelled with the real IPA characters (ə, ɹ); the
# previous text carried mis-decoded byte sequences in their place.
try:
    pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
    pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
except AttributeError as e:
    # Best effort: if the g2p object does not expose `lexicon.golds`, keep
    # going with whatever pronunciation it produces on its own.
    logger.warning(f"Could not set custom pronunciations: {e}")
45
+
46
def forward_gpu(text, voice_path, speed):
    """Synthesize the first audio chunk of ``text`` using the GPU model.

    Args:
        text: Text to synthesize.
        voice_path: Voice file name or full path, passed to the pipeline as
            ``voice``. The language pipeline is selected from the first
            character of the file *name* (``'a'`` or ``'b'``).
        speed: Speed value forwarded to the pipeline.

    Returns:
        The audio of the first item the pipeline yields, or ``None`` when the
        pipeline yields nothing.

    Raises:
        KeyError: If no GPU model was loaded (``models`` has no ``True`` key).
    """
    # Use the GPU model directly without spaces.GPU decorator.
    # `voice_path` is usually a full path, so indexing it directly would pick
    # up the drive letter / leading separator rather than the language code.
    voice_name = os.path.basename(voice_path)
    # File names without a known language prefix (for example a generated
    # "cloned_voice.pt") fall back to the American English pipeline instead of
    # failing with a KeyError.
    pipeline = pipelines.get(voice_name[:1], pipelines["a"])
    # The pipelines are shared objects; rebinding `.model` switches the device
    # used by this and any later call until it is rebound again.
    pipeline.model = models[True]
    generator = pipeline(text, voice=voice_path, speed=speed)
    for _, _, audio in generator:
        return audio
    return None
55
+
56
def _synthesize_first_chunk(pipeline, model, text, voice_path, speed):
    """Bind ``model`` to ``pipeline`` and return its first chunk as ``((24000, samples), phonemes)``, or ``(None, "")`` if nothing is produced."""
    pipeline.model = model
    for _, ps, audio in pipeline(text, voice=voice_path, speed=speed):
        return (24000, audio.numpy()), ps
    return None, ""


def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE, clone_voice_file=None):
    """Synthesize the first chunk of ``text`` and return it with its phonemes.

    Args:
        text: Text to synthesize.
        voice: File name of a voice inside ``VOICE_DIR``. Its first character
            selects the language pipeline.
        speed: Speed value forwarded to the pipeline.
        use_gpu: Request the GPU model; ignored when CUDA is unavailable.
        clone_voice_file: Optional path to an audio sample. When given and the
            resemblyzer encoder is available, an embedding of the sample is
            saved as ``cloned_voice.pt`` in ``VOICE_DIR`` and used as the voice.

    Returns:
        ``((24000, samples), phonemes)`` for the first chunk, or ``(None, "")``
        when the pipeline yields nothing. (24000 is presumably the sample rate
        in Hz expected by the audio component -- confirm.)

    Raises:
        FileNotFoundError: If the selected voice file does not exist.
        gr.Error: If synthesis on the CPU raises a Gradio error.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    pipeline = pipelines[voice[0]]

    if clone_voice_file is not None:
        if encoder is None:
            # Previously the sample was dropped without any trace.
            logger.warning("Voice cloning requested but resemblyzer is not available; using the selected voice.")
        else:
            try:
                # clone_voice_file is a file path (string) in Gradio with type="filepath"
                wav = preprocess_wav(clone_voice_file)
                # NOTE(review): the raw utterance embedding is stored as the
                # voice file; whether the pipeline accepts a tensor of this
                # shape as a voice is not established here -- confirm.
                cloned_voice = torch.tensor(encoder.embed_utterance(wav), device=device).unsqueeze(0)
                temp_voice_path = os.path.join(VOICE_DIR, "cloned_voice.pt")
                torch.save(cloned_voice, temp_voice_path)
                voice_path = temp_voice_path
            except Exception as e:
                # Deliberate best effort: any cloning failure falls back to the
                # voice chosen in the UI.
                logger.error(f"Error cloning voice: {e}")
                voice_path = os.path.join(VOICE_DIR, voice)

    use_gpu = bool(use_gpu and CUDA_AVAILABLE)
    try:
        # Both devices go through the same path so the GPU result (audio AND
        # phonemes) is actually returned instead of being discarded.
        return _synthesize_first_chunk(pipeline, models[use_gpu], text, voice_path, speed)
    except gr.exceptions.Error as e:
        if not use_gpu:
            raise gr.Error(e) from e
        gr.Warning(str(e))
        gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
        return _synthesize_first_chunk(pipeline, models[False], text, voice_path, speed)
96
+
97
def predict(text, voice="af_bella.pt", speed=1):
    """Run ``generate_first`` on the CPU and return only its audio element."""
    audio_out, _phonemes = generate_first(text, voice, speed, use_gpu=False)
    return audio_out
99
+
100
def tokenize_first(text, voice="af_bella.pt"):
    """Return the phoneme string of the first chunk the pipeline yields.

    Args:
        text: Text to run through the pipeline.
        voice: File name of a voice inside ``VOICE_DIR``; its first character
            selects the language pipeline.

    Returns:
        The phonemes of the first chunk, or ``""`` if nothing is yielded.

    Raises:
        FileNotFoundError: If the voice file does not exist.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if os.path.exists(voice_path):
        chunks = pipelines[voice[0]](text, voice=voice_path)
        # Only the first yielded item is consumed.
        return next((phonemes for _, phonemes, _ in chunks), "")
    raise FileNotFoundError(f"Voice file not found: {voice_path}")
110
+
111
def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield ``(24000, samples)`` for every chunk the pipeline produces.

    This is a generator, so the voice-file check below only runs once
    iteration starts.

    Args:
        text: Text to synthesize.
        voice: File name of a voice inside ``VOICE_DIR``; its first character
            selects the language pipeline.
        speed: Speed value forwarded to the pipeline.
        use_gpu: Request the GPU model; ignored when CUDA is unavailable.

    Yields:
        ``(24000, samples)`` tuples. Directly after the first chunk an extra
        one-sample block of zeros is yielded.

    Raises:
        FileNotFoundError: If the voice file does not exist.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    pipeline = pipelines[voice[0]]
    # Bind the model for the requested device before the pipeline is invoked.
    pipeline.model = models[True] if (use_gpu and CUDA_AVAILABLE) else models[False]
    for index, (_, _, audio) in enumerate(pipeline(text, voice=voice_path, speed=speed)):
        yield 24000, audio.numpy()
        if index == 0:
            # Extra near-empty block emitted once, right after the first chunk.
            yield 24000, torch.zeros(1).numpy()
129
+
130
# Load random quotes and sample texts
try:
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can garble or reject non-ASCII characters in the quotes.
    with open("en.txt", "r", encoding="utf-8") as r:
        # Blank lines are skipped so an empty string is never offered as a quote.
        random_quotes = [line.strip() for line in r if line.strip()]
except FileNotFoundError:
    random_quotes = []

if not random_quotes:
    # Missing or empty file: keep at least one quote so random.choice() cannot
    # fail on an empty list.
    random_quotes = ["Hello, this is a test of the Kokoro TTS system."]


def get_random_quote():
    """Return one entry of ``random_quotes`` picked at random."""
    return random.choice(random_quotes)
139
+
140
def get_gatsby():
    """Return the stripped contents of ``gatsby5k.md``.

    Returns:
        The file's text, or a fixed "not found" message when the file is
        missing from the working directory.
    """
    try:
        # Explicit UTF-8 so the text is decoded the same way on every platform
        # instead of depending on the locale's default encoding.
        with open("gatsby5k.md", "r", encoding="utf-8") as sample:
            return sample.read().strip()
    except FileNotFoundError:
        return "The Great Gatsby text not found."
146
+
147
def get_frankenstein():
    """Return the stripped contents of ``frankenstein5k.md``.

    Returns:
        The file's text, or a fixed "not found" message when the file is
        missing from the working directory.
    """
    try:
        # Explicit UTF-8 so the text is decoded the same way on every platform
        # instead of depending on the locale's default encoding.
        with open("frankenstein5k.md", "r", encoding="utf-8") as sample:
            return sample.read().strip()
    except FileNotFoundError:
        return "Frankenstein text not found."
153
+
154
# Dynamically load all .pt voice files from VOICE_DIR
def load_voice_choices(voice_dir=None):
    """Build the ``{label: file name}`` mapping of available voices.

    Args:
        voice_dir: Directory to scan for ``.pt`` files. Defaults to
            ``VOICE_DIR`` when omitted.

    Returns:
        A dict mapping a display label to the voice file name, in sorted file
        name order. The label is derived from the two-character prefix
        (``af``/``am``/``bf``/``bm``) plus the capitalized name after the
        third character; other prefixes are labelled ``Unknown <stem>``.
        An empty dict is returned when the directory cannot be listed, so the
        caller can apply its own fallback instead of crashing at import time.
    """
    if voice_dir is None:
        voice_dir = VOICE_DIR
    try:
        # Sorted so the resulting order does not depend on the filesystem.
        voice_files = sorted(f for f in os.listdir(voice_dir) if f.endswith('.pt'))
    except (FileNotFoundError, NotADirectoryError) as e:
        logging.getLogger(__name__).warning(f"Cannot list voice directory {voice_dir}: {e}")
        return {}

    # Prefix -> label decoration. The emoji are written as real characters;
    # the previous text carried mis-decoded byte sequences for the flags.
    prefix_labels = {
        'af': "🇺🇸 🚺",
        'am': "🇺🇸 🚹",
        'bf': "🇬🇧 🚺",
        'bm': "🇬🇧 🚹",
    }

    choices = {}
    for voice_file in voice_files:
        decoration = prefix_labels.get(voice_file[:2])
        if decoration is None:
            # [:-3] drops the ".pt" extension.
            label = f"Unknown {voice_file[:-3]}"
        else:
            # [3:-3] drops the "xx_" prefix and the ".pt" extension.
            label = f"{decoration} {voice_file[3:-3].capitalize()}"
        choices[label] = voice_file
    return choices
173
+
174
# Mapping of display label -> voice file name, used to populate the dropdown.
CHOICES = load_voice_choices()

# Log available voices
for label, voice_path in CHOICES.items():
    full_path = os.path.join(VOICE_DIR, voice_path)
    if not os.path.exists(full_path):
        logger.warning(f"Voice file not found: {full_path}")
    else:
        logger.info(f"Loaded voice: {label} ({voice_path})")

# If no voices are found, add a default fallback
if not CHOICES:
    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
    # Label written with real emoji; the previous text was mis-decoded.
    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
188
+
189
# Markdown help text shown next to the token output.
# Emoji, IPA characters and curly quotes are written as real characters; the
# previous text carried mis-decoded byte sequences in their place.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)

💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ

⬇️ Lower stress [1 level](-1) or [2 levels](-2)

⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
'''
198
+
199
# "Generate" tab: one-shot audio output, the phoneme/token readout and the
# buttons that drive them. Component creation order defines the layout.
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(
        label="Output Audio",
        interactive=False,
        streaming=False,
        autoplay=True,
    )
    generate_btn = gr.Button("Generate", variant="primary")
    with gr.Accordion("Output Tokens", open=True):
        out_ps = gr.Textbox(
            interactive=False,
            show_label=False,
            info="Tokens used to generate the audio, up to 510 context length.",
        )
        tokenize_btn = gr.Button("Tokenize", variant="secondary")
        gr.Markdown(TOKEN_NOTE)
        # Created hidden (visible=False).
        predict_btn = gr.Button("Predict", variant="secondary", visible=False)
208
+
209
# "Stream" tab: streaming audio output with start/stop controls and a note.
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(
        label="Output Audio Stream",
        interactive=False,
        streaming=True,
        autoplay=True,
    )
    with gr.Row():
        stream_btn = gr.Button("Stream", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")
    with gr.Accordion("Note", open=True):
        gr.Markdown(
            "⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream."
        )
        gr.DuplicateButton()
217
+
218
# Passed to both `queue(api_open=...)` and `launch(show_api=...)` at start-up.
API_OPEN = True

# Main layout: inputs in the left column, the Generate/Stream tabs in the
# right column, followed by the event wiring.
# Emoji in labels are written as real characters; the previous text carried
# mis-decoded byte sequences for several of them.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
            with gr.Row():
                # Dropdown entries are (label, file name) pairs; default to
                # af_bella.pt when present, otherwise the first voice found.
                voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
                                    info="Quality and availability vary by language")
                # The hardware selector is locked when CUDA is unavailable.
                use_gpu = gr.Dropdown(
                    [("GPU 🚀", True), ("CPU 🐌", False)],
                    value=CUDA_AVAILABLE,
                    label="Hardware",
                    info="GPU is usually faster, but may require CUDA support",
                    interactive=CUDA_AVAILABLE
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
            clone_voice_file = gr.File(label="Clone Voice Sample (Optional)", file_count="single", type="filepath")
            random_btn = gr.Button("🎲 Random Quote 💬", variant="secondary")
            with gr.Row():
                gatsby_btn = gr.Button("🥂 Gatsby 📕", variant="secondary")
                frankenstein_btn = gr.Button("💀 Frankenstein 📗", variant="secondary")
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])

    # Sample-text buttons fill the input textbox.
    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text])
    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text])
    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text])

    # Synthesis and tokenization.
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu, clone_voice_file],
                       outputs=[out_audio, out_ps])
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])

    # Streaming: keep the event handle so the Stop button can cancel it.
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
    stop_btn.click(fn=None, cancels=[stream_event])
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
250
+
251
if __name__ == "__main__":
    # Enable the queue first, then start the server on the local interface
    # and open it in the browser.
    queued_app = app.queue(api_open=API_OPEN)
    queued_app.launch(server_name="127.0.0.1", server_port=40001, show_api=API_OPEN, inbrowser=True)