Hassan-16 committed on
Commit
ee617da
·
verified ·
1 Parent(s): cbcfe99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -257
app.py CHANGED
@@ -1,257 +1,15 @@
1
- from kokoro import KModel, KPipeline
2
- import gradio as gr
3
- import os
4
- import random
5
- import torch
6
- import logging
7
- import soundfile as sf
8
-
9
- # Optional: import Resemblyzer for voice cloning (install via pip install resemblyzer)
10
- try:
11
- from resemblyzer import VoiceEncoder, preprocess_wav
12
- encoder = VoiceEncoder()
13
- except ImportError:
14
- encoder = None
15
-
16
- # Configuration
17
- VOICE_DIR = r"D:\New folder (2)\model\voices"
18
- OUTPUT_DIR = r"D:\New folder (2)\output_audio"
19
- TEXT = "Hello, this is a test of the Kokoro TTS system."
20
-
21
- # Configure logging
22
- logging.basicConfig(level=logging.INFO)
23
- logger = logging.getLogger(__name__)
24
-
25
- # Device setup
26
- CUDA_AVAILABLE = torch.cuda.is_available()
27
- device = "cuda" if CUDA_AVAILABLE else "cpu"
28
- logger.info(f"Using hardware: {device}")
29
-
30
- # Load models for CPU and GPU (if available)
31
- models = {gpu: KModel("hexgrad/Kokoro-82M").to("cuda" if gpu else "cpu").eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
32
-
33
- # Define pipelines for American ('a') and British ('b') English
34
- pipelines = {
35
- 'a': KPipeline(model=models[False], lang_code='a', device='cpu'), # American English
36
- 'b': KPipeline(model=models[False], lang_code='b', device='cpu') # British English
37
- }
38
-
39
- # Set custom pronunciations for "kokoro" in both American and British modes
40
- try:
41
- pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
42
- pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
43
- except AttributeError as e:
44
- logger.warning(f"Could not set custom pronunciations: {e}")
45
-
46
- def forward_gpu(text, voice_path, speed):
47
- # Use the GPU model directly without spaces.GPU decorator
48
- pipeline = pipelines[voice_path[0]]
49
- # Ensure the pipeline uses the GPU model
50
- pipeline.model = models[True] # Switch to GPU model
51
- generator = pipeline(text, voice=voice_path, speed=speed)
52
- for _, _, audio in generator:
53
- return audio
54
- return None
55
-
56
- def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE, clone_voice_file=None):
57
- voice_path = os.path.join(VOICE_DIR, voice)
58
- if not os.path.exists(voice_path):
59
- raise FileNotFoundError(f"Voice file not found: {voice_path}")
60
-
61
- pipeline = pipelines[voice[0]]
62
-
63
- # If a clone file is provided and the encoder is available, try to clone the voice
64
- if clone_voice_file is not None and encoder is not None:
65
- try:
66
- # clone_voice_file is a file path (string) in Gradio with type="filepath"
67
- wav = preprocess_wav(clone_voice_file)
68
- cloned_voice = torch.tensor(encoder.embed_utterance(wav), device=device).unsqueeze(0)
69
- temp_voice_path = os.path.join(VOICE_DIR, "cloned_voice.pt")
70
- torch.save(cloned_voice, temp_voice_path)
71
- voice_path = temp_voice_path
72
- except Exception as e:
73
- logger.error(f"Error cloning voice: {e}")
74
- voice_path = os.path.join(VOICE_DIR, voice)
75
-
76
- use_gpu = use_gpu and CUDA_AVAILABLE
77
- try:
78
- if use_gpu:
79
- audio = forward_gpu(text, voice_path, speed)
80
- else:
81
- pipeline.model = models[False] # Ensure CPU model is used
82
- generator = pipeline(text, voice=voice_path, speed=speed)
83
- for _, ps, audio in generator:
84
- return (24000, audio.numpy()), ps
85
- except gr.exceptions.Error as e:
86
- if use_gpu:
87
- gr.Warning(str(e))
88
- gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
89
- pipeline.model = models[False] # Switch to CPU model
90
- generator = pipeline(text, voice=voice_path, speed=speed)
91
- for _, ps, audio in generator:
92
- return (24000, audio.numpy()), ps
93
- else:
94
- raise gr.Error(e)
95
- return None, ""
96
-
97
- def predict(text, voice="af_bella.pt", speed=1):
98
- return generate_first(text, voice, speed, use_gpu=False)[0]
99
-
100
- def tokenize_first(text, voice="af_bella.pt"):
101
- voice_path = os.path.join(VOICE_DIR, voice)
102
- if not os.path.exists(voice_path):
103
- raise FileNotFoundError(f"Voice file not found: {voice_path}")
104
-
105
- pipeline = pipelines[voice[0]]
106
- generator = pipeline(text, voice=voice_path)
107
- for _, ps, _ in generator:
108
- return ps
109
- return ""
110
-
111
- def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
112
- voice_path = os.path.join(VOICE_DIR, voice)
113
- if not os.path.exists(voice_path):
114
- raise FileNotFoundError(f"Voice file not found: {voice_path}")
115
-
116
- pipeline = pipelines[voice[0]]
117
- use_gpu = use_gpu and CUDA_AVAILABLE
118
- first = True
119
- if use_gpu:
120
- pipeline.model = models[True] # Switch to GPU model
121
- else:
122
- pipeline.model = models[False] # Switch to CPU model
123
- generator = pipeline(text, voice=voice_path, speed=speed)
124
- for _, _, audio in generator:
125
- yield 24000, audio.numpy()
126
- if first:
127
- first = False
128
- yield 24000, torch.zeros(1).numpy()
129
-
130
- # Load random quotes and sample texts
131
- try:
132
- with open("en.txt", "r") as r:
133
- random_quotes = [line.strip() for line in r]
134
- except FileNotFoundError:
135
- random_quotes = ["Hello, this is a test of the Kokoro TTS system."]
136
-
137
- def get_random_quote():
138
- return random.choice(random_quotes)
139
-
140
- def get_gatsby():
141
- try:
142
- with open("gatsby5k.md", "r") as r:
143
- return r.read().strip()
144
- except FileNotFoundError:
145
- return "The Great Gatsby text not found."
146
-
147
- def get_frankenstein():
148
- try:
149
- with open("frankenstein5k.md", "r") as r:
150
- return r.read().strip()
151
- except FileNotFoundError:
152
- return "Frankenstein text not found."
153
-
154
- # Dynamically load all .pt voice files from VOICE_DIR
155
- def load_voice_choices():
156
- voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
157
- choices = {}
158
- for voice_file in voice_files:
159
- # Determine the voice type based on the prefix
160
- prefix = voice_file[:2]
161
- if prefix == 'af':
162
- label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
163
- elif prefix == 'am':
164
- label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
165
- elif prefix == 'bf':
166
- label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
167
- elif prefix == 'bm':
168
- label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
169
- else:
170
- label = f"Unknown {voice_file[:-3]}"
171
- choices[label] = voice_file
172
- return choices
173
-
174
- CHOICES = load_voice_choices()
175
-
176
- # Log available voices
177
- for label, voice_path in CHOICES.items():
178
- full_path = os.path.join(VOICE_DIR, voice_path)
179
- if not os.path.exists(full_path):
180
- logger.warning(f"Voice file not found: {full_path}")
181
- else:
182
- logger.info(f"Loaded voice: {label} ({voice_path})")
183
-
184
- # If no voices are found, add a default fallback
185
- if not CHOICES:
186
- logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
187
- CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
188
-
189
- TOKEN_NOTE = '''
190
- 💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
191
-
192
- 💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
193
-
194
- ⬇️ Lower stress [1 level](-1) or [2 levels](-2)
195
-
196
- ⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
197
- '''
198
-
199
- with gr.Blocks() as generate_tab:
200
- out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
201
- generate_btn = gr.Button("Generate", variant="primary")
202
- with gr.Accordion("Output Tokens", open=True):
203
- out_ps = gr.Textbox(interactive=False, show_label=False,
204
- info="Tokens used to generate the audio, up to 510 context length.")
205
- tokenize_btn = gr.Button("Tokenize", variant="secondary")
206
- gr.Markdown(TOKEN_NOTE)
207
- predict_btn = gr.Button("Predict", variant="secondary", visible=False)
208
-
209
- with gr.Blocks() as stream_tab:
210
- out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
211
- with gr.Row():
212
- stream_btn = gr.Button("Stream", variant="primary")
213
- stop_btn = gr.Button("Stop", variant="stop")
214
- with gr.Accordion("Note", open=True):
215
- gr.Markdown("⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream.")
216
- gr.DuplicateButton()
217
-
218
- API_OPEN = True
219
- with gr.Blocks() as app:
220
- with gr.Row():
221
- with gr.Column():
222
- text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
223
- with gr.Row():
224
- voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
225
- info="Quality and availability vary by language")
226
- use_gpu = gr.Dropdown(
227
- [("GPU 🚀", True), ("CPU 🐌", False)],
228
- value=CUDA_AVAILABLE,
229
- label="Hardware",
230
- info="GPU is usually faster, but may require CUDA support",
231
- interactive=CUDA_AVAILABLE
232
- )
233
- speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
234
- clone_voice_file = gr.File(label="Clone Voice Sample (Optional)", file_count="single", type="filepath")
235
- random_btn = gr.Button("🎲 Random Quote 💬", variant="secondary")
236
- with gr.Row():
237
- gatsby_btn = gr.Button("🥂 Gatsby 📕", variant="secondary")
238
- frankenstein_btn = gr.Button("💀 Frankenstein 📗", variant="secondary")
239
- with gr.Column():
240
- gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
241
- random_btn.click(fn=get_random_quote, inputs=[], outputs=[text])
242
- gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text])
243
- frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text])
244
- generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu, clone_voice_file],
245
- outputs=[out_audio, out_ps])
246
- tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
247
- stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
248
- stop_btn.click(fn=None, cancels=[stream_event])
249
- predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
250
-
251
- if __name__ == "__main__":
252
- app.queue(api_open=API_OPEN).launch(
253
- server_name="127.0.0.1",
254
- server_port=40001,
255
- show_api=API_OPEN,
256
- inbrowser=True
257
- )
 
1
+ import os
2
+ import zipfile
3
+
4
+ # Path to the zip file and extraction directory
5
+ zip_path = "model/kokoro-v1_0.zip"
6
+ extract_dir = "model/"
7
+
8
+ # Check if the zip file exists and extract it
9
+ if os.path.exists(zip_path):
10
+ print(f"Extracting {zip_path}...")
11
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
12
+ zip_ref.extractall(extract_dir)
13
+ print(f"Extraction completed. Files extracted to {extract_dir}")
14
+ else:
15
+ print(f"File {zip_path} does not exist.")