Update app.py
app.py
CHANGED
@@ -1,9 +1,9 @@
-from kokoro import KModel, KPipeline
 import gradio as gr
 import os
 import torch
 import logging
 import soundfile as sf
+from kokoro import KModel, KPipeline

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -23,30 +23,22 @@ CUDA_AVAILABLE = torch.cuda.is_available()
 device = "cuda" if CUDA_AVAILABLE else "cpu"
 logger.info(f"Using hardware: {device}")

-# Load
-
+# Load a single model instance
+model = KModel("hexgrad/Kokoro-82M").to(device).eval()

 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
-    'a': KPipeline(model=
-    'b': KPipeline(model=
+    'a': KPipeline(model=model, lang_code='a', device=device),  # American English
+    'b': KPipeline(model=model, lang_code='b', device=device)   # British English
 }

-# Set custom pronunciations for "kokoro"
+# Set custom pronunciations for "kokoro"
 try:
     pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
     pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")

-def forward_gpu(text, voice_path, speed):
-    pipeline = pipelines[voice_path[0]]
-    pipeline.model = models[True]  # Switch to GPU model
-    generator = pipeline(text, voice=voice_path, speed=speed)
-    for _, _, audio in generator:
-        return audio
-    return None
-
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
@@ -55,18 +47,14 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
-
-
-
-        pipeline.model = models[False]  # Ensure CPU model is used
-        generator = pipeline(text, voice=voice_path, speed=speed)
-        for _, ps, audio in generator:
-            return (24000, audio.numpy()), ps
+        generator = pipeline(text, voice=voice_path, speed=speed)
+        for _, ps, audio in generator:
+            return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
             gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
-
+            model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
                 return (24000, audio.numpy()), ps
@@ -74,9 +62,6 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
            raise gr.Error(e)
     return None, ""

-def predict(text, voice="af_bella.pt", speed=1):
-    return generate_first(text, voice, speed, use_gpu=False)[0]
-
 def tokenize_first(text, voice="af_bella.pt"):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
@@ -96,10 +81,8 @@ def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     first = True
-    if use_gpu:
-
-    else:
-        pipeline.model = models[False]  # Switch to CPU model
+    if not use_gpu:
+        model.to("cpu")
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
@@ -107,7 +90,7 @@ def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
            first = False
            yield 24000, torch.zeros(1).numpy()

-# Dynamically load
+# Dynamically load .pt voice files from VOICE_DIR
 def load_voice_choices():
     voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
     choices = {}
@@ -159,7 +142,6 @@ with gr.Blocks() as generate_tab:
                      info="Tokens used to generate the audio, up to 510 context length.")
     tokenize_btn = gr.Button("Tokenize", variant="secondary")
     gr.Markdown(TOKEN_NOTE)
-    predict_btn = gr.Button("Predict", variant="secondary", visible=False)

 with gr.Blocks() as stream_tab:
     out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
@@ -167,8 +149,7 @@ with gr.Blocks() as stream_tab:
     stream_btn = gr.Button("Stream", variant="primary")
     stop_btn = gr.Button("Stop", variant="stop")
     with gr.Accordion("Note", open=True):
-        gr.Markdown("⚠️ There
-        gr.DuplicateButton()
+        gr.Markdown("⚠️ There may be delays in streaming audio due to processing limitations.")

 with gr.Blocks() as app:
     with gr.Row():
@@ -178,7 +159,7 @@ with gr.Blocks() as app:
         voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
                             info="Quality and availability vary by language")
         use_gpu = gr.Dropdown(
-            [("GPU
+            [("GPU 🚀", True), ("CPU 🐌", False)],
             value=CUDA_AVAILABLE,
             label="Hardware",
             info="GPU is usually faster, but may require CUDA support",
@@ -192,7 +173,6 @@ with gr.Blocks() as app:
     tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
     stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=[stream_event])
-    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])

 if __name__ == "__main__":
     app.queue().launch()
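The net effect of this change: the separate per-device models dict and the forward_gpu helper are replaced by a single KModel instance shared by both pipelines, with model.to("cpu") as the runtime fallback when GPU generation fails. Below is a minimal sketch of how the resulting generate_first entry point might be exercised; the module name app, the voice file af_bella.pt, and the output path are illustrative assumptions, not part of this commit.

import soundfile as sf

from app import generate_first  # assumes this file is saved as app.py

# Per the diff, generate_first returns ((24000, audio_array), phonemes)
# on success, or (None, "") when no audio is produced.
result, phonemes = generate_first("Hello from kokoro.", voice="af_bella.pt",
                                  speed=1, use_gpu=False)
if result is not None:
    sample_rate, audio = result
    sf.write("output.wav", audio, sample_rate)  # write the 24 kHz waveform
    print(phonemes)  # tokens used to generate the audio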