Hassan-16 committed
Commit 1509739 · verified · 1 Parent(s): 976f3b9

Update app.py

Files changed (1)
  1. app.py +15 -35
app.py CHANGED
@@ -1,9 +1,9 @@
-from kokoro import KModel, KPipeline
 import gradio as gr
 import os
 import torch
 import logging
 import soundfile as sf
+from kokoro import KModel, KPipeline
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -23,30 +23,22 @@ CUDA_AVAILABLE = torch.cuda.is_available()
 device = "cuda" if CUDA_AVAILABLE else "cpu"
 logger.info(f"Using hardware: {device}")
 
-# Load models for CPU and GPU (if available)
-models = {gpu: KModel("hexgrad/Kokoro-82M").to("cuda" if gpu else "cpu").eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
+# Load a single model instance
+model = KModel("hexgrad/Kokoro-82M").to(device).eval()
 
 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
-    'a': KPipeline(model=models[False], lang_code='a', device='cpu'),  # American English
-    'b': KPipeline(model=models[False], lang_code='b', device='cpu')  # British English
+    'a': KPipeline(model=model, lang_code='a', device=device),  # American English
+    'b': KPipeline(model=model, lang_code='b', device=device)  # British English
 }
 
-# Set custom pronunciations for "kokoro" in both American and British modes
+# Set custom pronunciations for "kokoro"
 try:
     pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
     pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
 
-def forward_gpu(text, voice_path, speed):
-    pipeline = pipelines[voice_path[0]]
-    pipeline.model = models[True]  # Switch to GPU model
-    generator = pipeline(text, voice=voice_path, speed=speed)
-    for _, _, audio in generator:
-        return audio
-    return None
-
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
@@ -55,18 +47,14 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
-        if use_gpu:
-            audio = forward_gpu(text, voice_path, speed)
-        else:
-            pipeline.model = models[False]  # Ensure CPU model is used
-            generator = pipeline(text, voice=voice_path, speed=speed)
-            for _, ps, audio in generator:
-                return (24000, audio.numpy()), ps
+        generator = pipeline(text, voice=voice_path, speed=speed)
+        for _, ps, audio in generator:
+            return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
             gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
-            pipeline.model = models[False]  # Switch to CPU model
+            model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
                 return (24000, audio.numpy()), ps
@@ -74,9 +62,6 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
             raise gr.Error(e)
     return None, ""
 
-def predict(text, voice="af_bella.pt", speed=1):
-    return generate_first(text, voice, speed, use_gpu=False)[0]
-
 def tokenize_first(text, voice="af_bella.pt"):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
@@ -96,10 +81,8 @@ def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     first = True
-    if use_gpu:
-        pipeline.model = models[True]  # Switch to GPU model
-    else:
-        pipeline.model = models[False]  # Switch to CPU model
+    if not use_gpu:
+        model.to("cpu")
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
@@ -107,7 +90,7 @@ def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
         first = False
     yield 24000, torch.zeros(1).numpy()
 
-# Dynamically load all .pt voice files from VOICE_DIR
+# Dynamically load .pt voice files from VOICE_DIR
 def load_voice_choices():
     voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
     choices = {}
@@ -159,7 +142,6 @@ with gr.Blocks() as generate_tab:
                  info="Tokens used to generate the audio, up to 510 context length.")
     tokenize_btn = gr.Button("Tokenize", variant="secondary")
     gr.Markdown(TOKEN_NOTE)
-    predict_btn = gr.Button("Predict", variant="secondary", visible=False)
 
 with gr.Blocks() as stream_tab:
     out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
@@ -167,8 +149,7 @@ with gr.Blocks() as stream_tab:
     stream_btn = gr.Button("Stream", variant="primary")
     stop_btn = gr.Button("Stop", variant="stop")
     with gr.Accordion("Note", open=True):
-        gr.Markdown("⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream.")
-        gr.DuplicateButton()
+        gr.Markdown("⚠️ There may be delays in streaming audio due to processing limitations.")
 
 with gr.Blocks() as app:
     with gr.Row():
@@ -178,7 +159,7 @@ with gr.Blocks() as app:
             voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
                                 info="Quality and availability vary by language")
             use_gpu = gr.Dropdown(
-                [("GPU 🚀", True), ("CPU 🐌", False)],
+                [("GPU 🚀", True), ("CPU 🐌", False)],
                 value=CUDA_AVAILABLE,
                 label="Hardware",
                 info="GPU is usually faster, but may require CUDA support",
@@ -192,7 +173,6 @@ with gr.Blocks() as app:
     tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=[stream_event])
-    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
 
 if __name__ == "__main__":
     app.queue().launch()
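Note: after this change both pipelines wrap the same KModel object, so the weights are resident only once (the old version kept a CPU copy plus a second GPU copy when CUDA was available). A quick hypothetical check of that sharing, using plain torch introspection and assuming KPipeline keeps its model on a .model attribute, as the pre-change pipeline.model assignments suggest:

assert pipelines['a'].model is pipelines['b'].model  # one shared instance, not copies
print(next(model.parameters()).device)  # the device selected at startup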
 
 
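One behavioral consequence worth flagging: the retry path in generate_first calls model.to("cpu") on the shared instance and nothing ever moves it back, so after a single GPU-side gr.exceptions.Error all later requests, including streaming ones, run on CPU. If that is not intended, the move could be made temporary; a minimal sketch with a hypothetical on_device helper, not part of this commit:

from contextlib import contextmanager

@contextmanager
def on_device(module, target):
    # Temporarily move a torch.nn.Module, restoring its previous device on exit.
    previous = next(module.parameters()).device
    module.to(target)
    try:
        yield module
    finally:
        module.to(previous)

# Possible use in the except branch of generate_first:
# with on_device(model, "cpu"):
#     generator = pipeline(text, voice=voice_path, speed=speed)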
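For readers unfamiliar with the streaming path: generate_all is a generator that yields (24000, chunk) tuples, the format a gr.Audio(streaming=True) output consumes chunk by chunk, and the trailing yield of torch.zeros(1).numpy() appears to flush the stream with a one-sample silent tail. A self-contained toy example of the same Gradio pattern (a sine-wave generator, nothing Kokoro-specific):

import numpy as np
import gradio as gr

def tone_stream(freq):
    # Yield (sample_rate, samples) chunks; gr.Audio(streaming=True) plays them as they arrive.
    sr, seconds, chunk = 24000, 2, 2400
    t = np.arange(sr * seconds) / sr
    wave = (0.2 * np.sin(2 * np.pi * freq * t)).astype(np.float32)
    for start in range(0, wave.size, chunk):
        yield sr, wave[start:start + chunk]

demo = gr.Interface(fn=tone_stream,
                    inputs=gr.Slider(100, 1000, value=440, label="Frequency (Hz)"),
                    outputs=gr.Audio(streaming=True, autoplay=True))
# demo.queue().launch()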
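Finally, a hypothetical local smoke test for the refactored generate_first (not part of the commit; it assumes app.py imports cleanly and a voice file such as af_bella.pt exists under VOICE_DIR):

import soundfile as sf
from app import generate_first

# generate_first returns ((sample_rate, samples), phonemes) on success.
(sr, samples), phonemes = generate_first("Kokoro is a small text-to-speech model.",
                                         voice="af_bella.pt", speed=1)
sf.write("smoke_test.wav", samples, sr)  # writes 24 kHz audio to disk
print(phonemes)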