Hassan-16 committed on
Commit db34aa6 · verified · 1 Parent(s): 21977f5

Update app.py

Files changed (1)
  1. app.py +83 -179
app.py CHANGED
@@ -3,7 +3,6 @@ import os
 import torch
 import logging
 import soundfile as sf
-import time
 from kokoro import KModel, KPipeline

 # Configure logging
@@ -25,13 +24,7 @@ device = "cuda" if CUDA_AVAILABLE else "cpu"
 logger.info(f"Using hardware: {device}")

 # Load a single model instance
-try:
-    start_time = time.time()
-    model = KModel("hexgrad/Kokoro-82M").to(device).eval()
-    logger.info(f"Model loading time: {time.time() - start_time} seconds")
-except Exception as e:
-    logger.error(f"Failed to load model: {e}")
-    raise
+model = KModel("hexgrad/Kokoro-82M").to(device).eval()

 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
@@ -46,81 +39,33 @@ try:
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")

-# Cache voice choices to avoid repeated file scanning
-VOICE_CHOICES = None
-def load_voice_choices():
-    global VOICE_CHOICES
-    if VOICE_CHOICES is not None:
-        return VOICE_CHOICES
-    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
-    choices = {}
-    for voice_file in voice_files:
-        prefix = voice_file[:2]
-        if prefix == 'af':
-            label = f"🇺🇸 Female: {voice_file[3:-3].capitalize()}"
-        elif prefix == 'am':
-            label = f"🇺🇸 Male: {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bf':
-            label = f"🇬🇧 Female: {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bm':
-            label = f"🇬🇧 Male: {voice_file[3:-3].capitalize()}"
-        else:
-            label = f"Unknown: {voice_file[:-3]}"
-        choices[label] = voice_file
-    if not choices:
-        logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
-        choices = {"🇺🇸 Female: Bella": "af_bella.pt"}
-    VOICE_CHOICES = choices
-    return choices
-
-CHOICES = load_voice_choices()
-
-# Log available voices
-for label, voice_path in CHOICES.items():
-    full_path = os.path.join(VOICE_DIR, voice_path)
-    if not os.path.exists(full_path):
-        logger.warning(f"Voice file not found: {full_path}")
-    else:
-        logger.info(f"Loaded voice: {label} ({voice_path})")
-
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
-    start_time = time.time()
-    if len(text) > 510:
-        text = text[:510]
-        gr.Warning("Text truncated to 510 characters for faster processing.")
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
-        raise gr.Error(f"Voice file not found: {voice_path}")
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")

     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
-        if not use_gpu and model.device.type != "cpu":
-            model.to("cpu")
         generator = pipeline(text, voice=voice_path, speed=speed)
         for _, ps, audio in generator:
-            logger.info(f"Generation time: {time.time() - start_time} seconds")
             return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
-            gr.Info("Retrying with CPU.")
+            gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
             model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
-                logger.info(f"Generation time (CPU retry): {time.time() - start_time} seconds")
                 return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
     return None, ""

 def tokenize_first(text, voice="af_bella.pt"):
-    if len(text) > 510:
-        text = text[:510]
-        gr.Warning("Text truncated to 510 characters for faster processing.")
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
-        raise gr.Error(f"Voice file not found: {voice_path}")
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")

     pipeline = pipelines[voice[0]]
     generator = pipeline(text, voice=voice_path)
@@ -129,146 +74,105 @@ def tokenize_first(text, voice="af_bella.pt"):
     return ""

 def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
-    start_time = time.time()
-    if len(text) > 510:
-        text = text[:510]
-        gr.Warning("Text truncated to 510 characters for faster processing.")
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
-        raise gr.Error(f"Voice file not found: {voice_path}")
+        raise FileNotFoundError(f"Voice file not found: {voice_path}")

     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
-    if not use_gpu and model.device.type != "cpu":
-        model.to("cpu")
     first = True
+    if not use_gpu:
+        model.to("cpu")
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
         if first:
             first = False
             yield 24000, torch.zeros(1).numpy()
-    logger.info(f"Streaming generation time: {time.time() - start_time} seconds")

-TOKEN_NOTE = '''
-**How to Customize Pronunciation**
-- Use Markdown link syntax, e.g., `[Kokoro](/kˈOkəɹO/)` for custom pronunciation.
-- Adjust intonation with punctuation: `;:,.!?—…"()""`.
-- Control stress: `[word](-1)` or `[word](-2)` to lower, `[word](+1)` or `[word](+2)` to raise stress.
-'''
+# Dynamically load .pt voice files from VOICE_DIR
+def load_voice_choices():
+    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
+    choices = {}
+    for voice_file in voice_files:
+        prefix = voice_file[:2]
+        if prefix == 'af':
+            label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'am':
+            label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bf':
+            label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bm':
+            label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
+        else:
+            label = f"Unknown {voice_file[:-3]}"
+        choices[label] = voice_file
+    return choices

-with gr.Blocks(theme="huggingface", css=".gr-button-primary {background-color: #1e88e5 !important; color: white !important;}") as app:
-    gr.Markdown("# Kokoro TTS: Text-to-Speech Generator")
-    gr.Markdown("Enter text and select a voice to generate high-quality audio. Adjust speed for faster or slower speech.")
-
-    with gr.Column():
-        text = gr.Textbox(
-            label="Input Text",
-            value=TEXT,
-            placeholder="Type your text here (max 510 characters)",
-            lines=3,
-            max_lines=5,
-            info="Enter text to convert to speech."
-        )
-        with gr.Row():
-            voice = gr.Dropdown(
-                choices=list(CHOICES.items()),
-                value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0],
-                label="Voice",
-                info="Choose a voice for the audio output."
-            )
-            use_gpu = gr.Dropdown(
-                choices=[("GPU 🚀 (Faster)", True), ("CPU 🐌 (Slower)", False)],
-                value=CUDA_AVAILABLE,
-                label="Hardware",
-                info="GPU is faster but requires CUDA support.",
-                interactive=CUDA_AVAILABLE
-            )
-        speed = gr.Slider(
-            minimum=0.5,
-            maximum=2,
-            value=1,
-            step=0.1,
-            label="Speech Speed",
-            info="Adjust the speed of the generated audio (0.5 = slower, 2 = faster)."
-        )
+CHOICES = load_voice_choices()

-    with gr.Tabs():
-        with gr.Tab(label="Generate Audio"):
-            out_audio = gr.Audio(
-                label="Generated Audio",
-                interactive=False,
-                streaming=False,
-                autoplay=True
-            )
-            status = gr.Textbox(
-                value="Ready to generate audio.",
-                label="Status",
-                interactive=False
-            )
-            generate_btn = gr.Button("Generate Audio", variant="primary")
-            with gr.Accordion("Pronunciation Tokens", open=False):
-                out_ps = gr.Textbox(
-                    interactive=False,
-                    show_label=False,
-                    info="Tokens used to generate the audio."
-                )
-                tokenize_btn = gr.Button("Show Tokens", variant="secondary")
-            gr.Markdown(TOKEN_NOTE)
+# Log available voices
+for label, voice_path in CHOICES.items():
+    full_path = os.path.join(VOICE_DIR, voice_path)
+    if not os.path.exists(full_path):
+        logger.warning(f"Voice file not found: {full_path}")
+    else:
+        logger.info(f"Loaded voice: {label} ({voice_path})")

-        with gr.Tab(label="Stream Audio"):
-            out_stream = gr.Audio(
-                label="Streaming Audio",
-                interactive=False,
-                streaming=True,
-                autoplay=True
-            )
-            status_stream = gr.Textbox(
-                value="Ready to stream audio.",
-                label="Status",
-                interactive=False
-            )
-            with gr.Row():
-                stream_btn = gr.Button("Start Streaming", variant="primary")
-                stop_btn = gr.Button("Stop Streaming", variant="stop")
-            gr.Markdown("⚠️ Streaming may have slight delays due to processing.")
+# If no voices are found, add a default fallback
+if not CHOICES:
+    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
+    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}

-    def update_status_generate(text, voice, speed, use_gpu):
-        status.value = "Generating audio..."
-        result, ps = generate_first(text, voice, speed, use_gpu)
-        status.value = "Audio generated successfully!" if result else "Failed to generate audio."
-        return result, ps
+TOKEN_NOTE = '''
+💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)

-    def update_status_tokenize(text, voice):
-        status.value = "Tokenizing text..."
-        result = tokenize_first(text, voice)
-        status.value = "Tokenization complete!" if result else "Failed to tokenize."
-        return result
+💬 To adjust intonation, try punctuation ;:,.!?—…"()"" or stress ˈ and ˌ

-    def update_status_stream(text, voice, speed, use_gpu):
-        status_stream.value = "Starting audio stream..."
-        for audio in generate_all(text, voice, speed, use_gpu):
-            yield audio
-        status_stream.value = "Streaming complete!"
+⬇️ Lower stress [1 level](-1) or [2 levels](-2)

-    generate_btn.click(
-        fn=update_status_generate,
-        inputs=[text, voice, speed, use_gpu],
-        outputs=[out_audio, out_ps, status]
-    )
-    tokenize_btn.click(
-        fn=update_status_tokenize,
-        inputs=[text, voice],
-        outputs=[out_ps, status]
-    )
-    stream_event = stream_btn.click(
-        fn=update_status_stream,
-        inputs=[text, voice, speed, use_gpu],
-        outputs=[out_stream, status_stream]
-    )
+⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
+'''
+
+with gr.Blocks() as generate_tab:
+    out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
+    generate_btn = gr.Button("Generate", variant="primary")
+    with gr.Accordion("Output Tokens", open=True):
+        out_ps = gr.Textbox(interactive=False, show_label=False,
+                            info="Tokens used to generate the audio, up to 510 context length.")
+        tokenize_btn = gr.Button("Tokenize", variant="secondary")
+        gr.Markdown(TOKEN_NOTE)
+
+with gr.Blocks() as stream_tab:
+    out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
+    with gr.Row():
+        stream_btn = gr.Button("Stream", variant="primary")
+        stop_btn = gr.Button("Stop", variant="stop")
+    with gr.Accordion("Note", open=True):
+        gr.Markdown("⚠️ There may be delays in streaming audio due to processing limitations.")
+
+with gr.Blocks() as app:
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
+            with gr.Row():
+                voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
+                                    info="Quality and availability vary by language")
+                use_gpu = gr.Dropdown(
+                    [("GPU 🚀", True), ("CPU 🐌", False)],
+                    value=CUDA_AVAILABLE,
+                    label="Hardware",
+                    info="GPU is usually faster, but may require CUDA support",
+                    interactive=CUDA_AVAILABLE
+                )
+            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
+        with gr.Column():
+            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
+    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu],
+                       outputs=[out_audio, out_ps])
+    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
+    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=[stream_event])

 if __name__ == "__main__":
-    logger.info("Starting Gradio app...")
-    app.launch(queue=False)
-    logger.info("Gradio app started.")
+    app.queue().launch()
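
For reference, a minimal standalone sketch of the prefix-to-label rule that the new load_voice_choices() applies to .pt voice files; it is separate from the diff above, and the file names in the loop are hypothetical examples:

# Illustrative sketch of the labeling scheme used by load_voice_choices():
# 'af'/'am' map to US female/male voices, 'bf'/'bm' to GB female/male voices.
PREFIX_LABELS = {'af': '🇺🇸 🚺', 'am': '🇺🇸 🚹', 'bf': '🇬🇧 🚺', 'bm': '🇬🇧 🚹'}

def label_for(voice_file: str) -> str:
    # e.g. 'af_bella.pt' -> '🇺🇸 🚺 Bella': take the two-letter prefix,
    # then strip the 'xx_' prefix and '.pt' suffix from the name.
    prefix = voice_file[:2]
    name = voice_file[3:-3].capitalize()
    return f"{PREFIX_LABELS[prefix]} {name}"

# Hypothetical file names, for illustration only
for f in ["af_bella.pt", "bm_george.pt"]:
    print(label_for(f))  # 🇺🇸 🚺 Bella, then 🇬🇧 🚹 George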