Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -179,9 +179,9 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
|
|
179 |
return None, "Error: Please upload a reference audio file (.wav or .mp3)."
|
180 |
|
181 |
try:
|
182 |
-
print(f"
|
183 |
print(f" Text: '{text_to_speak}'")
|
184 |
-
print(f" Audio
|
185 |
print(f" Exaggeration: {exaggeration}")
|
186 |
print(f" CFG/Pace: {cfg_pace}")
|
187 |
print(f" Random Seed: {random_seed}")
|
@@ -206,7 +206,7 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
|
|
206 |
except:
|
207 |
sample_rate = 24000
|
208 |
|
209 |
-
print(f"Audio generated successfully
|
210 |
|
211 |
if isinstance(output_wav_data, str):
|
212 |
return output_wav_data, "Success: Audio generated successfully!"
|
@@ -219,12 +219,11 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
|
|
219 |
return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
|
220 |
|
221 |
except Exception as e:
|
222 |
-
print(f"ERROR: Failed during audio generation
|
223 |
-
print("Detailed error trace for audio generation
|
224 |
traceback.print_exc()
|
225 |
return None, f"Error during audio generation: {str(e)}. Check logs for more details."
|
226 |
|
227 |
-
# Updated clone_voice_api function with detailed logging
|
228 |
def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
|
229 |
import requests
|
230 |
import tempfile
|
@@ -233,120 +232,60 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
|
|
233 |
|
234 |
temp_audio_path = None
|
235 |
try:
|
236 |
-
print(f"=== API CALL DEBUG ===")
|
237 |
-
print(f"Text: {text_to_speak}")
|
238 |
-
print(f"Audio URL type: {type(reference_audio_url)}")
|
239 |
-
print(f"Audio URL length: {len(str(reference_audio_url)) if reference_audio_url else 0}")
|
240 |
-
print(f"Audio URL preview: {str(reference_audio_url)[:100]}...")
|
241 |
-
print(f"Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
|
242 |
-
|
243 |
-
# Validate inputs
|
244 |
-
if not text_to_speak or text_to_speak.strip() == "":
|
245 |
-
return None, "Error: Please enter some text to speak."
|
246 |
-
|
247 |
-
if not reference_audio_url:
|
248 |
-
return None, "Error: Please provide reference audio."
|
249 |
-
|
250 |
-
print("Processing audio data...")
|
251 |
-
|
252 |
if reference_audio_url.startswith('data:audio'):
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
elif 'wav' in header:
|
265 |
-
ext = '.wav'
|
266 |
-
else:
|
267 |
-
ext = '.wav'
|
268 |
-
|
269 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
|
270 |
-
temp_file.write(audio_data)
|
271 |
-
temp_audio_path = temp_file.name
|
272 |
-
|
273 |
-
print(f"Created temporary audio file: {temp_audio_path}")
|
274 |
-
print(f"File exists: {os.path.exists(temp_audio_path)}")
|
275 |
-
print(f"File size: {os.path.getsize(temp_audio_path)} bytes")
|
276 |
-
|
277 |
-
except Exception as audio_error:
|
278 |
-
print(f"Audio processing error: {audio_error}")
|
279 |
-
return None, f"Error processing audio data: {str(audio_error)}"
|
280 |
-
|
281 |
elif reference_audio_url.startswith('http'):
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
temp_file.write(response.content)
|
294 |
-
temp_audio_path = temp_file.name
|
295 |
-
print(f"Downloaded audio to: {temp_audio_path}")
|
296 |
-
except Exception as download_error:
|
297 |
-
print(f"Download error: {download_error}")
|
298 |
-
return None, f"Error downloading audio: {str(download_error)}"
|
299 |
else:
|
300 |
-
print("Using direct file path...")
|
301 |
temp_audio_path = reference_audio_url
|
302 |
|
303 |
-
print(f"Calling clone_voice with:")
|
304 |
-
print(f" Text: {text_to_speak}")
|
305 |
-
print(f" Audio path: {temp_audio_path}")
|
306 |
-
print(f" Parameters: {exaggeration}, {cfg_pace}, {random_seed}, {temperature}")
|
307 |
-
|
308 |
-
# Call the main function
|
309 |
audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
|
310 |
-
|
311 |
-
print(f"clone_voice returned:")
|
312 |
-
print(f" Audio output type: {type(audio_output)}")
|
313 |
-
print(f" Status: {status}")
|
314 |
|
315 |
-
# Cleanup
|
316 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
317 |
try:
|
318 |
os.unlink(temp_audio_path)
|
319 |
-
|
320 |
-
|
321 |
-
print(f"Cleanup error: {cleanup_error}")
|
322 |
-
|
323 |
return audio_output, status
|
324 |
-
|
325 |
except Exception as e:
|
326 |
-
print(f"=== CRITICAL ERROR ===")
|
327 |
-
print(f"Error type: {type(e)}")
|
328 |
-
print(f"Error message: {str(e)}")
|
329 |
-
import traceback
|
330 |
-
traceback.print_exc()
|
331 |
-
|
332 |
-
# Cleanup on error
|
333 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
334 |
try:
|
335 |
os.unlink(temp_audio_path)
|
336 |
except:
|
337 |
pass
|
338 |
-
|
339 |
return None, f"API Error: {str(e)}"
|
340 |
|
341 |
def main():
|
342 |
print("Starting Advanced Gradio interface...")
|
343 |
-
|
344 |
-
|
|
|
345 |
gr.Markdown("# ποΈ Advanced Chatterbox Voice Cloning")
|
346 |
gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
|
347 |
-
|
348 |
with gr.Row():
|
349 |
with gr.Column(scale=2):
|
|
|
350 |
text_input = gr.Textbox(
|
351 |
label="Text to Speak",
|
352 |
placeholder="Enter the text you want the cloned voice to say...",
|
@@ -357,9 +296,10 @@ def main():
|
|
357 |
label="Reference Audio (Upload a short .wav or .mp3 clip)",
|
358 |
sources=["upload", "microphone"]
|
359 |
)
|
|
|
360 |
with gr.Accordion("π§ Advanced Settings", open=False):
|
361 |
with gr.Row():
|
362 |
-
|
363 |
minimum=0.25,
|
364 |
maximum=1.0,
|
365 |
value=0.6,
|
@@ -367,7 +307,7 @@ def main():
|
|
367 |
label="Exaggeration",
|
368 |
info="Controls voice characteristic emphasis"
|
369 |
)
|
370 |
-
|
371 |
minimum=0.2,
|
372 |
maximum=1.0,
|
373 |
value=0.3,
|
@@ -376,13 +316,13 @@ def main():
|
|
376 |
info="Classifier-free guidance weight"
|
377 |
)
|
378 |
with gr.Row():
|
379 |
-
|
380 |
value=0,
|
381 |
label="Random Seed",
|
382 |
info="Set to 0 for random results",
|
383 |
precision=0
|
384 |
)
|
385 |
-
|
386 |
minimum=0.05,
|
387 |
maximum=2.0,
|
388 |
value=0.6,
|
@@ -390,28 +330,14 @@ def main():
|
|
390 |
label="Temperature",
|
391 |
info="Controls randomness in generation"
|
392 |
)
|
|
|
393 |
generate_btn = gr.Button("π΅ Generate Voice Clone", variant="primary", size="lg")
|
394 |
-
|
395 |
with gr.Column(scale=1):
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
)
|
401 |
-
status_output = gr.Textbox(
|
402 |
-
label="Status",
|
403 |
-
interactive=False,
|
404 |
-
lines=2
|
405 |
-
)
|
406 |
-
|
407 |
-
# This is the key part - create the API endpoint properly
|
408 |
-
generate_btn.click(
|
409 |
-
fn=clone_voice_api, # Use the API-ready function
|
410 |
-
inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
|
411 |
-
outputs=[audio_output, status_output],
|
412 |
-
api_name="predict" # This creates /api/predict endpoint
|
413 |
-
)
|
414 |
-
|
415 |
with gr.Accordion("π Examples", open=False):
|
416 |
gr.Examples(
|
417 |
examples=[
|
@@ -419,14 +345,43 @@ def main():
|
|
419 |
["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
|
420 |
["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
|
421 |
],
|
422 |
-
inputs=[text_input, audio_input,
|
423 |
-
outputs=[audio_output, status_output],
|
424 |
-
fn=clone_voice_api,
|
425 |
-
cache_examples=False
|
426 |
)
|
427 |
-
|
428 |
-
|
429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
server_name="0.0.0.0",
|
431 |
server_port=7860,
|
432 |
show_error=True,
|
|
|
179 |
return None, "Error: Please upload a reference audio file (.wav or .mp3)."
|
180 |
|
181 |
try:
|
182 |
+
print(f"Received request:")
|
183 |
print(f" Text: '{text_to_speak}'")
|
184 |
+
print(f" Audio: '{reference_audio_path}'")
|
185 |
print(f" Exaggeration: {exaggeration}")
|
186 |
print(f" CFG/Pace: {cfg_pace}")
|
187 |
print(f" Random Seed: {random_seed}")
|
|
|
206 |
except:
|
207 |
sample_rate = 24000
|
208 |
|
209 |
+
print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
|
210 |
|
211 |
if isinstance(output_wav_data, str):
|
212 |
return output_wav_data, "Success: Audio generated successfully!"
|
|
|
219 |
return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
|
220 |
|
221 |
except Exception as e:
|
222 |
+
print(f"ERROR: Failed during audio generation: {e}")
|
223 |
+
print("Detailed error trace for audio generation:")
|
224 |
traceback.print_exc()
|
225 |
return None, f"Error during audio generation: {str(e)}. Check logs for more details."
|
226 |
|
|
|
227 |
def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
|
228 |
import requests
|
229 |
import tempfile
|
|
|
232 |
|
233 |
temp_audio_path = None
|
234 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
if reference_audio_url.startswith('data:audio'):
|
236 |
+
header, encoded = reference_audio_url.split(',', 1)
|
237 |
+
audio_data = base64.b64decode(encoded)
|
238 |
+
if 'mp3' in header:
|
239 |
+
ext = '.mp3'
|
240 |
+
elif 'wav' in header:
|
241 |
+
ext = '.wav'
|
242 |
+
else:
|
243 |
+
ext = '.wav'
|
244 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
|
245 |
+
temp_file.write(audio_data)
|
246 |
+
temp_audio_path = temp_file.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
elif reference_audio_url.startswith('http'):
|
248 |
+
response = requests.get(reference_audio_url)
|
249 |
+
response.raise_for_status()
|
250 |
+
if reference_audio_url.endswith('.mp3'):
|
251 |
+
ext = '.mp3'
|
252 |
+
elif reference_audio_url.endswith('.wav'):
|
253 |
+
ext = '.wav'
|
254 |
+
else:
|
255 |
+
ext = '.wav'
|
256 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
|
257 |
+
temp_file.write(response.content)
|
258 |
+
temp_audio_path = temp_file.name
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
else:
|
|
|
260 |
temp_audio_path = reference_audio_url
|
261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
|
|
|
|
|
|
|
|
|
263 |
|
|
|
264 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
265 |
try:
|
266 |
os.unlink(temp_audio_path)
|
267 |
+
except:
|
268 |
+
pass
|
|
|
|
|
269 |
return audio_output, status
|
|
|
270 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
272 |
try:
|
273 |
os.unlink(temp_audio_path)
|
274 |
except:
|
275 |
pass
|
|
|
276 |
return None, f"API Error: {str(e)}"
|
277 |
|
278 |
def main():
|
279 |
print("Starting Advanced Gradio interface...")
|
280 |
+
|
281 |
+
# Create a Blocks interface with multiple functions
|
282 |
+
with gr.Blocks(title="ποΈ Advanced Chatterbox Voice Cloning") as demo:
|
283 |
gr.Markdown("# ποΈ Advanced Chatterbox Voice Cloning")
|
284 |
gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
|
285 |
+
|
286 |
with gr.Row():
|
287 |
with gr.Column(scale=2):
|
288 |
+
# Main interface inputs
|
289 |
text_input = gr.Textbox(
|
290 |
label="Text to Speak",
|
291 |
placeholder="Enter the text you want the cloned voice to say...",
|
|
|
296 |
label="Reference Audio (Upload a short .wav or .mp3 clip)",
|
297 |
sources=["upload", "microphone"]
|
298 |
)
|
299 |
+
|
300 |
with gr.Accordion("π§ Advanced Settings", open=False):
|
301 |
with gr.Row():
|
302 |
+
exaggeration_input = gr.Slider(
|
303 |
minimum=0.25,
|
304 |
maximum=1.0,
|
305 |
value=0.6,
|
|
|
307 |
label="Exaggeration",
|
308 |
info="Controls voice characteristic emphasis"
|
309 |
)
|
310 |
+
cfg_pace_input = gr.Slider(
|
311 |
minimum=0.2,
|
312 |
maximum=1.0,
|
313 |
value=0.3,
|
|
|
316 |
info="Classifier-free guidance weight"
|
317 |
)
|
318 |
with gr.Row():
|
319 |
+
seed_input = gr.Number(
|
320 |
value=0,
|
321 |
label="Random Seed",
|
322 |
info="Set to 0 for random results",
|
323 |
precision=0
|
324 |
)
|
325 |
+
temperature_input = gr.Slider(
|
326 |
minimum=0.05,
|
327 |
maximum=2.0,
|
328 |
value=0.6,
|
|
|
330 |
label="Temperature",
|
331 |
info="Controls randomness in generation"
|
332 |
)
|
333 |
+
|
334 |
generate_btn = gr.Button("π΅ Generate Voice Clone", variant="primary", size="lg")
|
335 |
+
|
336 |
with gr.Column(scale=1):
|
337 |
+
# Outputs
|
338 |
+
audio_output = gr.Audio(label="Generated Audio", type="numpy")
|
339 |
+
status_output = gr.Textbox(label="Status", lines=2)
|
340 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
with gr.Accordion("π Examples", open=False):
|
342 |
gr.Examples(
|
343 |
examples=[
|
|
|
345 |
["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
|
346 |
["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
|
347 |
],
|
348 |
+
inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input]
|
|
|
|
|
|
|
349 |
)
|
350 |
+
|
351 |
+
# Main interface function (for file uploads)
|
352 |
+
generate_btn.click(
|
353 |
+
fn=clone_voice_api,
|
354 |
+
inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
|
355 |
+
outputs=[audio_output, status_output],
|
356 |
+
api_name="predict"
|
357 |
+
)
|
358 |
+
|
359 |
+
# API function for base64 data (for external API calls)
|
360 |
+
def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
|
361 |
+
"""API function that accepts base64 audio data directly."""
|
362 |
+
return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
|
363 |
+
|
364 |
+
# Hidden inputs/outputs for the base64 API
|
365 |
+
with gr.Row(visible=False):
|
366 |
+
api_text_input = gr.Textbox()
|
367 |
+
api_audio_input = gr.Textbox() # This will receive base64 data URL
|
368 |
+
api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
|
369 |
+
api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
|
370 |
+
api_seed_input = gr.Number(value=0, precision=0)
|
371 |
+
api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
|
372 |
+
api_audio_output = gr.Audio(type="numpy")
|
373 |
+
api_status_output = gr.Textbox()
|
374 |
+
api_btn = gr.Button()
|
375 |
+
|
376 |
+
# API endpoint for base64 data
|
377 |
+
api_btn.click(
|
378 |
+
fn=clone_voice_base64_api,
|
379 |
+
inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
|
380 |
+
outputs=[api_audio_output, api_status_output],
|
381 |
+
api_name="clone_voice"
|
382 |
+
)
|
383 |
+
|
384 |
+
demo.launch(
|
385 |
server_name="0.0.0.0",
|
386 |
server_port=7860,
|
387 |
show_error=True,
|