ramimu committed on
Commit
7be21d2
·
verified ·
1 Parent(s): 5961c78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -126
app.py CHANGED
@@ -179,9 +179,9 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
179
  return None, "Error: Please upload a reference audio file (.wav or .mp3)."
180
 
181
  try:
182
- print(f"clone_voice function called:")
183
  print(f" Text: '{text_to_speak}'")
184
- print(f" Audio Path: '{reference_audio_path}'")
185
  print(f" Exaggeration: {exaggeration}")
186
  print(f" CFG/Pace: {cfg_pace}")
187
  print(f" Random Seed: {random_seed}")
@@ -206,7 +206,7 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
206
  except:
207
  sample_rate = 24000
208
 
209
- print(f"Audio generated successfully by clone_voice. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
210
 
211
  if isinstance(output_wav_data, str):
212
  return output_wav_data, "Success: Audio generated successfully!"
@@ -219,12 +219,11 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
219
  return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
220
 
221
  except Exception as e:
222
- print(f"ERROR: Failed during audio generation in clone_voice: {e}")
223
- print("Detailed error trace for audio generation in clone_voice:")
224
  traceback.print_exc()
225
  return None, f"Error during audio generation: {str(e)}. Check logs for more details."
226
 
227
- # Updated clone_voice_api function with detailed logging
228
  def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
229
  import requests
230
  import tempfile
@@ -233,120 +232,60 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
233
 
234
  temp_audio_path = None
235
  try:
236
- print(f"=== API CALL DEBUG ===")
237
- print(f"Text: {text_to_speak}")
238
- print(f"Audio URL type: {type(reference_audio_url)}")
239
- print(f"Audio URL length: {len(str(reference_audio_url)) if reference_audio_url else 0}")
240
- print(f"Audio URL preview: {str(reference_audio_url)[:100]}...")
241
- print(f"Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
242
-
243
- # Validate inputs
244
- if not text_to_speak or text_to_speak.strip() == "":
245
- return None, "Error: Please enter some text to speak."
246
-
247
- if not reference_audio_url:
248
- return None, "Error: Please provide reference audio."
249
-
250
- print("Processing audio data...")
251
-
252
  if reference_audio_url.startswith('data:audio'):
253
- print("Processing base64 audio data...")
254
- try:
255
- header, encoded = reference_audio_url.split(',', 1)
256
- print(f"Header: {header}")
257
- print(f"Encoded data length: {len(encoded)}")
258
-
259
- audio_data = base64.b64decode(encoded)
260
- print(f"Decoded audio data size: {len(audio_data)} bytes")
261
-
262
- if 'mp3' in header:
263
- ext = '.mp3'
264
- elif 'wav' in header:
265
- ext = '.wav'
266
- else:
267
- ext = '.wav'
268
-
269
- with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
270
- temp_file.write(audio_data)
271
- temp_audio_path = temp_file.name
272
-
273
- print(f"Created temporary audio file: {temp_audio_path}")
274
- print(f"File exists: {os.path.exists(temp_audio_path)}")
275
- print(f"File size: {os.path.getsize(temp_audio_path)} bytes")
276
-
277
- except Exception as audio_error:
278
- print(f"Audio processing error: {audio_error}")
279
- return None, f"Error processing audio data: {str(audio_error)}"
280
-
281
  elif reference_audio_url.startswith('http'):
282
- print("Processing HTTP audio URL...")
283
- try:
284
- response = requests.get(reference_audio_url)
285
- response.raise_for_status()
286
- if reference_audio_url.endswith('.mp3'):
287
- ext = '.mp3'
288
- elif reference_audio_url.endswith('.wav'):
289
- ext = '.wav'
290
- else:
291
- ext = '.wav'
292
- with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
293
- temp_file.write(response.content)
294
- temp_audio_path = temp_file.name
295
- print(f"Downloaded audio to: {temp_audio_path}")
296
- except Exception as download_error:
297
- print(f"Download error: {download_error}")
298
- return None, f"Error downloading audio: {str(download_error)}"
299
  else:
300
- print("Using direct file path...")
301
  temp_audio_path = reference_audio_url
302
 
303
- print(f"Calling clone_voice with:")
304
- print(f" Text: {text_to_speak}")
305
- print(f" Audio path: {temp_audio_path}")
306
- print(f" Parameters: {exaggeration}, {cfg_pace}, {random_seed}, {temperature}")
307
-
308
- # Call the main function
309
  audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
310
-
311
- print(f"clone_voice returned:")
312
- print(f" Audio output type: {type(audio_output)}")
313
- print(f" Status: {status}")
314
 
315
- # Cleanup
316
  if temp_audio_path and temp_audio_path != reference_audio_url:
317
  try:
318
  os.unlink(temp_audio_path)
319
- print(f"Cleaned up temporary file: {temp_audio_path}")
320
- except Exception as cleanup_error:
321
- print(f"Cleanup error: {cleanup_error}")
322
-
323
  return audio_output, status
324
-
325
  except Exception as e:
326
- print(f"=== CRITICAL ERROR ===")
327
- print(f"Error type: {type(e)}")
328
- print(f"Error message: {str(e)}")
329
- import traceback
330
- traceback.print_exc()
331
-
332
- # Cleanup on error
333
  if temp_audio_path and temp_audio_path != reference_audio_url:
334
  try:
335
  os.unlink(temp_audio_path)
336
  except:
337
  pass
338
-
339
  return None, f"API Error: {str(e)}"
340
 
341
  def main():
342
  print("Starting Advanced Gradio interface...")
343
-
344
- with gr.Blocks(title="Advanced Chatterbox Voice Cloning", theme=gr.themes.Soft()) as iface:
 
345
  gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
346
  gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
347
-
348
  with gr.Row():
349
  with gr.Column(scale=2):
 
350
  text_input = gr.Textbox(
351
  label="Text to Speak",
352
  placeholder="Enter the text you want the cloned voice to say...",
@@ -357,9 +296,10 @@ def main():
357
  label="Reference Audio (Upload a short .wav or .mp3 clip)",
358
  sources=["upload", "microphone"]
359
  )
 
360
  with gr.Accordion("🔧 Advanced Settings", open=False):
361
  with gr.Row():
362
- exaggeration = gr.Slider(
363
  minimum=0.25,
364
  maximum=1.0,
365
  value=0.6,
@@ -367,7 +307,7 @@ def main():
367
  label="Exaggeration",
368
  info="Controls voice characteristic emphasis"
369
  )
370
- cfg_pace = gr.Slider(
371
  minimum=0.2,
372
  maximum=1.0,
373
  value=0.3,
@@ -376,13 +316,13 @@ def main():
376
  info="Classifier-free guidance weight"
377
  )
378
  with gr.Row():
379
- random_seed = gr.Number(
380
  value=0,
381
  label="Random Seed",
382
  info="Set to 0 for random results",
383
  precision=0
384
  )
385
- temperature = gr.Slider(
386
  minimum=0.05,
387
  maximum=2.0,
388
  value=0.6,
@@ -390,28 +330,14 @@ def main():
390
  label="Temperature",
391
  info="Controls randomness in generation"
392
  )
 
393
  generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
394
-
395
  with gr.Column(scale=1):
396
- audio_output = gr.Audio(
397
- label="Generated Audio",
398
- type="numpy",
399
- interactive=False
400
- )
401
- status_output = gr.Textbox(
402
- label="Status",
403
- interactive=False,
404
- lines=2
405
- )
406
-
407
- # This is the key part - create the API endpoint properly
408
- generate_btn.click(
409
- fn=clone_voice_api, # Use the API-ready function
410
- inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
411
- outputs=[audio_output, status_output],
412
- api_name="predict" # This creates /api/predict endpoint
413
- )
414
-
415
  with gr.Accordion("📝 Examples", open=False):
416
  gr.Examples(
417
  examples=[
@@ -419,14 +345,43 @@ def main():
419
  ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
420
  ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
421
  ],
422
- inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
423
- outputs=[audio_output, status_output],
424
- fn=clone_voice_api,
425
- cache_examples=False
426
  )
427
-
428
- # Launch the interface
429
- iface.launch(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  server_name="0.0.0.0",
431
  server_port=7860,
432
  show_error=True,
 
179
  return None, "Error: Please upload a reference audio file (.wav or .mp3)."
180
 
181
  try:
182
+ print(f"Received request:")
183
  print(f" Text: '{text_to_speak}'")
184
+ print(f" Audio: '{reference_audio_path}'")
185
  print(f" Exaggeration: {exaggeration}")
186
  print(f" CFG/Pace: {cfg_pace}")
187
  print(f" Random Seed: {random_seed}")
 
206
  except:
207
  sample_rate = 24000
208
 
209
+ print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
210
 
211
  if isinstance(output_wav_data, str):
212
  return output_wav_data, "Success: Audio generated successfully!"
 
219
  return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
220
 
221
  except Exception as e:
222
+ print(f"ERROR: Failed during audio generation: {e}")
223
+ print("Detailed error trace for audio generation:")
224
  traceback.print_exc()
225
  return None, f"Error during audio generation: {str(e)}. Check logs for more details."
226
 
 
227
  def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
228
  import requests
229
  import tempfile
 
232
 
233
  temp_audio_path = None
234
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  if reference_audio_url.startswith('data:audio'):
236
+ header, encoded = reference_audio_url.split(',', 1)
237
+ audio_data = base64.b64decode(encoded)
238
+ if 'mp3' in header:
239
+ ext = '.mp3'
240
+ elif 'wav' in header:
241
+ ext = '.wav'
242
+ else:
243
+ ext = '.wav'
244
+ with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
245
+ temp_file.write(audio_data)
246
+ temp_audio_path = temp_file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  elif reference_audio_url.startswith('http'):
248
+ response = requests.get(reference_audio_url)
249
+ response.raise_for_status()
250
+ if reference_audio_url.endswith('.mp3'):
251
+ ext = '.mp3'
252
+ elif reference_audio_url.endswith('.wav'):
253
+ ext = '.wav'
254
+ else:
255
+ ext = '.wav'
256
+ with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
257
+ temp_file.write(response.content)
258
+ temp_audio_path = temp_file.name
 
 
 
 
 
 
259
  else:
 
260
  temp_audio_path = reference_audio_url
261
 
 
 
 
 
 
 
262
  audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
 
 
 
 
263
 
 
264
  if temp_audio_path and temp_audio_path != reference_audio_url:
265
  try:
266
  os.unlink(temp_audio_path)
267
+ except:
268
+ pass
 
 
269
  return audio_output, status
 
270
  except Exception as e:
 
 
 
 
 
 
 
271
  if temp_audio_path and temp_audio_path != reference_audio_url:
272
  try:
273
  os.unlink(temp_audio_path)
274
  except:
275
  pass
 
276
  return None, f"API Error: {str(e)}"
277
 
278
  def main():
279
  print("Starting Advanced Gradio interface...")
280
+
281
+ # Create a Blocks interface with multiple functions
282
+ with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
283
  gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
284
  gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
285
+
286
  with gr.Row():
287
  with gr.Column(scale=2):
288
+ # Main interface inputs
289
  text_input = gr.Textbox(
290
  label="Text to Speak",
291
  placeholder="Enter the text you want the cloned voice to say...",
 
296
  label="Reference Audio (Upload a short .wav or .mp3 clip)",
297
  sources=["upload", "microphone"]
298
  )
299
+
300
  with gr.Accordion("🔧 Advanced Settings", open=False):
301
  with gr.Row():
302
+ exaggeration_input = gr.Slider(
303
  minimum=0.25,
304
  maximum=1.0,
305
  value=0.6,
 
307
  label="Exaggeration",
308
  info="Controls voice characteristic emphasis"
309
  )
310
+ cfg_pace_input = gr.Slider(
311
  minimum=0.2,
312
  maximum=1.0,
313
  value=0.3,
 
316
  info="Classifier-free guidance weight"
317
  )
318
  with gr.Row():
319
+ seed_input = gr.Number(
320
  value=0,
321
  label="Random Seed",
322
  info="Set to 0 for random results",
323
  precision=0
324
  )
325
+ temperature_input = gr.Slider(
326
  minimum=0.05,
327
  maximum=2.0,
328
  value=0.6,
 
330
  label="Temperature",
331
  info="Controls randomness in generation"
332
  )
333
+
334
  generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
335
+
336
  with gr.Column(scale=1):
337
+ # Outputs
338
+ audio_output = gr.Audio(label="Generated Audio", type="numpy")
339
+ status_output = gr.Textbox(label="Status", lines=2)
340
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  with gr.Accordion("📝 Examples", open=False):
342
  gr.Examples(
343
  examples=[
 
345
  ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
346
  ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
347
  ],
348
+ inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input]
 
 
 
349
  )
350
+
351
+ # Main interface function (for file uploads)
352
+ generate_btn.click(
353
+ fn=clone_voice_api,
354
+ inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
355
+ outputs=[audio_output, status_output],
356
+ api_name="predict"
357
+ )
358
+
359
+ # API function for base64 data (for external API calls)
360
+ def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
361
+ """API function that accepts base64 audio data directly."""
362
+ return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
363
+
364
+ # Hidden inputs/outputs for the base64 API
365
+ with gr.Row(visible=False):
366
+ api_text_input = gr.Textbox()
367
+ api_audio_input = gr.Textbox() # This will receive base64 data URL
368
+ api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
369
+ api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
370
+ api_seed_input = gr.Number(value=0, precision=0)
371
+ api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
372
+ api_audio_output = gr.Audio(type="numpy")
373
+ api_status_output = gr.Textbox()
374
+ api_btn = gr.Button()
375
+
376
+ # API endpoint for base64 data
377
+ api_btn.click(
378
+ fn=clone_voice_base64_api,
379
+ inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
380
+ outputs=[api_audio_output, api_status_output],
381
+ api_name="clone_voice"
382
+ )
383
+
384
+ demo.launch(
385
  server_name="0.0.0.0",
386
  server_port=7860,
387
  show_error=True,