ceymox committed on
Commit
7810a88
·
verified ·
1 Parent(s): 4ac6a9c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +656 -293
app.py CHANGED
@@ -9,12 +9,13 @@ import uuid
9
  import logging
10
  import requests
11
  import io
12
- from typing import Optional, Dict, Any
 
13
  from pathlib import Path
14
 
15
  import gradio as gr
16
  import spaces
17
- from fastapi import FastAPI, HTTPException
18
  from fastapi.responses import StreamingResponse
19
  from fastapi.middleware.cors import CORSMiddleware
20
  from pydantic import BaseModel
@@ -31,10 +32,148 @@ logger.info(f"πŸš€ Running on device: {DEVICE}")
31
  MODEL = None
32
  CHATTERBOX_AVAILABLE = False
33
 
34
- # Storage for generated audio
35
  AUDIO_DIR = "generated_audio"
 
36
  os.makedirs(AUDIO_DIR, exist_ok=True)
 
 
 
37
  audio_cache = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def load_chatterbox_model():
40
  """Try multiple ways to load ChatterboxTTS from Resemble AI"""
@@ -81,52 +220,6 @@ def load_chatterbox_model():
81
  except Exception as e:
82
  logger.warning(f"Method 3 failed with error: {e}")
83
 
84
- # Method 4: Try exploring the installed package
85
- try:
86
- import chatterbox
87
- import inspect
88
-
89
- # Log what's available in the chatterbox package
90
- logger.info(f"Chatterbox module path: {chatterbox.__file__}")
91
- logger.info(f"Chatterbox contents: {dir(chatterbox)}")
92
-
93
- # Try to find ChatterboxTTS class anywhere in the module
94
- for name, obj in inspect.getmembers(chatterbox):
95
- if name == 'ChatterboxTTS' or (inspect.isclass(obj) and 'TTS' in name):
96
- logger.info(f"Found potential TTS class: {name}")
97
- MODEL = obj.from_pretrained(DEVICE)
98
- CHATTERBOX_AVAILABLE = True
99
- return True
100
-
101
- raise ImportError("ChatterboxTTS class not found in chatterbox package")
102
-
103
- except ImportError as e:
104
- logger.warning(f"Method 4 failed: {e}")
105
- except Exception as e:
106
- logger.warning(f"Method 4 failed with error: {e}")
107
-
108
- # Method 5: Check if the GitHub repo was installed correctly
109
- try:
110
- import pkg_resources
111
- try:
112
- pkg_resources.get_distribution('chatterbox')
113
- logger.info("βœ… Chatterbox package is installed")
114
- except pkg_resources.DistributionNotFound:
115
- logger.warning("❌ Chatterbox package not found in installed packages")
116
-
117
- # Try to import and inspect what we got
118
- import chatterbox
119
- chatterbox_path = chatterbox.__path__[0] if hasattr(chatterbox, '__path__') else str(chatterbox.__file__)
120
- logger.info(f"Chatterbox installed at: {chatterbox_path}")
121
-
122
- # List all available modules/classes
123
- import pkgutil
124
- for importer, modname, ispkg in pkgutil.walk_packages(chatterbox.__path__, chatterbox.__name__ + "."):
125
- logger.info(f"Available module: {modname}")
126
-
127
- except Exception as e:
128
- logger.warning(f"Package inspection failed: {e}")
129
-
130
  # If we get here, the GitHub repo might have a different structure
131
  logger.error("❌ Could not load ChatterboxTTS from Resemble AI repository")
132
  logger.error("πŸ’‘ The GitHub repo might have a different structure than expected")
@@ -135,30 +228,6 @@ def load_chatterbox_model():
135
 
136
  return False
137
 
138
- def download_audio_from_url(url):
139
- """Download audio from URL and save to temporary file"""
140
- try:
141
- logger.info(f"πŸ“₯ Downloading reference audio from: {url}")
142
- response = requests.get(url, timeout=30, headers={
143
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
144
- })
145
-
146
- if response.status_code == 200:
147
- # Create temporary file
148
- temp_file = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
149
- temp_file.write(response.content)
150
- temp_file.close()
151
-
152
- logger.info(f"βœ… Audio downloaded to: {temp_file.name}")
153
- return temp_file.name
154
- else:
155
- logger.error(f"❌ HTTP {response.status_code} when downloading audio")
156
- return None
157
-
158
- except Exception as e:
159
- logger.error(f"❌ Error downloading audio from URL: {e}")
160
- return None
161
-
162
  def get_or_load_model():
163
  """Load ChatterboxTTS model if not already loaded"""
164
  global MODEL
@@ -171,7 +240,6 @@ def get_or_load_model():
171
  logger.info("βœ… ChatterboxTTS model loaded successfully")
172
  else:
173
  logger.error("❌ Failed to load ChatterboxTTS - using fallback")
174
- # Create a better fallback that shows the issue
175
  create_fallback_model()
176
  return MODEL
177
 
@@ -230,15 +298,29 @@ def generate_id():
230
  """Generate unique ID"""
231
  return str(uuid.uuid4())
232
 
 
 
 
233
  # Pydantic models for API
234
  class TTSRequest(BaseModel):
235
  text: str
236
- audio_prompt_url: Optional[str] = "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
237
  exaggeration: Optional[float] = 0.5
238
  temperature: Optional[float] = 0.8
239
  cfg_weight: Optional[float] = 0.5
240
  seed: Optional[int] = 0
241
 
 
 
 
 
 
 
 
 
 
 
 
242
  class TTSResponse(BaseModel):
243
  success: bool
244
  audio_id: Optional[str] = None
@@ -260,14 +342,14 @@ except Exception as e:
260
  @spaces.GPU
261
  def generate_tts_audio(
262
  text_input: str,
263
- audio_prompt_path_input: str,
264
  exaggeration_input: float,
265
  temperature_input: float,
266
  seed_num_input: int,
267
  cfgw_input: float
268
  ) -> tuple[int, np.ndarray]:
269
  """
270
- Generate TTS audio using ChatterboxTTS model
271
  """
272
  current_model = get_or_load_model()
273
 
@@ -278,29 +360,25 @@ def generate_tts_audio(
278
  set_seed(int(seed_num_input))
279
 
280
  logger.info(f"🎡 Generating audio for: '{text_input[:50]}...'")
 
281
 
282
  if not CHATTERBOX_AVAILABLE:
283
  logger.warning("🚨 USING FALLBACK - Real ChatterboxTTS not found!")
284
- logger.warning("πŸ“‹ To fix: Upload your ChatterboxTTS package to this Space")
285
 
286
- # Handle audio prompt - download if it's a URL
287
- audio_prompt_path = audio_prompt_path_input
288
  temp_audio_file = None
289
 
290
  try:
291
- # Check if it's a URL
292
- if audio_prompt_path_input and (audio_prompt_path_input.startswith('http://') or audio_prompt_path_input.startswith('https://')):
293
- logger.info(f"🌐 Detected URL, downloading audio: {audio_prompt_path_input}")
294
- temp_audio_file = download_audio_from_url(audio_prompt_path_input)
295
- if temp_audio_file:
296
- audio_prompt_path = temp_audio_file
297
- logger.info(f"βœ… Using downloaded audio: {audio_prompt_path}")
298
- else:
299
- logger.warning("⚠️ Failed to download audio, proceeding without reference")
300
- audio_prompt_path = None
301
- elif audio_prompt_path_input and not os.path.exists(audio_prompt_path_input):
302
- logger.warning(f"⚠️ Audio file not found: {audio_prompt_path_input}, proceeding without reference")
303
- audio_prompt_path = None
304
 
305
  # Generate audio
306
  wav = current_model.generate(
@@ -322,8 +400,8 @@ def generate_tts_audio(
322
  logger.error(f"❌ Audio generation failed: {e}")
323
  raise
324
  finally:
325
- # Clean up temporary file
326
- if temp_audio_file and os.path.exists(temp_audio_file):
327
  try:
328
  os.unlink(temp_audio_file)
329
  logger.info(f"πŸ—‘οΈ Cleaned up temporary file: {temp_audio_file}")
@@ -332,9 +410,9 @@ def generate_tts_audio(
332
 
333
  # FastAPI app for API endpoints
334
  app = FastAPI(
335
- title="ChatterboxTTS API",
336
- description="High-quality text-to-speech synthesis using ChatterboxTTS",
337
- version="1.0.0"
338
  )
339
 
340
  app.add_middleware(
@@ -349,15 +427,18 @@ app.add_middleware(
349
  async def root():
350
  """API status endpoint"""
351
  return {
352
- "service": "ChatterboxTTS API",
353
- "version": "1.0.0",
354
  "status": "operational" if MODEL else "model_loading",
355
  "model_loaded": MODEL is not None,
356
  "real_chatterbox": CHATTERBOX_AVAILABLE,
357
  "device": DEVICE,
 
358
  "message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
359
  "endpoints": {
360
  "synthesize": "/api/tts/synthesize",
 
 
361
  "audio": "/api/audio/{audio_id}",
362
  "health": "/health"
363
  }
@@ -371,14 +452,105 @@ async def health_check():
371
  "model_loaded": MODEL is not None,
372
  "real_chatterbox": CHATTERBOX_AVAILABLE,
373
  "device": DEVICE,
 
374
  "timestamp": time.time(),
375
  "warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
376
  }
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  @app.post("/api/tts/synthesize", response_model=TTSResponse)
379
  async def synthesize_speech(request: TTSRequest):
380
  """
381
- Synthesize speech from text
382
  """
383
  try:
384
  if MODEL is None:
@@ -390,70 +562,55 @@ async def synthesize_speech(request: TTSRequest):
390
  if len(request.text) > 500:
391
  raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
392
 
 
 
 
393
  start_time = time.time()
394
 
395
- # Handle audio prompt URL
396
- audio_prompt_path = request.audio_prompt_url
397
- temp_audio_file = None
 
 
 
 
 
 
398
 
399
- if request.audio_prompt_url and (request.audio_prompt_url.startswith('http://') or request.audio_prompt_url.startswith('https://')):
400
- temp_audio_file = download_audio_from_url(request.audio_prompt_url)
401
- if temp_audio_file:
402
- audio_prompt_path = temp_audio_file
403
- else:
404
- logger.warning("Failed to download reference audio, proceeding without")
405
- audio_prompt_path = None
406
 
407
- try:
408
- # Generate audio
409
- sample_rate, audio_data = generate_tts_audio(
410
- request.text,
411
- audio_prompt_path,
412
- request.exaggeration,
413
- request.temperature,
414
- request.seed,
415
- request.cfg_weight
416
- )
417
-
418
- generation_time = time.time() - start_time
419
-
420
- # Save audio file
421
- audio_id = generate_id()
422
- audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
423
- sf.write(audio_path, audio_data, sample_rate)
424
-
425
- # Cache audio info
426
- audio_cache[audio_id] = {
427
- "path": audio_path,
428
- "text": request.text,
429
- "sample_rate": sample_rate,
430
- "duration": len(audio_data) / sample_rate,
431
- "generated_at": time.time(),
432
- "generation_time": generation_time,
433
- "real_chatterbox": CHATTERBOX_AVAILABLE
434
- }
435
-
436
- message = "Speech synthesized successfully"
437
- if not CHATTERBOX_AVAILABLE:
438
- message += " (using fallback - upload ChatterboxTTS for real synthesis)"
439
-
440
- logger.info(f"βœ… Audio saved: {audio_id} ({generation_time:.2f}s)")
441
-
442
- return TTSResponse(
443
- success=True,
444
- audio_id=audio_id,
445
- message=message,
446
- sample_rate=sample_rate,
447
- duration=len(audio_data) / sample_rate
448
- )
449
 
450
- finally:
451
- # Clean up temporary audio file
452
- if temp_audio_file and os.path.exists(temp_audio_file):
453
- try:
454
- os.unlink(temp_audio_file)
455
- except:
456
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  except HTTPException:
459
  raise
@@ -463,9 +620,7 @@ async def synthesize_speech(request: TTSRequest):
463
 
464
  @app.get("/api/audio/{audio_id}")
465
  async def get_audio(audio_id: str):
466
- """
467
- Download generated audio file
468
- """
469
  if audio_id not in audio_cache:
470
  raise HTTPException(status_code=404, detail="Audio not found")
471
 
@@ -489,9 +644,7 @@ async def get_audio(audio_id: str):
489
 
490
  @app.get("/api/audio/{audio_id}/info")
491
  async def get_audio_info(audio_id: str):
492
- """
493
- Get audio file information
494
- """
495
  if audio_id not in audio_cache:
496
  raise HTTPException(status_code=404, detail="Audio not found")
497
 
@@ -499,14 +652,13 @@ async def get_audio_info(audio_id: str):
499
 
500
  @app.get("/api/audio")
501
  async def list_audio():
502
- """
503
- List all generated audio files
504
- """
505
  return {
506
  "audio_files": [
507
  {
508
  "audio_id": audio_id,
509
  "text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
 
510
  "duration": info["duration"],
511
  "generated_at": info["generated_at"],
512
  "real_chatterbox": info.get("real_chatterbox", False)
@@ -518,9 +670,135 @@ async def list_audio():
518
 
519
  # Gradio interface
520
  def create_gradio_interface():
521
- """Create Gradio interface with better ChatterboxTTS status"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
- with gr.Blocks(title="ChatterboxTTS", theme=gr.themes.Soft()) as demo:
524
 
525
  # Status indicator at the top
526
  if CHATTERBOX_AVAILABLE:
@@ -537,141 +815,223 @@ def create_gradio_interface():
537
  """)
538
 
539
  gr.Markdown("""
540
- # 🎡 ChatterboxTTS
541
 
542
- High-quality text-to-speech synthesis with voice cloning capabilities.
543
  """)
544
 
545
- if not CHATTERBOX_AVAILABLE:
546
- gr.Markdown("""
547
- ### 🚨 Currently Using Fallback Model
548
-
549
- You're hearing beep sounds because the real ChatterboxTTS isn't loaded.
550
-
551
- **The Resemble AI ChatterboxTTS from GitHub should auto-install from requirements.txt.**
552
-
553
- If you're still seeing this message:
554
-
555
- 1. **Check build logs** for any installation errors
556
- 2. **Verify requirements.txt** contains: `git+https://github.com/resemble-ai/chatterbox.git`
557
- 3. **Restart the Space** if needed
558
- 4. **Check logs** for import errors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
 
560
- πŸ“ GitHub repo being used: https://github.com/resemble-ai/chatterbox.git
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
 
562
- If the GitHub installation fails, you can alternatively upload the package manually.
563
- """)
564
-
565
- with gr.Row():
566
- with gr.Column():
567
- text_input = gr.Textbox(
568
- value="Hello, this is ChatterboxTTS. I can generate natural-sounding speech from any text you provide.",
569
- label="Text to synthesize (max 300 characters)",
570
- max_lines=5,
571
- placeholder="Enter your text here..."
572
- )
573
 
574
- audio_prompt = gr.Textbox(
575
- value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
576
- label="Reference Audio URL or File Path",
577
- placeholder="https://example.com/audio.wav or /path/to/local/file.wav",
578
- info="URL will be downloaded automatically, or use local file path"
579
- )
580
 
581
- with gr.Row():
582
- exaggeration = gr.Slider(
583
- 0.25, 2,
584
- step=0.05,
585
- label="Exaggeration",
586
- value=0.5,
587
- info="Controls expressiveness (0.5 = neutral)"
588
- )
589
-
590
- cfg_weight = gr.Slider(
591
- 0.2, 1,
592
- step=0.05,
593
- label="CFG Weight",
594
- value=0.5,
595
- info="Controls pace and clarity"
596
- )
597
 
598
- with gr.Accordion("Advanced Settings", open=False):
599
- temperature = gr.Slider(
600
- 0.05, 5,
601
- step=0.05,
602
- label="Temperature",
603
- value=0.8,
604
- info="Controls randomness"
605
- )
606
-
607
- seed = gr.Number(
608
- value=0,
609
- label="Seed (0 = random)",
610
- info="Set to non-zero for reproducible results"
611
- )
612
 
613
- generate_btn = gr.Button("🎡 Generate Speech", variant="primary")
 
 
 
 
 
614
 
615
- with gr.Column():
616
- audio_output = gr.Audio(label="Generated Speech")
 
 
 
 
617
 
618
- status_text = gr.Textbox(
619
- label="Status",
620
- interactive=False,
621
- placeholder="Click 'Generate Speech' to start..."
622
- )
623
-
624
- def generate_speech_ui(text, prompt_url, exag, temp, seed_val, cfg):
625
- """Generate speech from UI"""
626
- try:
627
- if not text.strip():
628
- return None, "❌ Please enter some text"
629
 
630
- if len(text) > 300:
631
- return None, "❌ Text too long (max 300 characters)"
 
 
 
632
 
633
- start_time = time.time()
 
 
 
634
 
635
- # Generate audio
636
- sample_rate, audio_data = generate_tts_audio(
637
- text, prompt_url, exag, temp, int(seed_val), cfg
638
- )
639
 
640
- generation_time = time.time() - start_time
641
- duration = len(audio_data) / sample_rate
 
 
 
 
642
 
643
- if CHATTERBOX_AVAILABLE:
644
- status = f"""βœ… Real ChatterboxTTS synthesis completed!
645
-
646
- ⏱️ Generation time: {generation_time:.2f}s
647
- 🎡 Audio duration: {duration:.2f}s
648
- πŸ“Š Sample rate: {sample_rate} Hz
649
- πŸ”Š Audio samples: {len(audio_data):,}
650
- """
651
- else:
652
- status = f"""⚠️ Fallback audio generated (beep sound)
653
-
654
- 🚨 This is NOT real speech synthesis!
655
- πŸ“¦ Upload ChatterboxTTS package for real synthesis
656
- ⏱️ Generation time: {generation_time:.2f}s
657
- 🎡 Audio duration: {duration:.2f}s
658
-
659
- πŸ’‘ To fix: Upload your ChatterboxTTS files to this Space
660
- """
661
 
662
- return (sample_rate, audio_data), status
663
-
664
- except Exception as e:
665
- logger.error(f"UI generation failed: {e}")
666
- return None, f"❌ Generation failed: {str(e)}"
 
667
 
 
668
  generate_btn.click(
669
  fn=generate_speech_ui,
670
- inputs=[text_input, audio_prompt, exaggeration, temperature, seed, cfg_weight],
671
  outputs=[audio_output, status_text]
672
  )
673
 
674
- # System info with better warnings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
  model_status = "βœ… Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "⚠️ Fallback Model (Beep Sounds)"
676
  chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
677
 
@@ -679,26 +1039,23 @@ def create_gradio_interface():
679
  ### πŸ“Š System Status
680
  - **Model**: {model_status}
681
  - **Device**: {DEVICE}
682
- - **Generated Files**: {len(audio_cache)}
683
  - **ChatterboxTTS**: {chatterbox_status}
 
 
 
684
 
685
  {'''### πŸŽ‰ Production Ready!
686
- Your ChatterboxTTS model is loaded and working correctly.''' if CHATTERBOX_AVAILABLE else '''### ⚠️ Action Required
687
  **You're hearing beep sounds because ChatterboxTTS isn't loaded.**
688
 
689
- **To fix this:**
690
- 1. Upload your ChatterboxTTS package to this Space
691
- 2. Ensure proper directory structure with `__init__.py` files
692
- 3. Restart the Space
693
-
694
- The current fallback generates beeps to indicate missing package.'''}
695
  """)
696
 
697
  return demo
698
 
699
  # Main execution
700
  if __name__ == "__main__":
701
- logger.info("πŸŽ‰ Starting ChatterboxTTS Service...")
702
 
703
  # Model status
704
  if CHATTERBOX_AVAILABLE and MODEL:
@@ -711,10 +1068,11 @@ if __name__ == "__main__":
711
  logger.info(f"Model Status: {model_status}")
712
  logger.info(f"Device: {DEVICE}")
713
  logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
 
 
714
 
715
  if not CHATTERBOX_AVAILABLE:
716
  logger.warning("🚨 IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
717
- logger.warning("πŸ“ Expected location: ./chatterbox/src/chatterbox/tts.py")
718
 
719
  if os.getenv("SPACE_ID"):
720
  # Running in Hugging Face Spaces
@@ -739,6 +1097,11 @@ if __name__ == "__main__":
739
 
740
  logger.info("🌐 FastAPI: http://localhost:8000")
741
  logger.info("πŸ“š API Docs: http://localhost:8000/docs")
 
 
 
 
 
742
 
743
  # Start Gradio
744
  demo = create_gradio_interface()
 
9
  import logging
10
  import requests
11
  import io
12
+ import json
13
+ from typing import Optional, Dict, Any, List
14
  from pathlib import Path
15
 
16
  import gradio as gr
17
  import spaces
18
+ from fastapi import FastAPI, HTTPException, UploadFile, File
19
  from fastapi.responses import StreamingResponse
20
  from fastapi.middleware.cors import CORSMiddleware
21
  from pydantic import BaseModel
 
32
  MODEL = None
33
  CHATTERBOX_AVAILABLE = False
34
 
35
+ # Storage directories
36
  AUDIO_DIR = "generated_audio"
37
+ VOICES_DIR = "custom_voices"
38
  os.makedirs(AUDIO_DIR, exist_ok=True)
39
+ os.makedirs(VOICES_DIR, exist_ok=True)
40
+
41
+ # Voice storage
42
  audio_cache = {}
43
+ voice_library = {}
44
+
45
+ # Default/Built-in voices
46
# Voices that ship with the service. Each entry is keyed by its own voice_id
# and points at a remote reference clip ("audio_url") instead of a local file.
# Entries are tagged "type": "builtin" so the rest of the module can tell them
# apart from user-created ("custom") voices.
BUILTIN_VOICES = {
    "female_default": {
        "voice_id": "female_default",
        "name": "Female Default",
        "description": "Professional female voice",
        "audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
        "type": "builtin",
        "created_at": "2024-01-01T00:00:00Z",
    },
    "male_professional": {
        "voice_id": "male_professional",
        "name": "Male Professional",
        "description": "Confident male voice",
        "audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_professional.flac",
        "type": "builtin",
        "created_at": "2024-01-01T00:00:00Z",
    },
}
64
+
65
def load_voice_library():
    """Reset the global voice library to the built-ins plus any saved custom voices.

    The library is first replaced by a shallow copy of BUILTIN_VOICES. If
    ``voices.json`` exists inside VOICES_DIR, its contents are merged on top.
    Any error while reading or merging the file is logged and otherwise
    ignored, leaving whatever was loaded up to that point.
    """
    global voice_library
    voice_library = BUILTIN_VOICES.copy()

    voices_json_path = os.path.join(VOICES_DIR, "voices.json")
    if not os.path.exists(voices_json_path):
        # Nothing has been saved yet; only the built-in voices are available.
        return

    try:
        with open(voices_json_path, 'r', encoding='utf-8') as f:
            custom_voices = json.load(f)
        voice_library.update(custom_voices)
        logger.info(f"βœ… Loaded {len(custom_voices)} custom voices from disk")
    except Exception as e:
        logger.error(f"❌ Error loading voice library: {e}")
79
+
80
def save_voice_library():
    """Persist all non-builtin voices to ``voices.json`` inside VOICES_DIR.

    The JSON is written to a sibling temporary file first and then moved over
    the real file with ``os.replace``. Opening ``voices.json`` directly in
    ``'w'`` mode would truncate it before serialization starts, so a failure
    halfway through ``json.dump`` would destroy the previously saved library.

    Errors are logged and swallowed (best-effort persistence); nothing is
    returned.
    """
    try:
        # Built-in voices are defined in code, so only custom ones are stored.
        custom_voices = {
            voice_id: info
            for voice_id, info in voice_library.items()
            if info.get("type") != "builtin"
        }

        voices_json_path = os.path.join(VOICES_DIR, "voices.json")
        tmp_json_path = voices_json_path + ".tmp"

        with open(tmp_json_path, 'w', encoding='utf-8') as f:
            json.dump(custom_voices, f, ensure_ascii=False, indent=2)

        # Swap the finished file into place; the old file stays intact until here.
        os.replace(tmp_json_path, voices_json_path)
        logger.info(f"βœ… Saved {len(custom_voices)} custom voices to disk")
    except Exception as e:
        logger.error(f"❌ Error saving voice library: {e}")
92
+
93
def create_voice_from_audio(audio_file, voice_name, voice_description="Custom voice"):
    """Create a custom voice from reference audio and register it in the library.

    Args:
        audio_file: One of
            * a ``(sample_rate, audio_data)`` tuple (the Gradio audio format),
            * a filesystem path (``str`` or ``os.PathLike``) to an audio file,
            * raw sample data, which is written at a default rate of 22050 Hz.
        voice_name: Display name stored with the voice.
        voice_description: Free-text description stored with the voice.

    Returns:
        ``(voice_id, voice_entry)`` on success, ``(None, None)`` on any failure
        (the error is logged).
    """
    try:
        voice_id = f"voice_{int(time.time())}_{uuid.uuid4().hex[:8]}"

        # Every custom voice is stored as <voice_id>.wav inside VOICES_DIR.
        audio_filename = f"{voice_id}.wav"
        audio_path = os.path.join(VOICES_DIR, audio_filename)

        if isinstance(audio_file, tuple):
            # Gradio audio format (sample_rate, audio_data)
            sample_rate, audio_data = audio_file
            sf.write(audio_path, audio_data, sample_rate)
        elif isinstance(audio_file, (str, os.PathLike)):
            # A path was supplied (e.g. the temp file of an API upload). The path
            # itself is not sample data, so decode the file and re-encode it.
            # NOTE(review): assumes `sf` is the soundfile module already used
            # for sf.write in this file — confirm against the import block.
            audio_data, sample_rate = sf.read(audio_file)
            sf.write(audio_path, audio_data, sample_rate)
        else:
            # Raw samples without rate information: keep the original default.
            sf.write(audio_path, audio_file, 22050)  # Default sample rate

        voice_entry = {
            "voice_id": voice_id,
            "name": voice_name,
            "description": voice_description,
            "audio_path": audio_path,
            "type": "custom",
            # The trailing "Z" denotes UTC, so format from gmtime, not local time.
            "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        }

        # Register the voice and persist the updated library.
        voice_library[voice_id] = voice_entry
        save_voice_library()

        logger.info(f"βœ… Created voice: {voice_name} ({voice_id})")
        return voice_id, voice_entry

    except Exception as e:
        logger.error(f"❌ Error creating voice: {e}")
        return None, None
131
+
132
def download_audio_from_url(url):
    """Fetch audio from ``url`` into a temporary ``.flac`` file.

    Returns the path of the temporary file when the server answers with
    HTTP 200. For any other status code, or if any exception is raised, the
    problem is logged and ``None`` is returned. The temporary file is created
    with ``delete=False``, so removing it is left to the caller.
    """
    try:
        logger.info(f"πŸ“₯ Downloading reference audio from: {url}")
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, timeout=30, headers=request_headers)

        if response.status_code != 200:
            logger.error(f"❌ HTTP {response.status_code} when downloading audio")
            return None

        # Store the payload in a named temp file that outlives this function.
        temp_file = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
        temp_file.write(response.content)
        temp_file.close()

        logger.info(f"βœ… Audio downloaded to: {temp_file.name}")
        return temp_file.name

    except Exception as e:
        logger.error(f"❌ Error downloading audio from URL: {e}")
        return None
155
+
156
def get_voice_audio_path(voice_id):
    """Resolve a voice ID to a local audio file path.

    * Unknown voice IDs yield ``None``.
    * Custom voices return their stored ``audio_path`` if that file exists;
      otherwise a warning is logged and ``None`` is returned.
    * Built-in voices are downloaded from their ``audio_url`` and the result
      of ``download_audio_from_url`` is returned.
    * Any other entry yields ``None``.
    """
    if voice_id not in voice_library:
        return None

    voice_info = voice_library[voice_id]
    voice_type = voice_info.get("type")

    # Custom voice: the reference clip lives on local disk.
    if voice_type == "custom" and "audio_path" in voice_info:
        audio_path = voice_info["audio_path"]
        if not os.path.exists(audio_path):
            logger.warning(f"⚠️ Voice audio file not found: {audio_path}")
            return None
        return audio_path

    # Built-in voice: the reference clip is fetched from its URL.
    if voice_type == "builtin" and "audio_url" in voice_info:
        return download_audio_from_url(voice_info["audio_url"])

    return None
177
 
178
  def load_chatterbox_model():
179
  """Try multiple ways to load ChatterboxTTS from Resemble AI"""
 
220
  except Exception as e:
221
  logger.warning(f"Method 3 failed with error: {e}")
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  # If we get here, the GitHub repo might have a different structure
224
  logger.error("❌ Could not load ChatterboxTTS from Resemble AI repository")
225
  logger.error("πŸ’‘ The GitHub repo might have a different structure than expected")
 
228
 
229
  return False
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def get_or_load_model():
232
  """Load ChatterboxTTS model if not already loaded"""
233
  global MODEL
 
240
  logger.info("βœ… ChatterboxTTS model loaded successfully")
241
  else:
242
  logger.error("❌ Failed to load ChatterboxTTS - using fallback")
 
243
  create_fallback_model()
244
  return MODEL
245
 
 
298
  """Generate unique ID"""
299
  return str(uuid.uuid4())
300
 
301
+ # Load voice library at startup
302
+ load_voice_library()
303
+
304
  # Pydantic models for API
305
class TTSRequest(BaseModel):
    # Request body for the synthesis endpoint.
    # Text to be spoken (required).
    text: str
    # Key of the voice to use; defaults to the built-in female voice.
    voice_id: Optional[str] = "female_default"
    # Generation settings, passed through to the TTS generation function.
    exaggeration: Optional[float] = 0.5
    temperature: Optional[float] = 0.8
    cfg_weight: Optional[float] = 0.5
    seed: Optional[int] = 0
312
 
313
class VoiceCreateRequest(BaseModel):
    # Metadata supplied when creating a custom voice.
    # Display name of the new voice (required).
    voice_name: str
    # Optional free-text description.
    voice_description: Optional[str] = "Custom voice"
316
+
317
class VoiceInfo(BaseModel):
    # Public description of one voice as returned by the voice listing.
    voice_id: str
    name: str
    description: str
    # Voice category; the values used in this file are "builtin" and "custom".
    type: str
    # Creation timestamp, stored as a string.
    created_at: str
323
+
324
  class TTSResponse(BaseModel):
325
  success: bool
326
  audio_id: Optional[str] = None
 
342
  @spaces.GPU
343
  def generate_tts_audio(
344
  text_input: str,
345
+ voice_id: str,
346
  exaggeration_input: float,
347
  temperature_input: float,
348
  seed_num_input: int,
349
  cfgw_input: float
350
  ) -> tuple[int, np.ndarray]:
351
  """
352
+ Generate TTS audio using ChatterboxTTS model with voice ID
353
  """
354
  current_model = get_or_load_model()
355
 
 
360
  set_seed(int(seed_num_input))
361
 
362
  logger.info(f"🎡 Generating audio for: '{text_input[:50]}...'")
363
+ logger.info(f"🎭 Using voice: {voice_id}")
364
 
365
  if not CHATTERBOX_AVAILABLE:
366
  logger.warning("🚨 USING FALLBACK - Real ChatterboxTTS not found!")
 
367
 
368
+ # Get audio path for the voice
369
+ audio_prompt_path = get_voice_audio_path(voice_id)
370
  temp_audio_file = None
371
 
372
  try:
373
+ if audio_prompt_path and audio_prompt_path.startswith('/tmp/'):
374
+ # It's a temporary file from URL download
375
+ temp_audio_file = audio_prompt_path
376
+
377
+ if audio_prompt_path:
378
+ voice_name = voice_library.get(voice_id, {}).get("name", voice_id)
379
+ logger.info(f"βœ… Using voice '{voice_name}' audio: {audio_prompt_path}")
380
+ else:
381
+ logger.warning(f"⚠️ Could not load audio for voice {voice_id}, using default")
 
 
 
 
382
 
383
  # Generate audio
384
  wav = current_model.generate(
 
400
  logger.error(f"❌ Audio generation failed: {e}")
401
  raise
402
  finally:
403
+ # Clean up temporary file (only if it's a downloaded URL)
404
+ if temp_audio_file and temp_audio_file.startswith('/tmp/') and os.path.exists(temp_audio_file):
405
  try:
406
  os.unlink(temp_audio_file)
407
  logger.info(f"πŸ—‘οΈ Cleaned up temporary file: {temp_audio_file}")
 
410
 
411
  # FastAPI app for API endpoints
412
  app = FastAPI(
413
+ title="ChatterboxTTS Voice Manager API",
414
+ description="Advanced text-to-speech with voice cloning and management",
415
+ version="2.0.0"
416
  )
417
 
418
  app.add_middleware(
 
427
  async def root():
428
  """API status endpoint"""
429
  return {
430
+ "service": "ChatterboxTTS Voice Manager API",
431
+ "version": "2.0.0",
432
  "status": "operational" if MODEL else "model_loading",
433
  "model_loaded": MODEL is not None,
434
  "real_chatterbox": CHATTERBOX_AVAILABLE,
435
  "device": DEVICE,
436
+ "voices_available": len(voice_library),
437
  "message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
438
  "endpoints": {
439
  "synthesize": "/api/tts/synthesize",
440
+ "voices": "/api/voices",
441
+ "create_voice": "/api/voices/create",
442
  "audio": "/api/audio/{audio_id}",
443
  "health": "/health"
444
  }
 
452
  "model_loaded": MODEL is not None,
453
  "real_chatterbox": CHATTERBOX_AVAILABLE,
454
  "device": DEVICE,
455
+ "voices_total": len(voice_library),
456
  "timestamp": time.time(),
457
  "warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
458
  }
459
 
460
@app.get("/api/voices")
async def get_voices():
    """List every voice in the library together with per-type counts."""
    voices = [
        VoiceInfo(
            voice_id=voice_id,
            name=voice_info["name"],
            description=voice_info["description"],
            type=voice_info["type"],
            created_at=voice_info["created_at"]
        )
        for voice_id, voice_info in voice_library.items()
    ]

    builtin_count = sum(1 for voice in voices if voice.type == "builtin")
    custom_count = sum(1 for voice in voices if voice.type == "custom")

    return {
        "voices": voices,
        "total": len(voices),
        "builtin": builtin_count,
        "custom": custom_count
    }
479
+
480
@app.post("/api/voices/create")
async def create_voice_api(
    voice_name: str,
    voice_description: str = "Custom voice",
    audio_file: UploadFile = File(...)
):
    """Create a new custom voice from an uploaded audio file.

    The upload is spooled to a temporary file, handed to
    ``create_voice_from_audio`` and the temporary file is removed afterwards,
    whether or not voice creation succeeded.

    Returns:
        A dict with ``success``, ``voice_id``, ``message`` and ``voice_info``.

    Raises:
        HTTPException: 500 if the voice could not be created or an unexpected
            error occurred.
    """
    temp_path = None
    try:
        # Read uploaded file
        audio_data = await audio_file.read()

        # Spool to disk; the context manager closes the handle even if the
        # write fails, and delete=False keeps the file for processing below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(audio_data)

        voice_id, voice_entry = create_voice_from_audio(
            temp_path,
            voice_name,
            voice_description
        )

        if not voice_id:
            raise HTTPException(status_code=500, detail="Failed to create voice")

        return {
            "success": True,
            "voice_id": voice_id,
            "message": f"Voice '{voice_name}' created successfully",
            "voice_info": voice_entry
        }

    except HTTPException:
        # Deliberate HTTP errors pass through untouched instead of being
        # re-wrapped by the generic handler below.
        raise
    except Exception as e:
        logger.error(f"❌ Voice creation failed: {e}")
        raise HTTPException(status_code=500, detail=f"Voice creation failed: {str(e)}")
    finally:
        # Always remove the spooled upload so failed requests do not leak files.
        if temp_path is not None and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary upload {temp_path}: {cleanup_error}")
519
+
520
@app.delete("/api/voices/{voice_id}")
async def delete_voice(voice_id: str):
    """Remove a custom voice, its audio file and its saved library entry.

    Raises:
        HTTPException: 404 if the voice is unknown, 400 if it is a builtin
            voice, 500 if deletion fails.
    """
    if voice_id not in voice_library:
        raise HTTPException(status_code=404, detail="Voice not found")

    voice_info = voice_library[voice_id]
    is_builtin = voice_info.get("type") == "builtin"
    if is_builtin:
        raise HTTPException(status_code=400, detail="Cannot delete builtin voices")

    try:
        # Remove the reference clip first, if one is recorded and still present.
        has_audio_file = "audio_path" in voice_info and os.path.exists(voice_info["audio_path"])
        if has_audio_file:
            os.unlink(voice_info["audio_path"])

        # Drop the entry from the in-memory library and persist the change.
        voice_name = voice_info["name"]
        del voice_library[voice_id]
        save_voice_library()

        response = {
            "success": True,
            "message": f"Voice '{voice_name}' deleted successfully"
        }
        return response

    except Exception as e:
        logger.error(f"❌ Voice deletion failed: {e}")
        raise HTTPException(status_code=500, detail=f"Voice deletion failed: {str(e)}")
549
+
550
  @app.post("/api/tts/synthesize", response_model=TTSResponse)
551
  async def synthesize_speech(request: TTSRequest):
552
  """
553
+ Synthesize speech from text using voice ID
554
  """
555
  try:
556
  if MODEL is None:
 
562
  if len(request.text) > 500:
563
  raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
564
 
565
+ if request.voice_id not in voice_library:
566
+ raise HTTPException(status_code=404, detail=f"Voice '{request.voice_id}' not found")
567
+
568
  start_time = time.time()
569
 
570
+ # Generate audio using voice ID
571
+ sample_rate, audio_data = generate_tts_audio(
572
+ request.text,
573
+ request.voice_id,
574
+ request.exaggeration,
575
+ request.temperature,
576
+ request.seed,
577
+ request.cfg_weight
578
+ )
579
 
580
+ generation_time = time.time() - start_time
 
 
 
 
 
 
581
 
582
+ # Save audio file
583
+ audio_id = generate_id()
584
+ audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
585
+ sf.write(audio_path, audio_data, sample_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
+ # Cache audio info
588
+ voice_name = voice_library[request.voice_id]["name"]
589
+ audio_cache[audio_id] = {
590
+ "path": audio_path,
591
+ "text": request.text,
592
+ "voice_id": request.voice_id,
593
+ "voice_name": voice_name,
594
+ "sample_rate": sample_rate,
595
+ "duration": len(audio_data) / sample_rate,
596
+ "generated_at": time.time(),
597
+ "generation_time": generation_time,
598
+ "real_chatterbox": CHATTERBOX_AVAILABLE
599
+ }
600
+
601
+ message = f"Speech synthesized successfully using voice '{voice_name}'"
602
+ if not CHATTERBOX_AVAILABLE:
603
+ message += " (using fallback - upload ChatterboxTTS for real synthesis)"
604
+
605
+ logger.info(f"βœ… Audio saved: {audio_id} ({generation_time:.2f}s) with voice '{voice_name}'")
606
+
607
+ return TTSResponse(
608
+ success=True,
609
+ audio_id=audio_id,
610
+ message=message,
611
+ sample_rate=sample_rate,
612
+ duration=len(audio_data) / sample_rate
613
+ )
614
 
615
  except HTTPException:
616
  raise
 
620
 
621
  @app.get("/api/audio/{audio_id}")
622
  async def get_audio(audio_id: str):
623
+ """Download generated audio file"""
 
 
624
  if audio_id not in audio_cache:
625
  raise HTTPException(status_code=404, detail="Audio not found")
626
 
 
644
 
645
  @app.get("/api/audio/{audio_id}/info")
646
  async def get_audio_info(audio_id: str):
647
+ """Get audio file information"""
 
 
648
  if audio_id not in audio_cache:
649
  raise HTTPException(status_code=404, detail="Audio not found")
650
 
 
652
 
653
  @app.get("/api/audio")
654
  async def list_audio():
655
+ """List all generated audio files"""
 
 
656
  return {
657
  "audio_files": [
658
  {
659
  "audio_id": audio_id,
660
  "text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
661
+ "voice_name": info.get("voice_name", "Unknown"),
662
  "duration": info["duration"],
663
  "generated_at": info["generated_at"],
664
  "real_chatterbox": info.get("real_chatterbox", False)
 
670
 
671
  # Gradio interface
672
  def create_gradio_interface():
673
+ """Create Gradio interface with voice management"""
674
+
675
+ def get_voice_choices():
676
+ """Get voice choices for dropdown"""
677
+ choices = []
678
+ for voice_id, voice_info in voice_library.items():
679
+ voice_type = "πŸ”§" if voice_info["type"] == "builtin" else "🎭"
680
+ choices.append((f"{voice_type} {voice_info['name']} - {voice_info['description']}", voice_id))
681
+ return choices
682
+
683
+ def refresh_voice_choices():
684
+ """Refresh voice dropdown"""
685
+ return gr.update(choices=get_voice_choices())
686
+
687
+ def create_voice_ui(voice_name, voice_description, audio_file):
688
+ """Create voice from UI"""
689
+ try:
690
+ if not voice_name.strip():
691
+ return "❌ Please enter a voice name", gr.update()
692
+
693
+ if audio_file is None:
694
+ return "❌ Please upload an audio file", gr.update()
695
+
696
+ voice_id, voice_entry = create_voice_from_audio(
697
+ audio_file,
698
+ voice_name.strip(),
699
+ voice_description.strip() or "Custom voice"
700
+ )
701
+
702
+ if voice_id:
703
+ updated_choices = get_voice_choices()
704
+ return (
705
+ f"βœ… Voice '{voice_name}' created successfully!\n"
706
+ f"πŸ†” Voice ID: {voice_id}\n"
707
+ f"πŸ“ Audio saved and ready to use\n"
708
+ f"🎭 Available in voice selection dropdown",
709
+ gr.update(choices=updated_choices, value=voice_id)
710
+ )
711
+ else:
712
+ return "❌ Failed to create voice", gr.update()
713
+
714
+ except Exception as e:
715
+ logger.error(f"UI voice creation failed: {e}")
716
+ return f"❌ Voice creation failed: {str(e)}", gr.update()
717
+
718
+ def generate_speech_ui(text, voice_id, exag, temp, seed_val, cfg):
719
+ """Generate speech from UI using voice ID"""
720
+ try:
721
+ if not text.strip():
722
+ return None, "❌ Please enter some text"
723
+
724
+ if len(text) > 300:
725
+ return None, "❌ Text too long (max 300 characters)"
726
+
727
+ if not voice_id or voice_id not in voice_library:
728
+ return None, "❌ Please select a valid voice"
729
+
730
+ start_time = time.time()
731
+
732
+ # Generate audio using voice ID
733
+ sample_rate, audio_data = generate_tts_audio(
734
+ text, voice_id, exag, temp, int(seed_val), cfg
735
+ )
736
+
737
+ generation_time = time.time() - start_time
738
+ duration = len(audio_data) / sample_rate
739
+
740
+ voice_name = voice_library[voice_id]["name"]
741
+ voice_type = voice_library[voice_id]["type"]
742
+
743
+ if CHATTERBOX_AVAILABLE:
744
+ status = f"""βœ… Real ChatterboxTTS synthesis completed!
745
+
746
+ 🎭 Voice: {voice_name} ({voice_type})
747
+ ⏱️ Generation time: {generation_time:.2f}s
748
+ 🎡 Audio duration: {duration:.2f}s
749
+ πŸ“Š Sample rate: {sample_rate} Hz
750
+ πŸ”Š Audio samples: {len(audio_data):,}
751
+ """
752
+ else:
753
+ status = f"""⚠️ Fallback audio generated (beep sound)
754
+
755
+ 🚨 This is NOT real speech synthesis!
756
+ 🎭 Voice: {voice_name} ({voice_type})
757
+ πŸ“¦ Upload ChatterboxTTS package for real synthesis
758
+ ⏱️ Generation time: {generation_time:.2f}s
759
+ 🎡 Audio duration: {duration:.2f}s
760
+
761
+ πŸ’‘ To fix: Upload your ChatterboxTTS files to this Space
762
+ """
763
+
764
+ return (sample_rate, audio_data), status
765
+
766
+ except Exception as e:
767
+ logger.error(f"UI generation failed: {e}")
768
+ return None, f"❌ Generation failed: {str(e)}"
769
+
770
+ def delete_voice_ui(voice_id):
771
+ """Delete voice from UI"""
772
+ try:
773
+ if not voice_id or voice_id not in voice_library:
774
+ return "❌ Please select a voice to delete", gr.update()
775
+
776
+ voice_info = voice_library[voice_id]
777
+
778
+ if voice_info.get("type") == "builtin":
779
+ return "❌ Cannot delete builtin voices", gr.update()
780
+
781
+ voice_name = voice_info["name"]
782
+
783
+ # Delete audio file
784
+ if "audio_path" in voice_info and os.path.exists(voice_info["audio_path"]):
785
+ os.unlink(voice_info["audio_path"])
786
+
787
+ # Remove from library
788
+ del voice_library[voice_id]
789
+ save_voice_library()
790
+
791
+ updated_choices = get_voice_choices()
792
+ return (
793
+ f"βœ… Voice '{voice_name}' deleted successfully",
794
+ gr.update(choices=updated_choices, value=updated_choices[0][1] if updated_choices else None)
795
+ )
796
+
797
+ except Exception as e:
798
+ logger.error(f"UI voice deletion failed: {e}")
799
+ return f"❌ Voice deletion failed: {str(e)}", gr.update()
800
 
801
+ with gr.Blocks(title="ChatterboxTTS Voice Manager", theme=gr.themes.Soft()) as demo:
802
 
803
  # Status indicator at the top
804
  if CHATTERBOX_AVAILABLE:
 
815
  """)
816
 
817
  gr.Markdown("""
818
+ # 🎡 ChatterboxTTS Voice Manager
819
 
820
+ **Advanced text-to-speech with custom voice cloning and voice library management**
821
  """)
822
 
823
+ with gr.Tabs():
824
+ # Text-to-Speech Tab
825
+ with gr.TabItem("🎡 Generate Speech"):
826
+ with gr.Row():
827
+ with gr.Column():
828
+ text_input = gr.Textbox(
829
+ value="Hello, this is ChatterboxTTS with custom voice cloning. I can speak in any voice you train me with!",
830
+ label="Text to synthesize (max 300 characters)",
831
+ max_lines=5,
832
+ placeholder="Enter your text here..."
833
+ )
834
+
835
+ voice_selector = gr.Dropdown(
836
+ label="🎭 Select Voice",
837
+ choices=get_voice_choices(),
838
+ value=list(voice_library.keys())[0] if voice_library else None,
839
+ interactive=True,
840
+ info="Choose from builtin voices (πŸ”§) or your custom voices (🎭)"
841
+ )
842
+
843
+ with gr.Row():
844
+ generate_btn = gr.Button("🎡 Generate Speech", variant="primary")
845
+ refresh_voices_btn = gr.Button("πŸ”„ Refresh Voices", size="sm")
846
+
847
+ with gr.Row():
848
+ exaggeration = gr.Slider(
849
+ 0.25, 2,
850
+ step=0.05,
851
+ label="Exaggeration",
852
+ value=0.5,
853
+ info="Controls expressiveness (0.5 = neutral)"
854
+ )
855
+
856
+ cfg_weight = gr.Slider(
857
+ 0.2, 1,
858
+ step=0.05,
859
+ label="CFG Weight",
860
+ value=0.5,
861
+ info="Controls pace and clarity"
862
+ )
863
+
864
+ with gr.Accordion("Advanced Settings", open=False):
865
+ temperature = gr.Slider(
866
+ 0.05, 5,
867
+ step=0.05,
868
+ label="Temperature",
869
+ value=0.8,
870
+ info="Controls randomness"
871
+ )
872
+
873
+ seed = gr.Number(
874
+ value=0,
875
+ label="Seed (0 = random)",
876
+ info="Set to non-zero for reproducible results"
877
+ )
878
+
879
+ with gr.Column():
880
+ audio_output = gr.Audio(label="πŸ”Š Generated Speech")
881
+
882
+ status_text = gr.Textbox(
883
+ label="πŸ“Š Generation Status",
884
+ interactive=False,
885
+ lines=8,
886
+ placeholder="Select a voice and click 'Generate Speech' to start..."
887
+ )
888
 
889
+ # Voice Management Tab
890
+ with gr.TabItem("🎭 Voice Library"):
891
+ with gr.Row():
892
+ with gr.Column():
893
+ gr.Markdown("### πŸ“š Available Voices")
894
+
895
+ voices_display = gr.HTML(
896
+ value=f"""
897
+ <div style="max-height: 300px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
898
+ {''.join([f"<p><strong>{voice_info['name']}</strong> ({voice_info['type']})<br><small>{voice_info['description']}</small></p>" for voice_info in voice_library.values()])}
899
+ </div>
900
+ """
901
+ )
902
+
903
+ gr.Markdown("### πŸ—‘οΈ Delete Voice")
904
+ delete_voice_selector = gr.Dropdown(
905
+ label="Select voice to delete",
906
+ choices=[(f"{info['name']} ({info['type']})", vid) for vid, info in voice_library.items() if info['type'] == 'custom'],
907
+ value=None
908
+ )
909
+
910
+ delete_voice_btn = gr.Button("πŸ—‘οΈ Delete Selected Voice", variant="stop")
911
+ delete_status = gr.Textbox(label="Delete Status", interactive=False)
912
+
913
+ with gr.Column():
914
+ gr.Markdown("### βž• Create New Voice")
915
+
916
+ new_voice_name = gr.Textbox(
917
+ label="Voice Name",
918
+ placeholder="e.g., 'John's Voice', 'Narrator Voice'",
919
+ value=""
920
+ )
921
+
922
+ new_voice_description = gr.Textbox(
923
+ label="Voice Description",
924
+ placeholder="e.g., 'Professional male voice', 'Warm female narrator'",
925
+ value=""
926
+ )
927
+
928
+ new_voice_audio = gr.Audio(
929
+ label="Upload Voice Sample",
930
+ type="numpy",
931
+ info="Upload 5-30 seconds of clear speech"
932
+ )
933
+
934
+ create_voice_btn = gr.Button("🎯 Create Voice", variant="primary")
935
+
936
+ create_status = gr.Textbox(
937
+ label="πŸ“Š Creation Status",
938
+ interactive=False,
939
+ lines=6
940
+ )
941
 
942
+ # Voice Library Info Tab
943
+ with gr.TabItem("πŸ“‹ Voice Guide"):
944
+ gr.Markdown(f"""
945
+ ## 🎭 Voice Library Management
 
 
 
 
 
 
 
946
 
947
+ ### πŸ“š Current Library Status
948
+ - **Total Voices**: {len(voice_library)}
949
+ - **Builtin Voices**: {len([v for v in voice_library.values() if v['type'] == 'builtin'])}
950
+ - **Custom Voices**: {len([v for v in voice_library.values() if v['type'] == 'custom'])}
 
 
951
 
952
+ ### πŸ”§ Builtin Voices
953
+ These are pre-configured voices that come with the system:
954
+ {chr(10).join([f"- **{voice_info['name']}**: {voice_info['description']}" for voice_info in voice_library.values() if voice_info['type'] == 'builtin'])}
 
 
 
 
 
 
 
 
 
 
 
 
 
955
 
956
+ ### 🎯 Creating Custom Voices
 
 
 
 
 
 
 
 
 
 
 
 
 
957
 
958
+ #### πŸ“ Best Practices:
959
+ 1. **Audio Quality**: Use clear, noise-free recordings
960
+ 2. **Duration**: 5-30 seconds of natural speech
961
+ 3. **Content**: Normal conversational speech works best
962
+ 4. **Format**: WAV, MP3, or FLAC files supported
963
+ 5. **Voice Consistency**: Use the same speaker throughout
964
 
965
+ #### 🎀 Recording Tips:
966
+ - Record in a quiet environment
967
+ - Speak naturally and clearly
968
+ - Avoid background noise
969
+ - Use a decent microphone if possible
970
+ - Read a paragraph of normal text
971
 
972
+ #### πŸ”„ Voice Management:
973
+ - **Create**: Upload audio + provide name and description
974
+ - **Use**: Select from dropdown in speech generation
975
+ - **Delete**: Remove custom voices you no longer need
976
+ - **Persistent**: Custom voices are saved permanently
 
 
 
 
 
 
977
 
978
+ ### πŸš€ Usage Workflow:
979
+ 1. **Upload Voice Sample** β†’ Create custom voice
980
+ 2. **Select Voice** β†’ Choose from library
981
+ 3. **Generate Speech** β†’ Use selected voice for TTS
982
+ 4. **Manage Library** β†’ Add, delete, organize voices
983
 
984
+ ### πŸ”„ API Integration:
985
+ ```python
986
+ # List voices
987
+ GET /api/voices
988
 
989
+ # Create voice
990
+ POST /api/voices/create
 
 
991
 
992
+ # Generate speech with voice
993
+ POST /api/tts/synthesize
994
+ {{
995
+ "text": "Hello world",
996
+ "voice_id": "your_voice_id"
997
+ }}
998
 
999
+ # Delete voice
1000
+ DELETE /api/voices/voice_id
1001
+ ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
 
1003
+ ### πŸ’‘ Pro Tips:
1004
+ - **Voice Naming**: Use descriptive names like "John_Professional" or "Sarah_Narrator"
1005
+ - **Voice Testing**: Generate short test phrases after creating voices
1006
+ - **Voice Backup**: Custom voices are saved to disk automatically
1007
+ - **Voice Sharing**: Voice IDs can be shared via API
1008
+ """)
1009
 
1010
+ # Event handlers
1011
  generate_btn.click(
1012
  fn=generate_speech_ui,
1013
+ inputs=[text_input, voice_selector, exaggeration, temperature, seed, cfg_weight],
1014
  outputs=[audio_output, status_text]
1015
  )
1016
 
1017
+ refresh_voices_btn.click(
1018
+ fn=refresh_voice_choices,
1019
+ outputs=[voice_selector]
1020
+ )
1021
+
1022
+ create_voice_btn.click(
1023
+ fn=create_voice_ui,
1024
+ inputs=[new_voice_name, new_voice_description, new_voice_audio],
1025
+ outputs=[create_status, voice_selector]
1026
+ )
1027
+
1028
+ delete_voice_btn.click(
1029
+ fn=delete_voice_ui,
1030
+ inputs=[delete_voice_selector],
1031
+ outputs=[delete_status, voice_selector]
1032
+ )
1033
+
1034
+ # System info with voice library status
1035
  model_status = "βœ… Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "⚠️ Fallback Model (Beep Sounds)"
1036
  chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
1037
 
 
1039
  ### πŸ“Š System Status
1040
  - **Model**: {model_status}
1041
  - **Device**: {DEVICE}
 
1042
  - **ChatterboxTTS**: {chatterbox_status}
1043
+ - **Voice Library**: {len(voice_library)} voices loaded
1044
+ - **Generated Files**: {len(audio_cache)}
1045
+ - **Storage**: `{VOICES_DIR}/` for voices, `{AUDIO_DIR}/` for output
1046
 
1047
  {'''### πŸŽ‰ Production Ready!
1048
+ Your ChatterboxTTS model is loaded with voice management system.''' if CHATTERBOX_AVAILABLE else '''### ⚠️ Action Required
1049
  **You're hearing beep sounds because ChatterboxTTS isn't loaded.**
1050
 
1051
+ Voice management is working, but you need ChatterboxTTS for real synthesis.'''}
 
 
 
 
 
1052
  """)
1053
 
1054
  return demo
1055
 
1056
  # Main execution
1057
  if __name__ == "__main__":
1058
+ logger.info("πŸŽ‰ Starting ChatterboxTTS Voice Management Service...")
1059
 
1060
  # Model status
1061
  if CHATTERBOX_AVAILABLE and MODEL:
 
1068
  logger.info(f"Model Status: {model_status}")
1069
  logger.info(f"Device: {DEVICE}")
1070
  logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
1071
+ logger.info(f"Voice Library: {len(voice_library)} voices loaded")
1072
+ logger.info(f"Custom Voices: {len([v for v in voice_library.values() if v['type'] == 'custom'])}")
1073
 
1074
  if not CHATTERBOX_AVAILABLE:
1075
  logger.warning("🚨 IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
 
1076
 
1077
  if os.getenv("SPACE_ID"):
1078
  # Running in Hugging Face Spaces
 
1097
 
1098
  logger.info("🌐 FastAPI: http://localhost:8000")
1099
  logger.info("πŸ“š API Docs: http://localhost:8000/docs")
1100
+ logger.info("πŸ”— API Endpoints:")
1101
+ logger.info(" - GET /api/voices")
1102
+ logger.info(" - POST /api/voices/create")
1103
+ logger.info(" - DELETE /api/voices/{voice_id}")
1104
+ logger.info(" - POST /api/tts/synthesize")
1105
 
1106
  # Start Gradio
1107
  demo = create_gradio_interface()