Spaces:

ceymox
/

Chatterbox_AP

Sleeping

App Files Files Community

ceymox committed on Jun 11

Commit

7810a88

verified ·

1 Parent(s): 4ac6a9c

Update app.py

Browse files

Files changed (1) hide show

app.py +656 -293

app.py CHANGED Viewed

@@ -9,12 +9,13 @@ import uuid
 import logging
 import requests
 import io
-from typing import Optional, Dict, Any
 from pathlib import Path
 import gradio as gr
 import spaces
-from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -31,10 +32,148 @@ logger.info(f"🚀 Running on device: {DEVICE}")
 MODEL = None
 CHATTERBOX_AVAILABLE = False
-# Storage for generated audio
 AUDIO_DIR = "generated_audio"
 os.makedirs(AUDIO_DIR, exist_ok=True)
 audio_cache = {}
 def load_chatterbox_model():
     """Try multiple ways to load ChatterboxTTS from Resemble AI"""
@@ -81,52 +220,6 @@ def load_chatterbox_model():
     except Exception as e:
         logger.warning(f"Method 3 failed with error: {e}")
-    # Method 4: Try exploring the installed package
-    try:
-        import chatterbox
-        import inspect
-        # Log what's available in the chatterbox package
-        logger.info(f"Chatterbox module path: {chatterbox.__file__}")
-        logger.info(f"Chatterbox contents: {dir(chatterbox)}")
-        # Try to find ChatterboxTTS class anywhere in the module
-        for name, obj in inspect.getmembers(chatterbox):
-            if name == 'ChatterboxTTS' or (inspect.isclass(obj) and 'TTS' in name):
-                logger.info(f"Found potential TTS class: {name}")
-                MODEL = obj.from_pretrained(DEVICE)
-                CHATTERBOX_AVAILABLE = True
-                return True
-        raise ImportError("ChatterboxTTS class not found in chatterbox package")
-    except ImportError as e:
-        logger.warning(f"Method 4 failed: {e}")
-    except Exception as e:
-        logger.warning(f"Method 4 failed with error: {e}")
-    # Method 5: Check if the GitHub repo was installed correctly
-    try:
-        import pkg_resources
-        try:
-            pkg_resources.get_distribution('chatterbox')
-            logger.info("✅ Chatterbox package is installed")
-        except pkg_resources.DistributionNotFound:
-            logger.warning("❌ Chatterbox package not found in installed packages")
-        # Try to import and inspect what we got
-        import chatterbox
-        chatterbox_path = chatterbox.__path__[0] if hasattr(chatterbox, '__path__') else str(chatterbox.__file__)
-        logger.info(f"Chatterbox installed at: {chatterbox_path}")
-        # List all available modules/classes
-        import pkgutil
-        for importer, modname, ispkg in pkgutil.walk_packages(chatterbox.__path__, chatterbox.__name__ + "."):
-            logger.info(f"Available module: {modname}")
-    except Exception as e:
-        logger.warning(f"Package inspection failed: {e}")
     # If we get here, the GitHub repo might have a different structure
     logger.error("❌ Could not load ChatterboxTTS from Resemble AI repository")
     logger.error("💡 The GitHub repo might have a different structure than expected")
@@ -135,30 +228,6 @@ def load_chatterbox_model():
     return False
-def download_audio_from_url(url):
-    """Download audio from URL and save to temporary file"""
-    try:
-        logger.info(f"📥 Downloading reference audio from: {url}")
-        response = requests.get(url, timeout=30, headers={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        })
-        if response.status_code == 200:
-            # Create temporary file
-            temp_file = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
-            temp_file.write(response.content)
-            temp_file.close()
-            logger.info(f"✅ Audio downloaded to: {temp_file.name}")
-            return temp_file.name
-        else:
-            logger.error(f"❌ HTTP {response.status_code} when downloading audio")
-            return None
-    except Exception as e:
-        logger.error(f"❌ Error downloading audio from URL: {e}")
-        return None
 def get_or_load_model():
     """Load ChatterboxTTS model if not already loaded"""
     global MODEL
@@ -171,7 +240,6 @@ def get_or_load_model():
             logger.info("✅ ChatterboxTTS model loaded successfully")
         else:
             logger.error("❌ Failed to load ChatterboxTTS - using fallback")
-            # Create a better fallback that shows the issue
             create_fallback_model()
     return MODEL
@@ -230,15 +298,29 @@ def generate_id():
     """Generate unique ID"""
     return str(uuid.uuid4())
 # Pydantic models for API
 class TTSRequest(BaseModel):
     text: str
-    audio_prompt_url: Optional[str] = "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
     exaggeration: Optional[float] = 0.5
     temperature: Optional[float] = 0.8
     cfg_weight: Optional[float] = 0.5
     seed: Optional[int] = 0
 class TTSResponse(BaseModel):
     success: bool
     audio_id: Optional[str] = None
@@ -260,14 +342,14 @@ except Exception as e:
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
-    audio_prompt_path_input: str,
     exaggeration_input: float,
     temperature_input: float,
     seed_num_input: int,
     cfgw_input: float
 ) -> tuple[int, np.ndarray]:
     """
-    Generate TTS audio using ChatterboxTTS model
     """
     current_model = get_or_load_model()
@@ -278,29 +360,25 @@ def generate_tts_audio(
         set_seed(int(seed_num_input))
     logger.info(f"🎵 Generating audio for: '{text_input[:50]}...'")
     if not CHATTERBOX_AVAILABLE:
         logger.warning("🚨 USING FALLBACK - Real ChatterboxTTS not found!")
-        logger.warning("📋 To fix: Upload your ChatterboxTTS package to this Space")
-    # Handle audio prompt - download if it's a URL
-    audio_prompt_path = audio_prompt_path_input
     temp_audio_file = None
     try:
-        # Check if it's a URL
-        if audio_prompt_path_input and (audio_prompt_path_input.startswith('http://') or audio_prompt_path_input.startswith('https://')):
-            logger.info(f"🌐 Detected URL, downloading audio: {audio_prompt_path_input}")
-            temp_audio_file = download_audio_from_url(audio_prompt_path_input)
-            if temp_audio_file:
-                audio_prompt_path = temp_audio_file
-                logger.info(f"✅ Using downloaded audio: {audio_prompt_path}")
-            else:
-                logger.warning("⚠️ Failed to download audio, proceeding without reference")
-                audio_prompt_path = None
-        elif audio_prompt_path_input and not os.path.exists(audio_prompt_path_input):
-            logger.warning(f"⚠️ Audio file not found: {audio_prompt_path_input}, proceeding without reference")
-            audio_prompt_path = None
         # Generate audio
         wav = current_model.generate(
@@ -322,8 +400,8 @@ def generate_tts_audio(
         logger.error(f"❌ Audio generation failed: {e}")
         raise
     finally:
-        # Clean up temporary file
-        if temp_audio_file and os.path.exists(temp_audio_file):
             try:
                 os.unlink(temp_audio_file)
                 logger.info(f"🗑️ Cleaned up temporary file: {temp_audio_file}")
@@ -332,9 +410,9 @@ def generate_tts_audio(
 # FastAPI app for API endpoints
 app = FastAPI(
-    title="ChatterboxTTS API",
-    description="High-quality text-to-speech synthesis using ChatterboxTTS",
-    version="1.0.0"
 )
 app.add_middleware(
@@ -349,15 +427,18 @@ app.add_middleware(
 async def root():
     """API status endpoint"""
     return {
-        "service": "ChatterboxTTS API",
-        "version": "1.0.0",
         "status": "operational" if MODEL else "model_loading",
         "model_loaded": MODEL is not None,
         "real_chatterbox": CHATTERBOX_AVAILABLE,
         "device": DEVICE,
         "message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
         "endpoints": {
             "synthesize": "/api/tts/synthesize",
             "audio": "/api/audio/{audio_id}",
             "health": "/health"
         }
@@ -371,14 +452,105 @@ async def health_check():
         "model_loaded": MODEL is not None,
         "real_chatterbox": CHATTERBOX_AVAILABLE,
         "device": DEVICE,
         "timestamp": time.time(),
         "warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
     }
 @app.post("/api/tts/synthesize", response_model=TTSResponse)
 async def synthesize_speech(request: TTSRequest):
     """
-    Synthesize speech from text
     """
     try:
         if MODEL is None:
@@ -390,70 +562,55 @@ async def synthesize_speech(request: TTSRequest):
         if len(request.text) > 500:
             raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
         start_time = time.time()
-        # Handle audio prompt URL
-        audio_prompt_path = request.audio_prompt_url
-        temp_audio_file = None
-        if request.audio_prompt_url and (request.audio_prompt_url.startswith('http://') or request.audio_prompt_url.startswith('https://')):
-            temp_audio_file = download_audio_from_url(request.audio_prompt_url)
-            if temp_audio_file:
-                audio_prompt_path = temp_audio_file
-            else:
-                logger.warning("Failed to download reference audio, proceeding without")
-                audio_prompt_path = None
-        try:
-            # Generate audio
-            sample_rate, audio_data = generate_tts_audio(
-                request.text,
-                audio_prompt_path,
-                request.exaggeration,
-                request.temperature,
-                request.seed,
-                request.cfg_weight
-            )
-            generation_time = time.time() - start_time
-            # Save audio file
-            audio_id = generate_id()
-            audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
-            sf.write(audio_path, audio_data, sample_rate)
-            # Cache audio info
-            audio_cache[audio_id] = {
-                "path": audio_path,
-                "text": request.text,
-                "sample_rate": sample_rate,
-                "duration": len(audio_data) / sample_rate,
-                "generated_at": time.time(),
-                "generation_time": generation_time,
-                "real_chatterbox": CHATTERBOX_AVAILABLE
-            }
-            message = "Speech synthesized successfully"
-            if not CHATTERBOX_AVAILABLE:
-                message += " (using fallback - upload ChatterboxTTS for real synthesis)"
-            logger.info(f"✅ Audio saved: {audio_id} ({generation_time:.2f}s)")
-            return TTSResponse(
-                success=True,
-                audio_id=audio_id,
-                message=message,
-                sample_rate=sample_rate,
-                duration=len(audio_data) / sample_rate
-            )
-        finally:
-            # Clean up temporary audio file
-            if temp_audio_file and os.path.exists(temp_audio_file):
-                try:
-                    os.unlink(temp_audio_file)
-                except:
-                    pass
     except HTTPException:
         raise
@@ -463,9 +620,7 @@ async def synthesize_speech(request: TTSRequest):
 @app.get("/api/audio/{audio_id}")
 async def get_audio(audio_id: str):
-    """
-    Download generated audio file
-    """
     if audio_id not in audio_cache:
         raise HTTPException(status_code=404, detail="Audio not found")
@@ -489,9 +644,7 @@ async def get_audio(audio_id: str):
 @app.get("/api/audio/{audio_id}/info")
 async def get_audio_info(audio_id: str):
-    """
-    Get audio file information
-    """
     if audio_id not in audio_cache:
         raise HTTPException(status_code=404, detail="Audio not found")
@@ -499,14 +652,13 @@ async def get_audio_info(audio_id: str):
 @app.get("/api/audio")
 async def list_audio():
-    """
-    List all generated audio files
-    """
     return {
         "audio_files": [
             {
                 "audio_id": audio_id,
                 "text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
                 "duration": info["duration"],
                 "generated_at": info["generated_at"],
                 "real_chatterbox": info.get("real_chatterbox", False)
@@ -518,9 +670,135 @@ async def list_audio():
 # Gradio interface
 def create_gradio_interface():
-    """Create Gradio interface with better ChatterboxTTS status"""
-    with gr.Blocks(title="ChatterboxTTS", theme=gr.themes.Soft()) as demo:
         # Status indicator at the top
         if CHATTERBOX_AVAILABLE:
@@ -537,141 +815,223 @@ def create_gradio_interface():
         """)
         gr.Markdown("""
-        # 🎵 ChatterboxTTS
-        High-quality text-to-speech synthesis with voice cloning capabilities.
         """)
-        if not CHATTERBOX_AVAILABLE:
-            gr.Markdown("""
-            ### 🚨 Currently Using Fallback Model
-            You're hearing beep sounds because the real ChatterboxTTS isn't loaded.
-            **The Resemble AI ChatterboxTTS from GitHub should auto-install from requirements.txt.**
-            If you're still seeing this message:
-            1. **Check build logs** for any installation errors
-            2. **Verify requirements.txt** contains: `git+https://github.com/resemble-ai/chatterbox.git`
-            3. **Restart the Space** if needed
-            4. **Check logs** for import errors
-            📁 GitHub repo being used: https://github.com/resemble-ai/chatterbox.git
-            If the GitHub installation fails, you can alternatively upload the package manually.
-            """)
-        with gr.Row():
-            with gr.Column():
-                text_input = gr.Textbox(
-                    value="Hello, this is ChatterboxTTS. I can generate natural-sounding speech from any text you provide.",
-                    label="Text to synthesize (max 300 characters)",
-                    max_lines=5,
-                    placeholder="Enter your text here..."
-                )
-                audio_prompt = gr.Textbox(
-                    value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
-                    label="Reference Audio URL or File Path",
-                    placeholder="https://example.com/audio.wav or /path/to/local/file.wav",
-                    info="URL will be downloaded automatically, or use local file path"
-                )
-                with gr.Row():
-                    exaggeration = gr.Slider(
-                        0.25, 2,
-                        step=0.05,
-                        label="Exaggeration",
-                        value=0.5,
-                        info="Controls expressiveness (0.5 = neutral)"
-                    )
-                    cfg_weight = gr.Slider(
-                        0.2, 1,
-                        step=0.05,
-                        label="CFG Weight",
-                        value=0.5,
-                        info="Controls pace and clarity"
-                    )
-                with gr.Accordion("Advanced Settings", open=False):
-                    temperature = gr.Slider(
-                        0.05, 5,
-                        step=0.05,
-                        label="Temperature",
-                        value=0.8,
-                        info="Controls randomness"
-                    )
-                    seed = gr.Number(
-                        value=0,
-                        label="Seed (0 = random)",
-                        info="Set to non-zero for reproducible results"
-                    )
-                generate_btn = gr.Button("🎵 Generate Speech", variant="primary")
-            with gr.Column():
-                audio_output = gr.Audio(label="Generated Speech")
-                status_text = gr.Textbox(
-                    label="Status",
-                    interactive=False,
-                    placeholder="Click 'Generate Speech' to start..."
-                )
-        def generate_speech_ui(text, prompt_url, exag, temp, seed_val, cfg):
-            """Generate speech from UI"""
-            try:
-                if not text.strip():
-                    return None, "❌ Please enter some text"
-                if len(text) > 300:
-                    return None, "❌ Text too long (max 300 characters)"
-                start_time = time.time()
-                # Generate audio
-                sample_rate, audio_data = generate_tts_audio(
-                    text, prompt_url, exag, temp, int(seed_val), cfg
-                )
-                generation_time = time.time() - start_time
-                duration = len(audio_data) / sample_rate
-                if CHATTERBOX_AVAILABLE:
-                    status = f"""✅ Real ChatterboxTTS synthesis completed!
-⏱️ Generation time: {generation_time:.2f}s
-🎵 Audio duration: {duration:.2f}s
-📊 Sample rate: {sample_rate} Hz
-🔊 Audio samples: {len(audio_data):,}
-                    """
-                else:
-                    status = f"""⚠️ Fallback audio generated (beep sound)
-🚨 This is NOT real speech synthesis!
-📦 Upload ChatterboxTTS package for real synthesis
-⏱️ Generation time: {generation_time:.2f}s
-🎵 Audio duration: {duration:.2f}s
-💡 To fix: Upload your ChatterboxTTS files to this Space
-                    """
-                return (sample_rate, audio_data), status
-            except Exception as e:
-                logger.error(f"UI generation failed: {e}")
-                return None, f"❌ Generation failed: {str(e)}"
         generate_btn.click(
             fn=generate_speech_ui,
-            inputs=[text_input, audio_prompt, exaggeration, temperature, seed, cfg_weight],
             outputs=[audio_output, status_text]
         )
-        # System info with better warnings
         model_status = "✅ Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "⚠️ Fallback Model (Beep Sounds)"
         chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
@@ -679,26 +1039,23 @@ def create_gradio_interface():
         ### 📊 System Status
         - **Model**: {model_status}
         - **Device**: {DEVICE}
-        - **Generated Files**: {len(audio_cache)}
         - **ChatterboxTTS**: {chatterbox_status}
         {'''### 🎉 Production Ready!
-        Your ChatterboxTTS model is loaded and working correctly.''' if CHATTERBOX_AVAILABLE else '''### ⚠️ Action Required
         **You're hearing beep sounds because ChatterboxTTS isn't loaded.**
-        **To fix this:**
-        1. Upload your ChatterboxTTS package to this Space
-        2. Ensure proper directory structure with `__init__.py` files
-        3. Restart the Space
-        The current fallback generates beeps to indicate missing package.'''}
         """)
     return demo
 # Main execution
 if __name__ == "__main__":
-    logger.info("🎉 Starting ChatterboxTTS Service...")
     # Model status
     if CHATTERBOX_AVAILABLE and MODEL:
@@ -711,10 +1068,11 @@ if __name__ == "__main__":
     logger.info(f"Model Status: {model_status}")
     logger.info(f"Device: {DEVICE}")
     logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
     if not CHATTERBOX_AVAILABLE:
         logger.warning("🚨 IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
-        logger.warning("📁 Expected location: ./chatterbox/src/chatterbox/tts.py")
     if os.getenv("SPACE_ID"):
         # Running in Hugging Face Spaces
@@ -739,6 +1097,11 @@ if __name__ == "__main__":
         logger.info("🌐 FastAPI: http://localhost:8000")
         logger.info("📚 API Docs: http://localhost:8000/docs")
         # Start Gradio
         demo = create_gradio_interface()

 import logging
 import requests
 import io
+import json
+from typing import Optional, Dict, Any, List
 from pathlib import Path
 import gradio as gr
 import spaces
+from fastapi import FastAPI, HTTPException, UploadFile, File
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 MODEL = None
 CHATTERBOX_AVAILABLE = False
+# Storage directories
 AUDIO_DIR = "generated_audio"
+VOICES_DIR = "custom_voices"
 os.makedirs(AUDIO_DIR, exist_ok=True)
+os.makedirs(VOICES_DIR, exist_ok=True)
+# Voice storage
 audio_cache = {}
+voice_library = {}
+# Default/Built-in voices.
+# Maps voice_id -> metadata. Each entry repeats its own key under "voice_id"
+# and points at a remote reference clip via "audio_url"; get_voice_audio_path()
+# downloads that clip when the voice is used. The "type": "builtin" tag is what
+# save_voice_library() filters on to keep these entries out of voices.json and
+# what delete_voice() checks before refusing a deletion.
+BUILTIN_VOICES = {
+    "female_default": {
+        "voice_id": "female_default",
+        "name": "Female Default",
+        "description": "Professional female voice",
+        "audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
+        "type": "builtin",
+        "created_at": "2024-01-01T00:00:00Z"
+    },
+    "male_professional": {
+        "voice_id": "male_professional",
+        "name": "Male Professional",
+        "description": "Confident male voice",
+        "audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_professional.flac",
+        "type": "builtin",
+        "created_at": "2024-01-01T00:00:00Z"
+    }
+}
+def load_voice_library():
+    """Load saved custom voices from disk"""
+    global voice_library
+    voice_library = BUILTIN_VOICES.copy()
+    voices_json_path = os.path.join(VOICES_DIR, "voices.json")
+    if os.path.exists(voices_json_path):
+        try:
+            with open(voices_json_path, 'r', encoding='utf-8') as f:
+                custom_voices = json.load(f)
+                voice_library.update(custom_voices)
+            logger.info(f"✅ Loaded {len(custom_voices)} custom voices from disk")
+        except Exception as e:
+            logger.error(f"❌ Error loading voice library: {e}")
+def save_voice_library():
+    """Save custom voices to disk"""
+    try:
+        # Only save custom voices (not builtin)
+        custom_voices = {k: v for k, v in voice_library.items() if v.get("type") != "builtin"}
+        voices_json_path = os.path.join(VOICES_DIR, "voices.json")
+        with open(voices_json_path, 'w', encoding='utf-8') as f:
+            json.dump(custom_voices, f, ensure_ascii=False, indent=2)
+        logger.info(f"✅ Saved {len(custom_voices)} custom voices to disk")
+    except Exception as e:
+        logger.error(f"❌ Error saving voice library: {e}")
+def create_voice_from_audio(audio_file, voice_name, voice_description="Custom voice"):
+    """Create a new voice from uploaded audio"""
+    try:
+        voice_id = f"voice_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+        # Save audio file
+        audio_filename = f"{voice_id}.wav"
+        audio_path = os.path.join(VOICES_DIR, audio_filename)
+        # Convert and save audio
+        if isinstance(audio_file, tuple):
+            # Gradio audio format (sample_rate, audio_data)
+            sample_rate, audio_data = audio_file
+            sf.write(audio_path, audio_data, sample_rate)
+        else:
+            # File upload
+            sf.write(audio_path, audio_file, 22050)  # Default sample rate
+        # Create voice entry
+        voice_entry = {
+            "voice_id": voice_id,
+            "name": voice_name,
+            "description": voice_description,
+            "audio_path": audio_path,
+            "type": "custom",
+            "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ")
+        }
+        # Add to voice library
+        voice_library[voice_id] = voice_entry
+        save_voice_library()
+        logger.info(f"✅ Created voice: {voice_name} ({voice_id})")
+        return voice_id, voice_entry
+    except Exception as e:
+        logger.error(f"❌ Error creating voice: {e}")
+        return None, None
+def download_audio_from_url(url):
+    """Download audio from URL and save to temporary file"""
+    try:
+        logger.info(f"📥 Downloading reference audio from: {url}")
+        response = requests.get(url, timeout=30, headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+        if response.status_code == 200:
+            # Create temporary file
+            temp_file = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+            temp_file.write(response.content)
+            temp_file.close()
+            logger.info(f"✅ Audio downloaded to: {temp_file.name}")
+            return temp_file.name
+        else:
+            logger.error(f"❌ HTTP {response.status_code} when downloading audio")
+            return None
+    except Exception as e:
+        logger.error(f"❌ Error downloading audio from URL: {e}")
+        return None
+def get_voice_audio_path(voice_id):
+    """Get the audio path for a voice (download if URL, return path if local)"""
+    if voice_id not in voice_library:
+        return None
+    voice_info = voice_library[voice_id]
+    # If it's a custom voice with local file
+    if voice_info.get("type") == "custom" and "audio_path" in voice_info:
+        audio_path = voice_info["audio_path"]
+        if os.path.exists(audio_path):
+            return audio_path
+        else:
+            logger.warning(f"⚠️ Voice audio file not found: {audio_path}")
+            return None
+    # If it's a builtin voice with URL
+    elif voice_info.get("type") == "builtin" and "audio_url" in voice_info:
+        return download_audio_from_url(voice_info["audio_url"])
+    return None
 def load_chatterbox_model():
     """Try multiple ways to load ChatterboxTTS from Resemble AI"""
     except Exception as e:
         logger.warning(f"Method 3 failed with error: {e}")
     # If we get here, the GitHub repo might have a different structure
     logger.error("❌ Could not load ChatterboxTTS from Resemble AI repository")
     logger.error("💡 The GitHub repo might have a different structure than expected")
     return False
 def get_or_load_model():
     """Load ChatterboxTTS model if not already loaded"""
     global MODEL
             logger.info("✅ ChatterboxTTS model loaded successfully")
         else:
             logger.error("❌ Failed to load ChatterboxTTS - using fallback")
             create_fallback_model()
     return MODEL
     """Generate unique ID"""
     return str(uuid.uuid4())
+# Load voice library at startup
+load_voice_library()
 # Pydantic models for API
 class TTSRequest(BaseModel):
+    # Request body of POST /api/tts/synthesize.
+    # Text to speak; synthesize_speech rejects more than 500 characters.
     text: str
+    # Key into voice_library; synthesize_speech answers 404 when it is unknown.
+    voice_id: Optional[str] = "female_default"
+    # The remaining fields are forwarded unchanged to generate_tts_audio().
     exaggeration: Optional[float] = 0.5
     temperature: Optional[float] = 0.8
     cfg_weight: Optional[float] = 0.5
+    # NOTE(review): 0 presumably means "do not fix the seed" - confirm in generate_tts_audio.
     seed: Optional[int] = 0
+class VoiceCreateRequest(BaseModel):
+    # Name and description for a new custom voice.
+    # NOTE(review): no endpoint visible in this diff uses this model;
+    # create_voice_api takes voice_name / voice_description as separate
+    # parameters. Confirm whether it is still needed.
+    voice_name: str
+    voice_description: Optional[str] = "Custom voice"
+class VoiceInfo(BaseModel):
+    # Public view of one voice_library entry, as returned by GET /api/voices.
+    # It carries only the descriptive fields; the entry's audio_path / audio_url
+    # is not part of this model.
+    voice_id: str
+    name: str
+    description: str
+    # "builtin" or "custom" (the two values get_voices counts).
+    type: str
+    created_at: str
 class TTSResponse(BaseModel):
     success: bool
     audio_id: Optional[str] = None
 @spaces.GPU
 def generate_tts_audio(
     text_input: str,
+    voice_id: str,
     exaggeration_input: float,
     temperature_input: float,
     seed_num_input: int,
     cfgw_input: float
 ) -> tuple[int, np.ndarray]:
     """
+    Generate TTS audio using ChatterboxTTS model with voice ID
     """
     current_model = get_or_load_model()
         set_seed(int(seed_num_input))
     logger.info(f"🎵 Generating audio for: '{text_input[:50]}...'")
+    logger.info(f"🎭 Using voice: {voice_id}")
     if not CHATTERBOX_AVAILABLE:
         logger.warning("🚨 USING FALLBACK - Real ChatterboxTTS not found!")
+    # Get audio path for the voice
+    audio_prompt_path = get_voice_audio_path(voice_id)
     temp_audio_file = None
     try:
+        if audio_prompt_path and audio_prompt_path.startswith('/tmp/'):
+            # It's a temporary file from URL download
+            temp_audio_file = audio_prompt_path
+        if audio_prompt_path:
+            voice_name = voice_library.get(voice_id, {}).get("name", voice_id)
+            logger.info(f"✅ Using voice '{voice_name}' audio: {audio_prompt_path}")
+        else:
+            logger.warning(f"⚠️ Could not load audio for voice {voice_id}, using default")
         # Generate audio
         wav = current_model.generate(
         logger.error(f"❌ Audio generation failed: {e}")
         raise
     finally:
+        # Clean up temporary file (only if it's a downloaded URL)
+        if temp_audio_file and temp_audio_file.startswith('/tmp/') and os.path.exists(temp_audio_file):
             try:
                 os.unlink(temp_audio_file)
                 logger.info(f"🗑️ Cleaned up temporary file: {temp_audio_file}")
 # FastAPI app for API endpoints
 app = FastAPI(
+    title="ChatterboxTTS Voice Manager API",
+    description="Advanced text-to-speech with voice cloning and management",
+    version="2.0.0"
 )
 app.add_middleware(
 async def root():
     """API status endpoint"""
     return {
+        "service": "ChatterboxTTS Voice Manager API",
+        "version": "2.0.0",
         "status": "operational" if MODEL else "model_loading",
         "model_loaded": MODEL is not None,
         "real_chatterbox": CHATTERBOX_AVAILABLE,
         "device": DEVICE,
+        "voices_available": len(voice_library),
         "message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
         "endpoints": {
             "synthesize": "/api/tts/synthesize",
+            "voices": "/api/voices",
+            "create_voice": "/api/voices/create",
             "audio": "/api/audio/{audio_id}",
             "health": "/health"
         }
         "model_loaded": MODEL is not None,
         "real_chatterbox": CHATTERBOX_AVAILABLE,
         "device": DEVICE,
+        "voices_total": len(voice_library),
         "timestamp": time.time(),
         "warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
     }
+@app.get("/api/voices")
+async def get_voices():
+    """Get all available voices"""
+    voices = []
+    for voice_id, voice_info in voice_library.items():
+        voices.append(VoiceInfo(
+            voice_id=voice_id,
+            name=voice_info["name"],
+            description=voice_info["description"],
+            type=voice_info["type"],
+            created_at=voice_info["created_at"]
+        ))
+    return {
+        "voices": voices,
+        "total": len(voices),
+        "builtin": len([v for v in voices if v.type == "builtin"]),
+        "custom": len([v for v in voices if v.type == "custom"])
+    }
+@app.post("/api/voices/create")
+async def create_voice_api(
+    voice_name: str,
+    voice_description: str = "Custom voice",
+    audio_file: UploadFile = File(...)
+):
+    """Create a new voice from uploaded audio"""
+    try:
+        # Read uploaded file
+        audio_data = await audio_file.read()
+        # Save to temporary file for processing
+        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        temp_file.write(audio_data)
+        temp_file.close()
+        # Create voice
+        voice_id, voice_entry = create_voice_from_audio(
+            temp_file.name,
+            voice_name,
+            voice_description
+        )
+        # Cleanup temp file
+        os.unlink(temp_file.name)
+        if voice_id:
+            return {
+                "success": True,
+                "voice_id": voice_id,
+                "message": f"Voice '{voice_name}' created successfully",
+                "voice_info": voice_entry
+            }
+        else:
+            raise HTTPException(status_code=500, detail="Failed to create voice")
+    except Exception as e:
+        logger.error(f"❌ Voice creation failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Voice creation failed: {str(e)}")
+@app.delete("/api/voices/{voice_id}")
+async def delete_voice(voice_id: str):
+    """Remove a custom voice from the library and delete its audio sample.
+    Responds with 404 when the id is unknown, 400 when the voice is typed
+    "builtin", and 500 when removing the file or saving the library fails.
+    """
+    if voice_id not in voice_library:
+        raise HTTPException(status_code=404, detail="Voice not found")
+    entry = voice_library[voice_id]
+    if entry.get("type") == "builtin":
+        raise HTTPException(status_code=400, detail="Cannot delete builtin voices")
+    try:
+        # Remove the stored sample first, if the entry points at one that exists.
+        if "audio_path" in entry:
+            sample_path = entry["audio_path"]
+            if os.path.exists(sample_path):
+                os.unlink(sample_path)
+        # Capture the name before the entry is dropped, then persist the library.
+        removed_name = entry["name"]
+        voice_library.pop(voice_id)
+        save_voice_library()
+        confirmation = f"Voice '{removed_name}' deleted successfully"
+        return {"success": True, "message": confirmation}
+    except Exception as e:
+        logger.error(f"❌ Voice deletion failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Voice deletion failed: {str(e)}")
 @app.post("/api/tts/synthesize", response_model=TTSResponse)
 async def synthesize_speech(request: TTSRequest):
     """
+    Synthesize speech from text using voice ID
     """
     try:
         if MODEL is None:
         if len(request.text) > 500:
             raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
+        if request.voice_id not in voice_library:
+            raise HTTPException(status_code=404, detail=f"Voice '{request.voice_id}' not found")
         start_time = time.time()
+        # Generate audio using voice ID
+        sample_rate, audio_data = generate_tts_audio(
+            request.text,
+            request.voice_id,
+            request.exaggeration,
+            request.temperature,
+            request.seed,
+            request.cfg_weight
+        )
+        generation_time = time.time() - start_time
+        # Save audio file
+        audio_id = generate_id()
+        audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
+        sf.write(audio_path, audio_data, sample_rate)
+        # Cache audio info
+        voice_name = voice_library[request.voice_id]["name"]
+        audio_cache[audio_id] = {
+            "path": audio_path,
+            "text": request.text,
+            "voice_id": request.voice_id,
+            "voice_name": voice_name,
+            "sample_rate": sample_rate,
+            "duration": len(audio_data) / sample_rate,
+            "generated_at": time.time(),
+            "generation_time": generation_time,
+            "real_chatterbox": CHATTERBOX_AVAILABLE
+        }
+        message = f"Speech synthesized successfully using voice '{voice_name}'"
+        if not CHATTERBOX_AVAILABLE:
+            message += " (using fallback - upload ChatterboxTTS for real synthesis)"
+        logger.info(f"✅ Audio saved: {audio_id} ({generation_time:.2f}s) with voice '{voice_name}'")
+        return TTSResponse(
+            success=True,
+            audio_id=audio_id,
+            message=message,
+            sample_rate=sample_rate,
+            duration=len(audio_data) / sample_rate
+        )
     except HTTPException:
         raise
 @app.get("/api/audio/{audio_id}")
 async def get_audio(audio_id: str):
+    """Download generated audio file"""
     if audio_id not in audio_cache:
         raise HTTPException(status_code=404, detail="Audio not found")
 @app.get("/api/audio/{audio_id}/info")
 async def get_audio_info(audio_id: str):
+    """Get audio file information"""
     if audio_id not in audio_cache:
         raise HTTPException(status_code=404, detail="Audio not found")
 @app.get("/api/audio")
 async def list_audio():
+    """List all generated audio files"""
     return {
         "audio_files": [
             {
                 "audio_id": audio_id,
                 "text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
+                "voice_name": info.get("voice_name", "Unknown"),
                 "duration": info["duration"],
                 "generated_at": info["generated_at"],
                 "real_chatterbox": info.get("real_chatterbox", False)
 # Gradio interface
 def create_gradio_interface():
+    """Create Gradio interface with voice management"""
+    def get_voice_choices():
+        """Build the (label, voice_id) pairs offered by the voice dropdown."""
+        def label_for(meta):
+            # Builtin voices get the wrench icon, everything else the mask icon.
+            icon = "🎭" if meta["type"] != "builtin" else "🔧"
+            return f"{icon} {meta['name']} - {meta['description']}"
+        return [(label_for(meta), vid) for vid, meta in voice_library.items()]
+    def refresh_voice_choices():
+        """Return a Gradio update carrying the library's current voice choices."""
+        current_choices = get_voice_choices()
+        return gr.update(choices=current_choices)
+    def create_voice_ui(voice_name, voice_description, audio_file):
+        """Handle the "Create Voice" button.
+        Returns a (status text, dropdown update) pair. On success the
+        dropdown is refreshed and the new voice is selected; on any failure
+        the dropdown is left untouched.
+        """
+        try:
+            cleaned_name = voice_name.strip()
+            if not cleaned_name:
+                return "❌ Please enter a voice name", gr.update()
+            if audio_file is None:
+                return "❌ Please upload an audio file", gr.update()
+            # Fall back to a generic description when none was typed in.
+            cleaned_description = voice_description.strip() or "Custom voice"
+            voice_id, _ = create_voice_from_audio(
+                audio_file, cleaned_name, cleaned_description
+            )
+            if not voice_id:
+                return "❌ Failed to create voice", gr.update()
+            report = (
+                f"✅ Voice '{voice_name}' created successfully!\n"
+                f"🆔 Voice ID: {voice_id}\n"
+                f"📁 Audio saved and ready to use\n"
+                f"🎭 Available in voice selection dropdown"
+            )
+            dropdown_update = gr.update(choices=get_voice_choices(), value=voice_id)
+            return report, dropdown_update
+        except Exception as e:
+            logger.error(f"UI voice creation failed: {e}")
+            return f"❌ Voice creation failed: {str(e)}", gr.update()
+    def generate_speech_ui(text, voice_id, exag, temp, seed_val, cfg):
+        """Handle the "Generate Speech" button.
+        Validates the text and the selected voice, calls
+        ``generate_tts_audio`` and returns ((sample_rate, audio), status).
+        On validation or generation failure the audio slot is None and the
+        status text carries the error.
+        """
+        try:
+            if not text.strip():
+                return None, "❌ Please enter some text"
+            if len(text) > 300:
+                return None, "❌ Text too long (max 300 characters)"
+            if not (voice_id and voice_id in voice_library):
+                return None, "❌ Please select a valid voice"
+            # Time only the synthesis call itself.
+            started = time.time()
+            sample_rate, audio_data = generate_tts_audio(
+                text, voice_id, exag, temp, int(seed_val), cfg
+            )
+            elapsed = time.time() - started
+            clip_seconds = len(audio_data) / sample_rate
+            voice_name = voice_library[voice_id]["name"]
+            voice_type = voice_library[voice_id]["type"]
+            if not CHATTERBOX_AVAILABLE:
+                status = f"""⚠️ Fallback audio generated (beep sound)
+🚨 This is NOT real speech synthesis!
+🎭 Voice: {voice_name} ({voice_type})
+📦 Upload ChatterboxTTS package for real synthesis
+⏱️ Generation time: {elapsed:.2f}s
+🎵 Audio duration: {clip_seconds:.2f}s
+💡 To fix: Upload your ChatterboxTTS files to this Space
+                """
+            else:
+                status = f"""✅ Real ChatterboxTTS synthesis completed!
+🎭 Voice: {voice_name} ({voice_type})
+⏱️ Generation time: {elapsed:.2f}s
+🎵 Audio duration: {clip_seconds:.2f}s
+📊 Sample rate: {sample_rate} Hz
+🔊 Audio samples: {len(audio_data):,}
+                """
+            audio_payload = (sample_rate, audio_data)
+            return audio_payload, status
+        except Exception as e:
+            logger.error(f"UI generation failed: {e}")
+            return None, f"❌ Generation failed: {str(e)}"
+    def delete_voice_ui(voice_id):
+        """Handle the "Delete Selected Voice" button.
+        Returns a (status text, dropdown update) pair. Builtin voices are
+        refused; after a successful delete the dropdown is refreshed and
+        falls back to the first remaining voice, or None when none are left.
+        """
+        try:
+            if not (voice_id and voice_id in voice_library):
+                return "❌ Please select a voice to delete", gr.update()
+            entry = voice_library[voice_id]
+            if entry.get("type") == "builtin":
+                return "❌ Cannot delete builtin voices", gr.update()
+            removed_name = entry["name"]
+            # Remove the stored sample, if the entry points at one that exists.
+            if "audio_path" in entry:
+                sample_path = entry["audio_path"]
+                if os.path.exists(sample_path):
+                    os.unlink(sample_path)
+            # Drop the entry and persist the library.
+            voice_library.pop(voice_id)
+            save_voice_library()
+            remaining = get_voice_choices()
+            if remaining:
+                fallback_value = remaining[0][1]
+            else:
+                fallback_value = None
+            confirmation = f"✅ Voice '{removed_name}' deleted successfully"
+            return confirmation, gr.update(choices=remaining, value=fallback_value)
+        except Exception as e:
+            logger.error(f"UI voice deletion failed: {e}")
+            return f"❌ Voice deletion failed: {str(e)}", gr.update()
+    with gr.Blocks(title="ChatterboxTTS Voice Manager", theme=gr.themes.Soft()) as demo:
         # Status indicator at the top
         if CHATTERBOX_AVAILABLE:
         """)
         gr.Markdown("""
+        # 🎵 ChatterboxTTS Voice Manager
+        **Advanced text-to-speech with custom voice cloning and voice library management**
         """)
+        with gr.Tabs():
+            # Text-to-Speech Tab
+            with gr.TabItem("🎵 Generate Speech"):
+                with gr.Row():
+                    with gr.Column():
+                        text_input = gr.Textbox(
+                            value="Hello, this is ChatterboxTTS with custom voice cloning. I can speak in any voice you train me with!",
+                            label="Text to synthesize (max 300 characters)",
+                            max_lines=5,
+                            placeholder="Enter your text here..."
+                        )
+                        voice_selector = gr.Dropdown(
+                            label="🎭 Select Voice",
+                            choices=get_voice_choices(),
+                            value=list(voice_library.keys())[0] if voice_library else None,
+                            interactive=True,
+                            info="Choose from builtin voices (🔧) or your custom voices (🎭)"
+                        )
+                        with gr.Row():
+                            generate_btn = gr.Button("🎵 Generate Speech", variant="primary")
+                            refresh_voices_btn = gr.Button("🔄 Refresh Voices", size="sm")
+                        with gr.Row():
+                            exaggeration = gr.Slider(
+                                0.25, 2,
+                                step=0.05,
+                                label="Exaggeration",
+                                value=0.5,
+                                info="Controls expressiveness (0.5 = neutral)"
+                            )
+                            cfg_weight = gr.Slider(
+                                0.2, 1,
+                                step=0.05,
+                                label="CFG Weight",
+                                value=0.5,
+                                info="Controls pace and clarity"
+                            )
+                        with gr.Accordion("Advanced Settings", open=False):
+                            temperature = gr.Slider(
+                                0.05, 5,
+                                step=0.05,
+                                label="Temperature",
+                                value=0.8,
+                                info="Controls randomness"
+                            )
+                            seed = gr.Number(
+                                value=0,
+                                label="Seed (0 = random)",
+                                info="Set to non-zero for reproducible results"
+                            )
+                    with gr.Column():
+                        audio_output = gr.Audio(label="🔊 Generated Speech")
+                        status_text = gr.Textbox(
+                            label="📊 Generation Status",
+                            interactive=False,
+                            lines=8,
+                            placeholder="Select a voice and click 'Generate Speech' to start..."
+                        )
+            # Voice Management Tab
+            with gr.TabItem("🎭 Voice Library"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("### 📚 Available Voices")
+                        voices_display = gr.HTML(
+                            value=f"""
+                            <div style="max-height: 300px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
+                                {''.join([f"<p><strong>{voice_info['name']}</strong> ({voice_info['type']})<br><small>{voice_info['description']}</small></p>" for voice_info in voice_library.values()])}
+                            </div>
+                            """
+                        )
+                        gr.Markdown("### 🗑️ Delete Voice")
+                        delete_voice_selector = gr.Dropdown(
+                            label="Select voice to delete",
+                            choices=[(f"{info['name']} ({info['type']})", vid) for vid, info in voice_library.items() if info['type'] == 'custom'],
+                            value=None
+                        )
+                        delete_voice_btn = gr.Button("🗑️ Delete Selected Voice", variant="stop")
+                        delete_status = gr.Textbox(label="Delete Status", interactive=False)
+                    with gr.Column():
+                        gr.Markdown("### ➕ Create New Voice")
+                        new_voice_name = gr.Textbox(
+                            label="Voice Name",
+                            placeholder="e.g., 'John's Voice', 'Narrator Voice'",
+                            value=""
+                        )
+                        new_voice_description = gr.Textbox(
+                            label="Voice Description",
+                            placeholder="e.g., 'Professional male voice', 'Warm female narrator'",
+                            value=""
+                        )
+                        new_voice_audio = gr.Audio(
+                            label="Upload Voice Sample",
+                            type="numpy",
+                            info="Upload 5-30 seconds of clear speech"
+                        )
+                        create_voice_btn = gr.Button("🎯 Create Voice", variant="primary")
+                        create_status = gr.Textbox(
+                            label="📊 Creation Status",
+                            interactive=False,
+                            lines=6
+                        )
+            # Voice Library Info Tab
+            with gr.TabItem("📋 Voice Guide"):
+                gr.Markdown(f"""
+                ## 🎭 Voice Library Management
+                ### 📚 Current Library Status
+                - **Total Voices**: {len(voice_library)}
+                - **Builtin Voices**: {len([v for v in voice_library.values() if v['type'] == 'builtin'])}
+                - **Custom Voices**: {len([v for v in voice_library.values() if v['type'] == 'custom'])}
+                ### 🔧 Builtin Voices
+                These are pre-configured voices that come with the system:
+                {chr(10).join([f"- **{voice_info['name']}**: {voice_info['description']}" for voice_info in voice_library.values() if voice_info['type'] == 'builtin'])}
+                ### 🎯 Creating Custom Voices
+                #### 📝 Best Practices:
+                1. **Audio Quality**: Use clear, noise-free recordings
+                2. **Duration**: 5-30 seconds of natural speech
+                3. **Content**: Normal conversational speech works best
+                4. **Format**: WAV, MP3, or FLAC files supported
+                5. **Voice Consistency**: Use the same speaker throughout
+                #### 🎤 Recording Tips:
+                - Record in a quiet environment
+                - Speak naturally and clearly
+                - Avoid background noise
+                - Use a decent microphone if possible
+                - Read a paragraph of normal text
+                #### 🔄 Voice Management:
+                - **Create**: Upload audio + provide name and description
+                - **Use**: Select from dropdown in speech generation
+                - **Delete**: Remove custom voices you no longer need
+                - **Persistent**: Custom voices are saved permanently
+                ### 🚀 Usage Workflow:
+                1. **Upload Voice Sample** → Create custom voice
+                2. **Select Voice** → Choose from library
+                3. **Generate Speech** → Use selected voice for TTS
+                4. **Manage Library** → Add, delete, organize voices
+                ### 🔄 API Integration:
+                ```python
+                # List voices
+                GET /api/voices
+                # Create voice
+                POST /api/voices/create
+                # Generate speech with voice
+                POST /api/tts/synthesize
+                {{
+                    "text": "Hello world",
+                    "voice_id": "your_voice_id"
+                }}
+                # Delete voice
+                DELETE /api/voices/voice_id
+                ```
+                ### 💡 Pro Tips:
+                - **Voice Naming**: Use descriptive names like "John_Professional" or "Sarah_Narrator"
+                - **Voice Testing**: Generate short test phrases after creating voices
+                - **Voice Backup**: Custom voices are saved to disk automatically
+                - **Voice Sharing**: Voice IDs can be shared via API
+                """)
+        # Event handlers
         generate_btn.click(
             fn=generate_speech_ui,
+            inputs=[text_input, voice_selector, exaggeration, temperature, seed, cfg_weight],
             outputs=[audio_output, status_text]
         )
+        refresh_voices_btn.click(
+            fn=refresh_voice_choices,
+            outputs=[voice_selector]
+        )
+        create_voice_btn.click(
+            fn=create_voice_ui,
+            inputs=[new_voice_name, new_voice_description, new_voice_audio],
+            outputs=[create_status, voice_selector]
+        )
+        delete_voice_btn.click(
+            fn=delete_voice_ui,
+            inputs=[delete_voice_selector],
+            outputs=[delete_status, voice_selector]
+        )
+        # System info with voice library status
         model_status = "✅ Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "⚠️ Fallback Model (Beep Sounds)"
         chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
         ### 📊 System Status
         - **Model**: {model_status}
         - **Device**: {DEVICE}
         - **ChatterboxTTS**: {chatterbox_status}
+        - **Voice Library**: {len(voice_library)} voices loaded
+        - **Generated Files**: {len(audio_cache)}
+        - **Storage**: `{VOICES_DIR}/` for voices, `{AUDIO_DIR}/` for output
         {'''### 🎉 Production Ready!
+        Your ChatterboxTTS model is loaded with voice management system.''' if CHATTERBOX_AVAILABLE else '''### ⚠️ Action Required
         **You're hearing beep sounds because ChatterboxTTS isn't loaded.**
+        Voice management is working, but you need ChatterboxTTS for real synthesis.'''}
         """)
     return demo
 # Main execution
 if __name__ == "__main__":
+    logger.info("🎉 Starting ChatterboxTTS Voice Management Service...")
     # Model status
     if CHATTERBOX_AVAILABLE and MODEL:
     logger.info(f"Model Status: {model_status}")
     logger.info(f"Device: {DEVICE}")
     logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
+    logger.info(f"Voice Library: {len(voice_library)} voices loaded")
+    logger.info(f"Custom Voices: {len([v for v in voice_library.values() if v['type'] == 'custom'])}")
     if not CHATTERBOX_AVAILABLE:
         logger.warning("🚨 IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
     if os.getenv("SPACE_ID"):
         # Running in Hugging Face Spaces
         logger.info("🌐 FastAPI: http://localhost:8000")
         logger.info("📚 API Docs: http://localhost:8000/docs")
+        logger.info("🔗 API Endpoints:")
+        logger.info("   - GET  /api/voices")
+        logger.info("   - POST /api/voices/create")
+        logger.info("   - DELETE /api/voices/{voice_id}")
+        logger.info("   - POST /api/tts/synthesize")
         # Start Gradio
         demo = create_gradio_interface()