Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,12 +9,13 @@ import uuid
|
|
| 9 |
import logging
|
| 10 |
import requests
|
| 11 |
import io
|
| 12 |
-
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
|
| 15 |
import gradio as gr
|
| 16 |
import spaces
|
| 17 |
-
from fastapi import FastAPI, HTTPException
|
| 18 |
from fastapi.responses import StreamingResponse
|
| 19 |
from fastapi.middleware.cors import CORSMiddleware
|
| 20 |
from pydantic import BaseModel
|
|
@@ -31,10 +32,148 @@ logger.info(f"π Running on device: {DEVICE}")
|
|
| 31 |
MODEL = None
|
| 32 |
CHATTERBOX_AVAILABLE = False
|
| 33 |
|
| 34 |
-
# Storage
|
| 35 |
AUDIO_DIR = "generated_audio"
|
|
|
|
| 36 |
os.makedirs(AUDIO_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
| 37 |
audio_cache = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def load_chatterbox_model():
|
| 40 |
"""Try multiple ways to load ChatterboxTTS from Resemble AI"""
|
|
@@ -81,52 +220,6 @@ def load_chatterbox_model():
|
|
| 81 |
except Exception as e:
|
| 82 |
logger.warning(f"Method 3 failed with error: {e}")
|
| 83 |
|
| 84 |
-
# Method 4: Try exploring the installed package
|
| 85 |
-
try:
|
| 86 |
-
import chatterbox
|
| 87 |
-
import inspect
|
| 88 |
-
|
| 89 |
-
# Log what's available in the chatterbox package
|
| 90 |
-
logger.info(f"Chatterbox module path: {chatterbox.__file__}")
|
| 91 |
-
logger.info(f"Chatterbox contents: {dir(chatterbox)}")
|
| 92 |
-
|
| 93 |
-
# Try to find ChatterboxTTS class anywhere in the module
|
| 94 |
-
for name, obj in inspect.getmembers(chatterbox):
|
| 95 |
-
if name == 'ChatterboxTTS' or (inspect.isclass(obj) and 'TTS' in name):
|
| 96 |
-
logger.info(f"Found potential TTS class: {name}")
|
| 97 |
-
MODEL = obj.from_pretrained(DEVICE)
|
| 98 |
-
CHATTERBOX_AVAILABLE = True
|
| 99 |
-
return True
|
| 100 |
-
|
| 101 |
-
raise ImportError("ChatterboxTTS class not found in chatterbox package")
|
| 102 |
-
|
| 103 |
-
except ImportError as e:
|
| 104 |
-
logger.warning(f"Method 4 failed: {e}")
|
| 105 |
-
except Exception as e:
|
| 106 |
-
logger.warning(f"Method 4 failed with error: {e}")
|
| 107 |
-
|
| 108 |
-
# Method 5: Check if the GitHub repo was installed correctly
|
| 109 |
-
try:
|
| 110 |
-
import pkg_resources
|
| 111 |
-
try:
|
| 112 |
-
pkg_resources.get_distribution('chatterbox')
|
| 113 |
-
logger.info("β
Chatterbox package is installed")
|
| 114 |
-
except pkg_resources.DistributionNotFound:
|
| 115 |
-
logger.warning("β Chatterbox package not found in installed packages")
|
| 116 |
-
|
| 117 |
-
# Try to import and inspect what we got
|
| 118 |
-
import chatterbox
|
| 119 |
-
chatterbox_path = chatterbox.__path__[0] if hasattr(chatterbox, '__path__') else str(chatterbox.__file__)
|
| 120 |
-
logger.info(f"Chatterbox installed at: {chatterbox_path}")
|
| 121 |
-
|
| 122 |
-
# List all available modules/classes
|
| 123 |
-
import pkgutil
|
| 124 |
-
for importer, modname, ispkg in pkgutil.walk_packages(chatterbox.__path__, chatterbox.__name__ + "."):
|
| 125 |
-
logger.info(f"Available module: {modname}")
|
| 126 |
-
|
| 127 |
-
except Exception as e:
|
| 128 |
-
logger.warning(f"Package inspection failed: {e}")
|
| 129 |
-
|
| 130 |
# If we get here, the GitHub repo might have a different structure
|
| 131 |
logger.error("β Could not load ChatterboxTTS from Resemble AI repository")
|
| 132 |
logger.error("π‘ The GitHub repo might have a different structure than expected")
|
|
@@ -135,30 +228,6 @@ def load_chatterbox_model():
|
|
| 135 |
|
| 136 |
return False
|
| 137 |
|
| 138 |
-
def download_audio_from_url(url):
|
| 139 |
-
"""Download audio from URL and save to temporary file"""
|
| 140 |
-
try:
|
| 141 |
-
logger.info(f"π₯ Downloading reference audio from: {url}")
|
| 142 |
-
response = requests.get(url, timeout=30, headers={
|
| 143 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
| 144 |
-
})
|
| 145 |
-
|
| 146 |
-
if response.status_code == 200:
|
| 147 |
-
# Create temporary file
|
| 148 |
-
temp_file = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 149 |
-
temp_file.write(response.content)
|
| 150 |
-
temp_file.close()
|
| 151 |
-
|
| 152 |
-
logger.info(f"β
Audio downloaded to: {temp_file.name}")
|
| 153 |
-
return temp_file.name
|
| 154 |
-
else:
|
| 155 |
-
logger.error(f"β HTTP {response.status_code} when downloading audio")
|
| 156 |
-
return None
|
| 157 |
-
|
| 158 |
-
except Exception as e:
|
| 159 |
-
logger.error(f"β Error downloading audio from URL: {e}")
|
| 160 |
-
return None
|
| 161 |
-
|
| 162 |
def get_or_load_model():
|
| 163 |
"""Load ChatterboxTTS model if not already loaded"""
|
| 164 |
global MODEL
|
|
@@ -171,7 +240,6 @@ def get_or_load_model():
|
|
| 171 |
logger.info("β
ChatterboxTTS model loaded successfully")
|
| 172 |
else:
|
| 173 |
logger.error("β Failed to load ChatterboxTTS - using fallback")
|
| 174 |
-
# Create a better fallback that shows the issue
|
| 175 |
create_fallback_model()
|
| 176 |
return MODEL
|
| 177 |
|
|
@@ -230,15 +298,29 @@ def generate_id():
|
|
| 230 |
"""Generate unique ID"""
|
| 231 |
return str(uuid.uuid4())
|
| 232 |
|
|
|
|
|
|
|
|
|
|
| 233 |
# Pydantic models for API
|
| 234 |
class TTSRequest(BaseModel):
|
| 235 |
text: str
|
| 236 |
-
|
| 237 |
exaggeration: Optional[float] = 0.5
|
| 238 |
temperature: Optional[float] = 0.8
|
| 239 |
cfg_weight: Optional[float] = 0.5
|
| 240 |
seed: Optional[int] = 0
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
class TTSResponse(BaseModel):
|
| 243 |
success: bool
|
| 244 |
audio_id: Optional[str] = None
|
|
@@ -260,14 +342,14 @@ except Exception as e:
|
|
| 260 |
@spaces.GPU
|
| 261 |
def generate_tts_audio(
|
| 262 |
text_input: str,
|
| 263 |
-
|
| 264 |
exaggeration_input: float,
|
| 265 |
temperature_input: float,
|
| 266 |
seed_num_input: int,
|
| 267 |
cfgw_input: float
|
| 268 |
) -> tuple[int, np.ndarray]:
|
| 269 |
"""
|
| 270 |
-
Generate TTS audio using ChatterboxTTS model
|
| 271 |
"""
|
| 272 |
current_model = get_or_load_model()
|
| 273 |
|
|
@@ -278,29 +360,25 @@ def generate_tts_audio(
|
|
| 278 |
set_seed(int(seed_num_input))
|
| 279 |
|
| 280 |
logger.info(f"π΅ Generating audio for: '{text_input[:50]}...'")
|
|
|
|
| 281 |
|
| 282 |
if not CHATTERBOX_AVAILABLE:
|
| 283 |
logger.warning("π¨ USING FALLBACK - Real ChatterboxTTS not found!")
|
| 284 |
-
logger.warning("π To fix: Upload your ChatterboxTTS package to this Space")
|
| 285 |
|
| 286 |
-
#
|
| 287 |
-
audio_prompt_path =
|
| 288 |
temp_audio_file = None
|
| 289 |
|
| 290 |
try:
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
audio_prompt_path = None
|
| 301 |
-
elif audio_prompt_path_input and not os.path.exists(audio_prompt_path_input):
|
| 302 |
-
logger.warning(f"β οΈ Audio file not found: {audio_prompt_path_input}, proceeding without reference")
|
| 303 |
-
audio_prompt_path = None
|
| 304 |
|
| 305 |
# Generate audio
|
| 306 |
wav = current_model.generate(
|
|
@@ -322,8 +400,8 @@ def generate_tts_audio(
|
|
| 322 |
logger.error(f"β Audio generation failed: {e}")
|
| 323 |
raise
|
| 324 |
finally:
|
| 325 |
-
# Clean up temporary file
|
| 326 |
-
if temp_audio_file and os.path.exists(temp_audio_file):
|
| 327 |
try:
|
| 328 |
os.unlink(temp_audio_file)
|
| 329 |
logger.info(f"ποΈ Cleaned up temporary file: {temp_audio_file}")
|
|
@@ -332,9 +410,9 @@ def generate_tts_audio(
|
|
| 332 |
|
| 333 |
# FastAPI app for API endpoints
|
| 334 |
app = FastAPI(
|
| 335 |
-
title="ChatterboxTTS API",
|
| 336 |
-
description="
|
| 337 |
-
version="
|
| 338 |
)
|
| 339 |
|
| 340 |
app.add_middleware(
|
|
@@ -349,15 +427,18 @@ app.add_middleware(
|
|
| 349 |
async def root():
|
| 350 |
"""API status endpoint"""
|
| 351 |
return {
|
| 352 |
-
"service": "ChatterboxTTS API",
|
| 353 |
-
"version": "
|
| 354 |
"status": "operational" if MODEL else "model_loading",
|
| 355 |
"model_loaded": MODEL is not None,
|
| 356 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
| 357 |
"device": DEVICE,
|
|
|
|
| 358 |
"message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
|
| 359 |
"endpoints": {
|
| 360 |
"synthesize": "/api/tts/synthesize",
|
|
|
|
|
|
|
| 361 |
"audio": "/api/audio/{audio_id}",
|
| 362 |
"health": "/health"
|
| 363 |
}
|
|
@@ -371,14 +452,105 @@ async def health_check():
|
|
| 371 |
"model_loaded": MODEL is not None,
|
| 372 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
| 373 |
"device": DEVICE,
|
|
|
|
| 374 |
"timestamp": time.time(),
|
| 375 |
"warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
|
| 376 |
}
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
@app.post("/api/tts/synthesize", response_model=TTSResponse)
|
| 379 |
async def synthesize_speech(request: TTSRequest):
|
| 380 |
"""
|
| 381 |
-
Synthesize speech from text
|
| 382 |
"""
|
| 383 |
try:
|
| 384 |
if MODEL is None:
|
|
@@ -390,70 +562,55 @@ async def synthesize_speech(request: TTSRequest):
|
|
| 390 |
if len(request.text) > 500:
|
| 391 |
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
| 392 |
|
|
|
|
|
|
|
|
|
|
| 393 |
start_time = time.time()
|
| 394 |
|
| 395 |
-
#
|
| 396 |
-
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
-
|
| 400 |
-
temp_audio_file = download_audio_from_url(request.audio_prompt_url)
|
| 401 |
-
if temp_audio_file:
|
| 402 |
-
audio_prompt_path = temp_audio_file
|
| 403 |
-
else:
|
| 404 |
-
logger.warning("Failed to download reference audio, proceeding without")
|
| 405 |
-
audio_prompt_path = None
|
| 406 |
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
audio_prompt_path,
|
| 412 |
-
request.exaggeration,
|
| 413 |
-
request.temperature,
|
| 414 |
-
request.seed,
|
| 415 |
-
request.cfg_weight
|
| 416 |
-
)
|
| 417 |
-
|
| 418 |
-
generation_time = time.time() - start_time
|
| 419 |
-
|
| 420 |
-
# Save audio file
|
| 421 |
-
audio_id = generate_id()
|
| 422 |
-
audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
|
| 423 |
-
sf.write(audio_path, audio_data, sample_rate)
|
| 424 |
-
|
| 425 |
-
# Cache audio info
|
| 426 |
-
audio_cache[audio_id] = {
|
| 427 |
-
"path": audio_path,
|
| 428 |
-
"text": request.text,
|
| 429 |
-
"sample_rate": sample_rate,
|
| 430 |
-
"duration": len(audio_data) / sample_rate,
|
| 431 |
-
"generated_at": time.time(),
|
| 432 |
-
"generation_time": generation_time,
|
| 433 |
-
"real_chatterbox": CHATTERBOX_AVAILABLE
|
| 434 |
-
}
|
| 435 |
-
|
| 436 |
-
message = "Speech synthesized successfully"
|
| 437 |
-
if not CHATTERBOX_AVAILABLE:
|
| 438 |
-
message += " (using fallback - upload ChatterboxTTS for real synthesis)"
|
| 439 |
-
|
| 440 |
-
logger.info(f"β
Audio saved: {audio_id} ({generation_time:.2f}s)")
|
| 441 |
-
|
| 442 |
-
return TTSResponse(
|
| 443 |
-
success=True,
|
| 444 |
-
audio_id=audio_id,
|
| 445 |
-
message=message,
|
| 446 |
-
sample_rate=sample_rate,
|
| 447 |
-
duration=len(audio_data) / sample_rate
|
| 448 |
-
)
|
| 449 |
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
except HTTPException:
|
| 459 |
raise
|
|
@@ -463,9 +620,7 @@ async def synthesize_speech(request: TTSRequest):
|
|
| 463 |
|
| 464 |
@app.get("/api/audio/{audio_id}")
|
| 465 |
async def get_audio(audio_id: str):
|
| 466 |
-
"""
|
| 467 |
-
Download generated audio file
|
| 468 |
-
"""
|
| 469 |
if audio_id not in audio_cache:
|
| 470 |
raise HTTPException(status_code=404, detail="Audio not found")
|
| 471 |
|
|
@@ -489,9 +644,7 @@ async def get_audio(audio_id: str):
|
|
| 489 |
|
| 490 |
@app.get("/api/audio/{audio_id}/info")
|
| 491 |
async def get_audio_info(audio_id: str):
|
| 492 |
-
"""
|
| 493 |
-
Get audio file information
|
| 494 |
-
"""
|
| 495 |
if audio_id not in audio_cache:
|
| 496 |
raise HTTPException(status_code=404, detail="Audio not found")
|
| 497 |
|
|
@@ -499,14 +652,13 @@ async def get_audio_info(audio_id: str):
|
|
| 499 |
|
| 500 |
@app.get("/api/audio")
|
| 501 |
async def list_audio():
|
| 502 |
-
"""
|
| 503 |
-
List all generated audio files
|
| 504 |
-
"""
|
| 505 |
return {
|
| 506 |
"audio_files": [
|
| 507 |
{
|
| 508 |
"audio_id": audio_id,
|
| 509 |
"text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
|
|
|
|
| 510 |
"duration": info["duration"],
|
| 511 |
"generated_at": info["generated_at"],
|
| 512 |
"real_chatterbox": info.get("real_chatterbox", False)
|
|
@@ -518,9 +670,135 @@ async def list_audio():
|
|
| 518 |
|
| 519 |
# Gradio interface
|
| 520 |
def create_gradio_interface():
|
| 521 |
-
"""Create Gradio interface with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
|
| 523 |
-
with gr.Blocks(title="ChatterboxTTS", theme=gr.themes.Soft()) as demo:
|
| 524 |
|
| 525 |
# Status indicator at the top
|
| 526 |
if CHATTERBOX_AVAILABLE:
|
|
@@ -537,141 +815,223 @@ def create_gradio_interface():
|
|
| 537 |
""")
|
| 538 |
|
| 539 |
gr.Markdown("""
|
| 540 |
-
# π΅ ChatterboxTTS
|
| 541 |
|
| 542 |
-
|
| 543 |
""")
|
| 544 |
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
| 560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
|
| 562 |
-
|
| 563 |
-
""
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
with gr.Column():
|
| 567 |
-
text_input = gr.Textbox(
|
| 568 |
-
value="Hello, this is ChatterboxTTS. I can generate natural-sounding speech from any text you provide.",
|
| 569 |
-
label="Text to synthesize (max 300 characters)",
|
| 570 |
-
max_lines=5,
|
| 571 |
-
placeholder="Enter your text here..."
|
| 572 |
-
)
|
| 573 |
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
info="URL will be downloaded automatically, or use local file path"
|
| 579 |
-
)
|
| 580 |
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
step=0.05,
|
| 585 |
-
label="Exaggeration",
|
| 586 |
-
value=0.5,
|
| 587 |
-
info="Controls expressiveness (0.5 = neutral)"
|
| 588 |
-
)
|
| 589 |
-
|
| 590 |
-
cfg_weight = gr.Slider(
|
| 591 |
-
0.2, 1,
|
| 592 |
-
step=0.05,
|
| 593 |
-
label="CFG Weight",
|
| 594 |
-
value=0.5,
|
| 595 |
-
info="Controls pace and clarity"
|
| 596 |
-
)
|
| 597 |
|
| 598 |
-
|
| 599 |
-
temperature = gr.Slider(
|
| 600 |
-
0.05, 5,
|
| 601 |
-
step=0.05,
|
| 602 |
-
label="Temperature",
|
| 603 |
-
value=0.8,
|
| 604 |
-
info="Controls randomness"
|
| 605 |
-
)
|
| 606 |
-
|
| 607 |
-
seed = gr.Number(
|
| 608 |
-
value=0,
|
| 609 |
-
label="Seed (0 = random)",
|
| 610 |
-
info="Set to non-zero for reproducible results"
|
| 611 |
-
)
|
| 612 |
|
| 613 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
| 615 |
-
|
| 616 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
def generate_speech_ui(text, prompt_url, exag, temp, seed_val, cfg):
|
| 625 |
-
"""Generate speech from UI"""
|
| 626 |
-
try:
|
| 627 |
-
if not text.strip():
|
| 628 |
-
return None, "β Please enter some text"
|
| 629 |
|
| 630 |
-
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
| 634 |
|
| 635 |
-
#
|
| 636 |
-
|
| 637 |
-
text, prompt_url, exag, temp, int(seed_val), cfg
|
| 638 |
-
)
|
| 639 |
|
| 640 |
-
|
| 641 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
β±οΈ Generation time: {generation_time:.2f}s
|
| 647 |
-
π΅ Audio duration: {duration:.2f}s
|
| 648 |
-
π Sample rate: {sample_rate} Hz
|
| 649 |
-
π Audio samples: {len(audio_data):,}
|
| 650 |
-
"""
|
| 651 |
-
else:
|
| 652 |
-
status = f"""β οΈ Fallback audio generated (beep sound)
|
| 653 |
-
|
| 654 |
-
π¨ This is NOT real speech synthesis!
|
| 655 |
-
π¦ Upload ChatterboxTTS package for real synthesis
|
| 656 |
-
β±οΈ Generation time: {generation_time:.2f}s
|
| 657 |
-
π΅ Audio duration: {duration:.2f}s
|
| 658 |
-
|
| 659 |
-
π‘ To fix: Upload your ChatterboxTTS files to this Space
|
| 660 |
-
"""
|
| 661 |
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
|
|
|
| 667 |
|
|
|
|
| 668 |
generate_btn.click(
|
| 669 |
fn=generate_speech_ui,
|
| 670 |
-
inputs=[text_input,
|
| 671 |
outputs=[audio_output, status_text]
|
| 672 |
)
|
| 673 |
|
| 674 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
model_status = "β
Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "β οΈ Fallback Model (Beep Sounds)"
|
| 676 |
chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
|
| 677 |
|
|
@@ -679,26 +1039,23 @@ def create_gradio_interface():
|
|
| 679 |
### π System Status
|
| 680 |
- **Model**: {model_status}
|
| 681 |
- **Device**: {DEVICE}
|
| 682 |
-
- **Generated Files**: {len(audio_cache)}
|
| 683 |
- **ChatterboxTTS**: {chatterbox_status}
|
|
|
|
|
|
|
|
|
|
| 684 |
|
| 685 |
{'''### π Production Ready!
|
| 686 |
-
Your ChatterboxTTS model is loaded
|
| 687 |
**You're hearing beep sounds because ChatterboxTTS isn't loaded.**
|
| 688 |
|
| 689 |
-
|
| 690 |
-
1. Upload your ChatterboxTTS package to this Space
|
| 691 |
-
2. Ensure proper directory structure with `__init__.py` files
|
| 692 |
-
3. Restart the Space
|
| 693 |
-
|
| 694 |
-
The current fallback generates beeps to indicate missing package.'''}
|
| 695 |
""")
|
| 696 |
|
| 697 |
return demo
|
| 698 |
|
| 699 |
# Main execution
|
| 700 |
if __name__ == "__main__":
|
| 701 |
-
logger.info("π Starting ChatterboxTTS Service...")
|
| 702 |
|
| 703 |
# Model status
|
| 704 |
if CHATTERBOX_AVAILABLE and MODEL:
|
|
@@ -711,10 +1068,11 @@ if __name__ == "__main__":
|
|
| 711 |
logger.info(f"Model Status: {model_status}")
|
| 712 |
logger.info(f"Device: {DEVICE}")
|
| 713 |
logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
|
|
|
|
|
|
|
| 714 |
|
| 715 |
if not CHATTERBOX_AVAILABLE:
|
| 716 |
logger.warning("π¨ IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
|
| 717 |
-
logger.warning("π Expected location: ./chatterbox/src/chatterbox/tts.py")
|
| 718 |
|
| 719 |
if os.getenv("SPACE_ID"):
|
| 720 |
# Running in Hugging Face Spaces
|
|
@@ -739,6 +1097,11 @@ if __name__ == "__main__":
|
|
| 739 |
|
| 740 |
logger.info("π FastAPI: http://localhost:8000")
|
| 741 |
logger.info("π API Docs: http://localhost:8000/docs")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
|
| 743 |
# Start Gradio
|
| 744 |
demo = create_gradio_interface()
|
|
|
|
| 9 |
import logging
|
| 10 |
import requests
|
| 11 |
import io
|
| 12 |
+
import json
|
| 13 |
+
from typing import Optional, Dict, Any, List
|
| 14 |
from pathlib import Path
|
| 15 |
|
| 16 |
import gradio as gr
|
| 17 |
import spaces
|
| 18 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File
|
| 19 |
from fastapi.responses import StreamingResponse
|
| 20 |
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
from pydantic import BaseModel
|
|
|
|
| 32 |
MODEL = None
|
| 33 |
CHATTERBOX_AVAILABLE = False
|
| 34 |
|
| 35 |
+
# Storage directories. Both are created at import time so that later writes
# of generated audio and custom-voice files never hit a missing directory.
AUDIO_DIR = "generated_audio"
VOICES_DIR = "custom_voices"
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(VOICES_DIR, exist_ok=True)

# Voice storage (in-memory; both start empty and are filled at runtime)
audio_cache = {}
voice_library = {}

# Default/Built-in voices. Each entry is keyed by its own ``voice_id`` and
# points at a remote reference clip via ``audio_url``.
BUILTIN_VOICES = {
    "female_default": {
        "voice_id": "female_default",
        "name": "Female Default",
        "description": "Professional female voice",
        "audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
        "type": "builtin",
        "created_at": "2024-01-01T00:00:00Z",
    },
    "male_professional": {
        "voice_id": "male_professional",
        "name": "Male Professional",
        "description": "Confident male voice",
        "audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_professional.flac",
        "type": "builtin",
        "created_at": "2024-01-01T00:00:00Z",
    },
}
|
| 64 |
+
|
| 65 |
+
def load_voice_library():
    """Rebuild the global ``voice_library`` from built-ins plus saved voices.

    ``voice_library`` is replaced by a copy of ``BUILTIN_VOICES`` and then
    extended with the contents of ``<VOICES_DIR>/voices.json`` when that file
    exists. Any failure while reading or parsing the file is logged and
    leaves only the built-in voices loaded; nothing is raised to the caller.
    """
    global voice_library
    # Copy every entry, not just the outer dict, so that editing a loaded
    # voice can never write through into the BUILTIN_VOICES constant.
    voice_library = {voice_id: dict(info) for voice_id, info in BUILTIN_VOICES.items()}

    voices_json_path = os.path.join(VOICES_DIR, "voices.json")
    if not os.path.exists(voices_json_path):
        return

    try:
        with open(voices_json_path, 'r', encoding='utf-8') as f:
            custom_voices = json.load(f)
        # dict.update() also accepts an iterable of pairs, so a JSON array
        # such as ["ab", "cd"] would otherwise be merged as bogus voices.
        if not isinstance(custom_voices, dict):
            raise ValueError(
                f"voices.json must contain a JSON object, got {type(custom_voices).__name__}"
            )
        voice_library.update(custom_voices)
        logger.info(f"✅ Loaded {len(custom_voices)} custom voices from disk")
    except Exception as e:
        logger.error(f"❌ Error loading voice library: {e}")
|
| 79 |
+
|
| 80 |
+
def save_voice_library():
    """Persist the custom voices to ``<VOICES_DIR>/voices.json``.

    Only entries whose ``type`` is not ``"builtin"`` are written. The JSON is
    first written to a sibling ``.tmp`` file and then moved over the real
    file, so a failure part-way through serialisation cannot leave a
    truncated ``voices.json`` behind. Errors are logged, never raised.
    """
    voices_json_path = os.path.join(VOICES_DIR, "voices.json")
    tmp_path = voices_json_path + ".tmp"
    try:
        # Only save custom voices (not builtin)
        custom_voices = {k: v for k, v in voice_library.items() if v.get("type") != "builtin"}

        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(custom_voices, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, voices_json_path)
        logger.info(f"✅ Saved {len(custom_voices)} custom voices to disk")
    except Exception as e:
        logger.error(f"❌ Error saving voice library: {e}")
        # Best-effort removal of a half-written temp file.
        try:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
        except OSError:
            pass
|
| 92 |
+
|
| 93 |
+
def create_voice_from_audio(audio_file, voice_name, voice_description="Custom voice"):
    """Create a new custom voice from reference audio and register it.

    The audio is stored as ``<VOICES_DIR>/<voice_id>.wav``, a voice entry is
    added to ``voice_library`` and the library is saved to disk.

    Args:
        audio_file: One of
            * a ``(sample_rate, audio_data)`` tuple (Gradio audio format),
            * a filesystem path (``str`` / ``os.PathLike``) to an audio file,
            * raw sample data, which is written at the default 22050 Hz.
        voice_name: Display name stored in the voice entry.
        voice_description: Free-text description stored in the voice entry.

    Returns:
        ``(voice_id, voice_entry)`` on success, ``(None, None)`` on any
        failure (the error is logged, not raised).
    """
    try:
        if audio_file is None:
            raise ValueError("no audio provided")

        voice_id = f"voice_{int(time.time())}_{uuid.uuid4().hex[:8]}"

        # Save audio file
        audio_filename = f"{voice_id}.wav"
        audio_path = os.path.join(VOICES_DIR, audio_filename)

        # Convert and save audio
        # NOTE(review): assumes `sf` is the `soundfile` module imported
        # earlier in the file -- confirm against the import block.
        if isinstance(audio_file, tuple):
            # Gradio audio format (sample_rate, audio_data)
            sample_rate, audio_data = audio_file
            sf.write(audio_path, audio_data, sample_rate)
        elif isinstance(audio_file, (str, os.PathLike)):
            # A path is not sample data: decode it first, then re-encode as
            # WAV at the file's own sample rate.
            audio_data, sample_rate = sf.read(os.fspath(audio_file))
            sf.write(audio_path, audio_data, sample_rate)
        else:
            # Raw samples without a known rate
            sf.write(audio_path, audio_file, 22050)  # Default sample rate

        # Create voice entry
        voice_entry = {
            "voice_id": voice_id,
            "name": voice_name,
            "description": voice_description,
            "audio_path": audio_path,
            "type": "custom",
            # The trailing "Z" means UTC, so format from gmtime(), not the
            # local time that strftime() uses by default.
            "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }

        # Add to voice library
        voice_library[voice_id] = voice_entry
        save_voice_library()

        logger.info(f"✅ Created voice: {voice_name} ({voice_id})")
        return voice_id, voice_entry

    except Exception as e:
        logger.error(f"❌ Error creating voice: {e}")
        return None, None
|
| 131 |
+
|
| 132 |
+
def download_audio_from_url(url):
    """Download audio from ``url`` and save it to a temporary ``.flac`` file.

    Args:
        url: Location of the audio to fetch with an HTTP GET (30 s timeout).

    Returns:
        Path of the temporary file on success; the file is created with
        ``delete=False``, so the caller is responsible for removing it.
        ``None`` when the response status is not 200 or any error occurs
        (the problem is logged, not raised).
    """
    temp_path = None
    try:
        logger.info(f"📥 Downloading reference audio from: {url}")
        response = requests.get(url, timeout=30, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        if response.status_code != 200:
            logger.error(f"❌ HTTP {response.status_code} when downloading audio")
            return None

        # The context manager guarantees the handle is closed even if the
        # write fails (e.g. disk full).
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(response.content)

        logger.info(f"✅ Audio downloaded to: {temp_path}")
        return temp_path

    except Exception as e:
        logger.error(f"❌ Error downloading audio from URL: {e}")
        # Do not leave a partially written temp file behind.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
        return None
|
| 155 |
+
|
| 156 |
+
def get_voice_audio_path(voice_id):
    """Return a local audio path for ``voice_id``, or ``None`` if unavailable.

    * Unknown ``voice_id``: ``None``.
    * ``custom`` voice with an ``audio_path``: that path if the file exists,
      otherwise a warning is logged and ``None`` is returned.
    * ``builtin`` voice with an ``audio_url``: the clip is fetched with
      ``download_audio_from_url`` and its result (a temporary file path, or
      ``None`` on failure) is returned as-is.
    * Anything else: ``None``.
    """
    voice_info = voice_library.get(voice_id)
    if voice_info is None:
        return None

    voice_type = voice_info.get("type")

    # If it's a custom voice with local file
    if voice_type == "custom" and "audio_path" in voice_info:
        audio_path = voice_info["audio_path"]
        if os.path.exists(audio_path):
            return audio_path
        logger.warning(f"⚠️ Voice audio file not found: {audio_path}")
        return None

    # If it's a builtin voice with URL
    if voice_type == "builtin" and "audio_url" in voice_info:
        return download_audio_from_url(voice_info["audio_url"])

    return None
|
| 177 |
|
| 178 |
def load_chatterbox_model():
|
| 179 |
"""Try multiple ways to load ChatterboxTTS from Resemble AI"""
|
|
|
|
| 220 |
except Exception as e:
|
| 221 |
logger.warning(f"Method 3 failed with error: {e}")
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
# If we get here, the GitHub repo might have a different structure
|
| 224 |
logger.error("β Could not load ChatterboxTTS from Resemble AI repository")
|
| 225 |
logger.error("π‘ The GitHub repo might have a different structure than expected")
|
|
|
|
| 228 |
|
| 229 |
return False
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
def get_or_load_model():
|
| 232 |
"""Load ChatterboxTTS model if not already loaded"""
|
| 233 |
global MODEL
|
|
|
|
| 240 |
logger.info("β
ChatterboxTTS model loaded successfully")
|
| 241 |
else:
|
| 242 |
logger.error("β Failed to load ChatterboxTTS - using fallback")
|
|
|
|
| 243 |
create_fallback_model()
|
| 244 |
return MODEL
|
| 245 |
|
|
|
|
| 298 |
"""Generate unique ID"""
|
| 299 |
return str(uuid.uuid4())
|
| 300 |
|
| 301 |
+
# Load voice library at startup
|
| 302 |
+
load_voice_library()
|
| 303 |
+
|
| 304 |
# Pydantic models for API
|
| 305 |
class TTSRequest(BaseModel):
    """Request body for the speech-synthesis endpoint."""

    # Text to synthesize.
    text: str
    # Key of the voice to use; defaults to the built-in "female_default".
    voice_id: Optional[str] = "female_default"
    # Generation controls; the values below are the defaults applied when a
    # field is omitted from the request.
    exaggeration: Optional[float] = 0.5
    temperature: Optional[float] = 0.8
    cfg_weight: Optional[float] = 0.5
    seed: Optional[int] = 0
|
| 312 |
|
| 313 |
+
class VoiceCreateRequest(BaseModel):
    """Metadata supplied when creating a new custom voice."""

    # Display name of the new voice.
    voice_name: str
    # Optional free-text description.
    voice_description: Optional[str] = "Custom voice"
|
| 316 |
+
|
| 317 |
+
class VoiceInfo(BaseModel):
    """Public description of one voice, as returned by the voices listing."""

    voice_id: str
    name: str
    description: str
    # "builtin" or "custom" for the entries created in this file.
    type: str
    # Creation timestamp string as stored in the voice entry.
    created_at: str
|
| 323 |
+
|
| 324 |
class TTSResponse(BaseModel):
|
| 325 |
success: bool
|
| 326 |
audio_id: Optional[str] = None
|
|
|
|
| 342 |
@spaces.GPU
|
| 343 |
def generate_tts_audio(
|
| 344 |
text_input: str,
|
| 345 |
+
voice_id: str,
|
| 346 |
exaggeration_input: float,
|
| 347 |
temperature_input: float,
|
| 348 |
seed_num_input: int,
|
| 349 |
cfgw_input: float
|
| 350 |
) -> tuple[int, np.ndarray]:
|
| 351 |
"""
|
| 352 |
+
Generate TTS audio using ChatterboxTTS model with voice ID
|
| 353 |
"""
|
| 354 |
current_model = get_or_load_model()
|
| 355 |
|
|
|
|
| 360 |
set_seed(int(seed_num_input))
|
| 361 |
|
| 362 |
logger.info(f"π΅ Generating audio for: '{text_input[:50]}...'")
|
| 363 |
+
logger.info(f"π Using voice: {voice_id}")
|
| 364 |
|
| 365 |
if not CHATTERBOX_AVAILABLE:
|
| 366 |
logger.warning("π¨ USING FALLBACK - Real ChatterboxTTS not found!")
|
|
|
|
| 367 |
|
| 368 |
+
# Get audio path for the voice
|
| 369 |
+
audio_prompt_path = get_voice_audio_path(voice_id)
|
| 370 |
temp_audio_file = None
|
| 371 |
|
| 372 |
try:
|
| 373 |
+
if audio_prompt_path and audio_prompt_path.startswith('/tmp/'):
|
| 374 |
+
# It's a temporary file from URL download
|
| 375 |
+
temp_audio_file = audio_prompt_path
|
| 376 |
+
|
| 377 |
+
if audio_prompt_path:
|
| 378 |
+
voice_name = voice_library.get(voice_id, {}).get("name", voice_id)
|
| 379 |
+
logger.info(f"β
Using voice '{voice_name}' audio: {audio_prompt_path}")
|
| 380 |
+
else:
|
| 381 |
+
logger.warning(f"β οΈ Could not load audio for voice {voice_id}, using default")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
# Generate audio
|
| 384 |
wav = current_model.generate(
|
|
|
|
| 400 |
logger.error(f"β Audio generation failed: {e}")
|
| 401 |
raise
|
| 402 |
finally:
|
| 403 |
+
# Clean up temporary file (only if it's a downloaded URL)
|
| 404 |
+
if temp_audio_file and temp_audio_file.startswith('/tmp/') and os.path.exists(temp_audio_file):
|
| 405 |
try:
|
| 406 |
os.unlink(temp_audio_file)
|
| 407 |
logger.info(f"ποΈ Cleaned up temporary file: {temp_audio_file}")
|
|
|
|
| 410 |
|
| 411 |
# FastAPI app for API endpoints
|
| 412 |
app = FastAPI(
|
| 413 |
+
title="ChatterboxTTS Voice Manager API",
|
| 414 |
+
description="Advanced text-to-speech with voice cloning and management",
|
| 415 |
+
version="2.0.0"
|
| 416 |
)
|
| 417 |
|
| 418 |
app.add_middleware(
|
|
|
|
| 427 |
async def root():
|
| 428 |
"""API status endpoint"""
|
| 429 |
return {
|
| 430 |
+
"service": "ChatterboxTTS Voice Manager API",
|
| 431 |
+
"version": "2.0.0",
|
| 432 |
"status": "operational" if MODEL else "model_loading",
|
| 433 |
"model_loaded": MODEL is not None,
|
| 434 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
| 435 |
"device": DEVICE,
|
| 436 |
+
"voices_available": len(voice_library),
|
| 437 |
"message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
|
| 438 |
"endpoints": {
|
| 439 |
"synthesize": "/api/tts/synthesize",
|
| 440 |
+
"voices": "/api/voices",
|
| 441 |
+
"create_voice": "/api/voices/create",
|
| 442 |
"audio": "/api/audio/{audio_id}",
|
| 443 |
"health": "/health"
|
| 444 |
}
|
|
|
|
| 452 |
"model_loaded": MODEL is not None,
|
| 453 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
| 454 |
"device": DEVICE,
|
| 455 |
+
"voices_total": len(voice_library),
|
| 456 |
"timestamp": time.time(),
|
| 457 |
"warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
|
| 458 |
}
|
| 459 |
|
| 460 |
+
@app.get("/api/voices")
async def get_voices():
    """Return every voice in the library together with per-type counts.

    Each library entry is converted to a ``VoiceInfo`` model. The response
    carries the full list, its length, and how many entries have type
    ``"builtin"`` and ``"custom"`` respectively.
    """
    catalog = [
        VoiceInfo(
            voice_id=key,
            name=entry["name"],
            description=entry["description"],
            type=entry["type"],
            created_at=entry["created_at"],
        )
        for key, entry in voice_library.items()
    ]

    # Tally the two known voice types in separate passes over the catalog.
    builtin_count = sum(1 for item in catalog if item.type == "builtin")
    custom_count = sum(1 for item in catalog if item.type == "custom")

    return {
        "voices": catalog,
        "total": len(catalog),
        "builtin": builtin_count,
        "custom": custom_count,
    }
|
| 479 |
+
|
| 480 |
+
@app.post("/api/voices/create")
async def create_voice_api(
    voice_name: str,
    voice_description: str = "Custom voice",
    audio_file: UploadFile = File(...)
):
    """Create a new voice from an uploaded audio sample.

    The upload is spooled to a temporary ``.wav`` file, handed to
    ``create_voice_from_audio`` (defined elsewhere in this module), and the
    temporary file is always removed afterwards, including when voice
    creation raises.

    Args:
        voice_name: Display name for the new voice.
        voice_description: Free-text description stored with the voice.
        audio_file: Uploaded audio sample.

    Returns:
        A dict with ``success``, ``voice_id``, ``message`` and ``voice_info``.

    Raises:
        HTTPException: 500 when no voice id is produced or any step fails.
    """
    temp_path = None
    try:
        # Read uploaded file
        audio_data = await audio_file.read()

        # Spool to disk; the context manager guarantees the handle is closed
        # even if the write fails part-way.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(audio_data)

        # Create voice
        voice_id, voice_entry = create_voice_from_audio(
            temp_path,
            voice_name,
            voice_description
        )

        if not voice_id:
            raise HTTPException(status_code=500, detail="Failed to create voice")

        return {
            "success": True,
            "voice_id": voice_id,
            "message": f"Voice '{voice_name}' created successfully",
            "voice_info": voice_entry
        }

    except HTTPException:
        # Pass deliberate HTTP errors through untouched, matching the
        # handling used by the synthesize endpoint.
        raise
    except Exception as e:
        logger.error(f"❌ Voice creation failed: {e}")
        raise HTTPException(status_code=500, detail=f"Voice creation failed: {str(e)}") from e
    finally:
        # Always remove the spooled upload so failed requests do not leak
        # temp files. Cleanup problems are logged, never raised, so they
        # cannot mask the real outcome of the request.
        if temp_path is not None and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary upload {temp_path}: {cleanup_error}")
|
| 519 |
+
|
| 520 |
+
@app.delete("/api/voices/{voice_id}")
async def delete_voice(voice_id: str):
    """Delete a custom voice and its stored audio sample.

    Args:
        voice_id: Key of the voice in ``voice_library``.

    Returns:
        A dict with ``success`` and a confirmation ``message``.

    Raises:
        HTTPException: 404 if the voice is unknown, 400 if it is a builtin
            voice, 500 if removing the file or saving the library fails.
    """
    if voice_id not in voice_library:
        raise HTTPException(status_code=404, detail="Voice not found")

    voice_info = voice_library[voice_id]

    if voice_info.get("type") == "builtin":
        raise HTTPException(status_code=400, detail="Cannot delete builtin voices")

    try:
        # Resolve the display name before touching anything on disk, so a
        # malformed entry cannot leave the voice half-deleted. The fallback
        # to the id mirrors the lookup used when generating audio.
        voice_name = voice_info.get("name", voice_id)

        # Delete audio file. Attempt the unlink directly instead of checking
        # existence first: the file may already have been removed by another
        # code path (e.g. the UI delete handler), which is not an error.
        audio_path = voice_info.get("audio_path")
        if audio_path:
            try:
                os.unlink(audio_path)
            except FileNotFoundError:
                pass

        # Remove from library
        voice_library.pop(voice_id, None)
        save_voice_library()

        return {
            "success": True,
            "message": f"Voice '{voice_name}' deleted successfully"
        }

    except Exception as e:
        logger.error(f"❌ Voice deletion failed: {e}")
        raise HTTPException(status_code=500, detail=f"Voice deletion failed: {str(e)}") from e
|
| 549 |
+
|
| 550 |
@app.post("/api/tts/synthesize", response_model=TTSResponse)
|
| 551 |
async def synthesize_speech(request: TTSRequest):
|
| 552 |
"""
|
| 553 |
+
Synthesize speech from text using voice ID
|
| 554 |
"""
|
| 555 |
try:
|
| 556 |
if MODEL is None:
|
|
|
|
| 562 |
if len(request.text) > 500:
|
| 563 |
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
| 564 |
|
| 565 |
+
if request.voice_id not in voice_library:
|
| 566 |
+
raise HTTPException(status_code=404, detail=f"Voice '{request.voice_id}' not found")
|
| 567 |
+
|
| 568 |
start_time = time.time()
|
| 569 |
|
| 570 |
+
# Generate audio using voice ID
|
| 571 |
+
sample_rate, audio_data = generate_tts_audio(
|
| 572 |
+
request.text,
|
| 573 |
+
request.voice_id,
|
| 574 |
+
request.exaggeration,
|
| 575 |
+
request.temperature,
|
| 576 |
+
request.seed,
|
| 577 |
+
request.cfg_weight
|
| 578 |
+
)
|
| 579 |
|
| 580 |
+
generation_time = time.time() - start_time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
|
| 582 |
+
# Save audio file
|
| 583 |
+
audio_id = generate_id()
|
| 584 |
+
audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
|
| 585 |
+
sf.write(audio_path, audio_data, sample_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
+
# Cache audio info
|
| 588 |
+
voice_name = voice_library[request.voice_id]["name"]
|
| 589 |
+
audio_cache[audio_id] = {
|
| 590 |
+
"path": audio_path,
|
| 591 |
+
"text": request.text,
|
| 592 |
+
"voice_id": request.voice_id,
|
| 593 |
+
"voice_name": voice_name,
|
| 594 |
+
"sample_rate": sample_rate,
|
| 595 |
+
"duration": len(audio_data) / sample_rate,
|
| 596 |
+
"generated_at": time.time(),
|
| 597 |
+
"generation_time": generation_time,
|
| 598 |
+
"real_chatterbox": CHATTERBOX_AVAILABLE
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
message = f"Speech synthesized successfully using voice '{voice_name}'"
|
| 602 |
+
if not CHATTERBOX_AVAILABLE:
|
| 603 |
+
message += " (using fallback - upload ChatterboxTTS for real synthesis)"
|
| 604 |
+
|
| 605 |
+
logger.info(f"β
Audio saved: {audio_id} ({generation_time:.2f}s) with voice '{voice_name}'")
|
| 606 |
+
|
| 607 |
+
return TTSResponse(
|
| 608 |
+
success=True,
|
| 609 |
+
audio_id=audio_id,
|
| 610 |
+
message=message,
|
| 611 |
+
sample_rate=sample_rate,
|
| 612 |
+
duration=len(audio_data) / sample_rate
|
| 613 |
+
)
|
| 614 |
|
| 615 |
except HTTPException:
|
| 616 |
raise
|
|
|
|
| 620 |
|
| 621 |
@app.get("/api/audio/{audio_id}")
|
| 622 |
async def get_audio(audio_id: str):
|
| 623 |
+
"""Download generated audio file"""
|
|
|
|
|
|
|
| 624 |
if audio_id not in audio_cache:
|
| 625 |
raise HTTPException(status_code=404, detail="Audio not found")
|
| 626 |
|
|
|
|
| 644 |
|
| 645 |
@app.get("/api/audio/{audio_id}/info")
|
| 646 |
async def get_audio_info(audio_id: str):
|
| 647 |
+
"""Get audio file information"""
|
|
|
|
|
|
|
| 648 |
if audio_id not in audio_cache:
|
| 649 |
raise HTTPException(status_code=404, detail="Audio not found")
|
| 650 |
|
|
|
|
| 652 |
|
| 653 |
@app.get("/api/audio")
|
| 654 |
async def list_audio():
|
| 655 |
+
"""List all generated audio files"""
|
|
|
|
|
|
|
| 656 |
return {
|
| 657 |
"audio_files": [
|
| 658 |
{
|
| 659 |
"audio_id": audio_id,
|
| 660 |
"text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
|
| 661 |
+
"voice_name": info.get("voice_name", "Unknown"),
|
| 662 |
"duration": info["duration"],
|
| 663 |
"generated_at": info["generated_at"],
|
| 664 |
"real_chatterbox": info.get("real_chatterbox", False)
|
|
|
|
| 670 |
|
| 671 |
# Gradio interface
|
| 672 |
def create_gradio_interface():
|
| 673 |
+
"""Create Gradio interface with voice management"""
|
| 674 |
+
|
| 675 |
+
def get_voice_choices():
|
| 676 |
+
"""Get voice choices for dropdown"""
|
| 677 |
+
choices = []
|
| 678 |
+
for voice_id, voice_info in voice_library.items():
|
| 679 |
+
voice_type = "π§" if voice_info["type"] == "builtin" else "π"
|
| 680 |
+
choices.append((f"{voice_type} {voice_info['name']} - {voice_info['description']}", voice_id))
|
| 681 |
+
return choices
|
| 682 |
+
|
| 683 |
+
def refresh_voice_choices():
    """Build a dropdown update carrying the current voice choices."""
    latest_choices = get_voice_choices()
    return gr.update(choices=latest_choices)
|
| 686 |
+
|
| 687 |
+
def create_voice_ui(voice_name, voice_description, audio_file):
|
| 688 |
+
"""Create voice from UI"""
|
| 689 |
+
try:
|
| 690 |
+
if not voice_name.strip():
|
| 691 |
+
return "β Please enter a voice name", gr.update()
|
| 692 |
+
|
| 693 |
+
if audio_file is None:
|
| 694 |
+
return "β Please upload an audio file", gr.update()
|
| 695 |
+
|
| 696 |
+
voice_id, voice_entry = create_voice_from_audio(
|
| 697 |
+
audio_file,
|
| 698 |
+
voice_name.strip(),
|
| 699 |
+
voice_description.strip() or "Custom voice"
|
| 700 |
+
)
|
| 701 |
+
|
| 702 |
+
if voice_id:
|
| 703 |
+
updated_choices = get_voice_choices()
|
| 704 |
+
return (
|
| 705 |
+
f"β
Voice '{voice_name}' created successfully!\n"
|
| 706 |
+
f"π Voice ID: {voice_id}\n"
|
| 707 |
+
f"π Audio saved and ready to use\n"
|
| 708 |
+
f"π Available in voice selection dropdown",
|
| 709 |
+
gr.update(choices=updated_choices, value=voice_id)
|
| 710 |
+
)
|
| 711 |
+
else:
|
| 712 |
+
return "β Failed to create voice", gr.update()
|
| 713 |
+
|
| 714 |
+
except Exception as e:
|
| 715 |
+
logger.error(f"UI voice creation failed: {e}")
|
| 716 |
+
return f"β Voice creation failed: {str(e)}", gr.update()
|
| 717 |
+
|
| 718 |
+
def generate_speech_ui(text, voice_id, exag, temp, seed_val, cfg):
|
| 719 |
+
"""Generate speech from UI using voice ID"""
|
| 720 |
+
try:
|
| 721 |
+
if not text.strip():
|
| 722 |
+
return None, "β Please enter some text"
|
| 723 |
+
|
| 724 |
+
if len(text) > 300:
|
| 725 |
+
return None, "β Text too long (max 300 characters)"
|
| 726 |
+
|
| 727 |
+
if not voice_id or voice_id not in voice_library:
|
| 728 |
+
return None, "β Please select a valid voice"
|
| 729 |
+
|
| 730 |
+
start_time = time.time()
|
| 731 |
+
|
| 732 |
+
# Generate audio using voice ID
|
| 733 |
+
sample_rate, audio_data = generate_tts_audio(
|
| 734 |
+
text, voice_id, exag, temp, int(seed_val), cfg
|
| 735 |
+
)
|
| 736 |
+
|
| 737 |
+
generation_time = time.time() - start_time
|
| 738 |
+
duration = len(audio_data) / sample_rate
|
| 739 |
+
|
| 740 |
+
voice_name = voice_library[voice_id]["name"]
|
| 741 |
+
voice_type = voice_library[voice_id]["type"]
|
| 742 |
+
|
| 743 |
+
if CHATTERBOX_AVAILABLE:
|
| 744 |
+
status = f"""β
Real ChatterboxTTS synthesis completed!
|
| 745 |
+
|
| 746 |
+
π Voice: {voice_name} ({voice_type})
|
| 747 |
+
β±οΈ Generation time: {generation_time:.2f}s
|
| 748 |
+
π΅ Audio duration: {duration:.2f}s
|
| 749 |
+
π Sample rate: {sample_rate} Hz
|
| 750 |
+
π Audio samples: {len(audio_data):,}
|
| 751 |
+
"""
|
| 752 |
+
else:
|
| 753 |
+
status = f"""β οΈ Fallback audio generated (beep sound)
|
| 754 |
+
|
| 755 |
+
π¨ This is NOT real speech synthesis!
|
| 756 |
+
π Voice: {voice_name} ({voice_type})
|
| 757 |
+
π¦ Upload ChatterboxTTS package for real synthesis
|
| 758 |
+
β±οΈ Generation time: {generation_time:.2f}s
|
| 759 |
+
π΅ Audio duration: {duration:.2f}s
|
| 760 |
+
|
| 761 |
+
π‘ To fix: Upload your ChatterboxTTS files to this Space
|
| 762 |
+
"""
|
| 763 |
+
|
| 764 |
+
return (sample_rate, audio_data), status
|
| 765 |
+
|
| 766 |
+
except Exception as e:
|
| 767 |
+
logger.error(f"UI generation failed: {e}")
|
| 768 |
+
return None, f"β Generation failed: {str(e)}"
|
| 769 |
+
|
| 770 |
+
def delete_voice_ui(voice_id):
|
| 771 |
+
"""Delete voice from UI"""
|
| 772 |
+
try:
|
| 773 |
+
if not voice_id or voice_id not in voice_library:
|
| 774 |
+
return "β Please select a voice to delete", gr.update()
|
| 775 |
+
|
| 776 |
+
voice_info = voice_library[voice_id]
|
| 777 |
+
|
| 778 |
+
if voice_info.get("type") == "builtin":
|
| 779 |
+
return "β Cannot delete builtin voices", gr.update()
|
| 780 |
+
|
| 781 |
+
voice_name = voice_info["name"]
|
| 782 |
+
|
| 783 |
+
# Delete audio file
|
| 784 |
+
if "audio_path" in voice_info and os.path.exists(voice_info["audio_path"]):
|
| 785 |
+
os.unlink(voice_info["audio_path"])
|
| 786 |
+
|
| 787 |
+
# Remove from library
|
| 788 |
+
del voice_library[voice_id]
|
| 789 |
+
save_voice_library()
|
| 790 |
+
|
| 791 |
+
updated_choices = get_voice_choices()
|
| 792 |
+
return (
|
| 793 |
+
f"β
Voice '{voice_name}' deleted successfully",
|
| 794 |
+
gr.update(choices=updated_choices, value=updated_choices[0][1] if updated_choices else None)
|
| 795 |
+
)
|
| 796 |
+
|
| 797 |
+
except Exception as e:
|
| 798 |
+
logger.error(f"UI voice deletion failed: {e}")
|
| 799 |
+
return f"β Voice deletion failed: {str(e)}", gr.update()
|
| 800 |
|
| 801 |
+
with gr.Blocks(title="ChatterboxTTS Voice Manager", theme=gr.themes.Soft()) as demo:
|
| 802 |
|
| 803 |
# Status indicator at the top
|
| 804 |
if CHATTERBOX_AVAILABLE:
|
|
|
|
| 815 |
""")
|
| 816 |
|
| 817 |
gr.Markdown("""
|
| 818 |
+
# π΅ ChatterboxTTS Voice Manager
|
| 819 |
|
| 820 |
+
**Advanced text-to-speech with custom voice cloning and voice library management**
|
| 821 |
""")
|
| 822 |
|
| 823 |
+
with gr.Tabs():
|
| 824 |
+
# Text-to-Speech Tab
|
| 825 |
+
with gr.TabItem("π΅ Generate Speech"):
|
| 826 |
+
with gr.Row():
|
| 827 |
+
with gr.Column():
|
| 828 |
+
text_input = gr.Textbox(
|
| 829 |
+
value="Hello, this is ChatterboxTTS with custom voice cloning. I can speak in any voice you train me with!",
|
| 830 |
+
label="Text to synthesize (max 300 characters)",
|
| 831 |
+
max_lines=5,
|
| 832 |
+
placeholder="Enter your text here..."
|
| 833 |
+
)
|
| 834 |
+
|
| 835 |
+
voice_selector = gr.Dropdown(
|
| 836 |
+
label="π Select Voice",
|
| 837 |
+
choices=get_voice_choices(),
|
| 838 |
+
value=list(voice_library.keys())[0] if voice_library else None,
|
| 839 |
+
interactive=True,
|
| 840 |
+
info="Choose from builtin voices (π§) or your custom voices (π)"
|
| 841 |
+
)
|
| 842 |
+
|
| 843 |
+
with gr.Row():
|
| 844 |
+
generate_btn = gr.Button("π΅ Generate Speech", variant="primary")
|
| 845 |
+
refresh_voices_btn = gr.Button("π Refresh Voices", size="sm")
|
| 846 |
+
|
| 847 |
+
with gr.Row():
|
| 848 |
+
exaggeration = gr.Slider(
|
| 849 |
+
0.25, 2,
|
| 850 |
+
step=0.05,
|
| 851 |
+
label="Exaggeration",
|
| 852 |
+
value=0.5,
|
| 853 |
+
info="Controls expressiveness (0.5 = neutral)"
|
| 854 |
+
)
|
| 855 |
+
|
| 856 |
+
cfg_weight = gr.Slider(
|
| 857 |
+
0.2, 1,
|
| 858 |
+
step=0.05,
|
| 859 |
+
label="CFG Weight",
|
| 860 |
+
value=0.5,
|
| 861 |
+
info="Controls pace and clarity"
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
with gr.Accordion("Advanced Settings", open=False):
|
| 865 |
+
temperature = gr.Slider(
|
| 866 |
+
0.05, 5,
|
| 867 |
+
step=0.05,
|
| 868 |
+
label="Temperature",
|
| 869 |
+
value=0.8,
|
| 870 |
+
info="Controls randomness"
|
| 871 |
+
)
|
| 872 |
+
|
| 873 |
+
seed = gr.Number(
|
| 874 |
+
value=0,
|
| 875 |
+
label="Seed (0 = random)",
|
| 876 |
+
info="Set to non-zero for reproducible results"
|
| 877 |
+
)
|
| 878 |
+
|
| 879 |
+
with gr.Column():
|
| 880 |
+
audio_output = gr.Audio(label="π Generated Speech")
|
| 881 |
+
|
| 882 |
+
status_text = gr.Textbox(
|
| 883 |
+
label="π Generation Status",
|
| 884 |
+
interactive=False,
|
| 885 |
+
lines=8,
|
| 886 |
+
placeholder="Select a voice and click 'Generate Speech' to start..."
|
| 887 |
+
)
|
| 888 |
|
| 889 |
+
# Voice Management Tab
|
| 890 |
+
with gr.TabItem("π Voice Library"):
|
| 891 |
+
with gr.Row():
|
| 892 |
+
with gr.Column():
|
| 893 |
+
gr.Markdown("### π Available Voices")
|
| 894 |
+
|
| 895 |
+
voices_display = gr.HTML(
|
| 896 |
+
value=f"""
|
| 897 |
+
<div style="max-height: 300px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
|
| 898 |
+
{''.join([f"<p><strong>{voice_info['name']}</strong> ({voice_info['type']})<br><small>{voice_info['description']}</small></p>" for voice_info in voice_library.values()])}
|
| 899 |
+
</div>
|
| 900 |
+
"""
|
| 901 |
+
)
|
| 902 |
+
|
| 903 |
+
gr.Markdown("### ποΈ Delete Voice")
|
| 904 |
+
delete_voice_selector = gr.Dropdown(
|
| 905 |
+
label="Select voice to delete",
|
| 906 |
+
choices=[(f"{info['name']} ({info['type']})", vid) for vid, info in voice_library.items() if info['type'] == 'custom'],
|
| 907 |
+
value=None
|
| 908 |
+
)
|
| 909 |
+
|
| 910 |
+
delete_voice_btn = gr.Button("ποΈ Delete Selected Voice", variant="stop")
|
| 911 |
+
delete_status = gr.Textbox(label="Delete Status", interactive=False)
|
| 912 |
+
|
| 913 |
+
with gr.Column():
|
| 914 |
+
gr.Markdown("### β Create New Voice")
|
| 915 |
+
|
| 916 |
+
new_voice_name = gr.Textbox(
|
| 917 |
+
label="Voice Name",
|
| 918 |
+
placeholder="e.g., 'John's Voice', 'Narrator Voice'",
|
| 919 |
+
value=""
|
| 920 |
+
)
|
| 921 |
+
|
| 922 |
+
new_voice_description = gr.Textbox(
|
| 923 |
+
label="Voice Description",
|
| 924 |
+
placeholder="e.g., 'Professional male voice', 'Warm female narrator'",
|
| 925 |
+
value=""
|
| 926 |
+
)
|
| 927 |
+
|
| 928 |
+
new_voice_audio = gr.Audio(
|
| 929 |
+
label="Upload Voice Sample",
|
| 930 |
+
type="numpy",
|
| 931 |
+
info="Upload 5-30 seconds of clear speech"
|
| 932 |
+
)
|
| 933 |
+
|
| 934 |
+
create_voice_btn = gr.Button("π― Create Voice", variant="primary")
|
| 935 |
+
|
| 936 |
+
create_status = gr.Textbox(
|
| 937 |
+
label="π Creation Status",
|
| 938 |
+
interactive=False,
|
| 939 |
+
lines=6
|
| 940 |
+
)
|
| 941 |
|
| 942 |
+
# Voice Library Info Tab
|
| 943 |
+
with gr.TabItem("π Voice Guide"):
|
| 944 |
+
gr.Markdown(f"""
|
| 945 |
+
## π Voice Library Management
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
|
| 947 |
+
### π Current Library Status
|
| 948 |
+
- **Total Voices**: {len(voice_library)}
|
| 949 |
+
- **Builtin Voices**: {len([v for v in voice_library.values() if v['type'] == 'builtin'])}
|
| 950 |
+
- **Custom Voices**: {len([v for v in voice_library.values() if v['type'] == 'custom'])}
|
|
|
|
|
|
|
| 951 |
|
| 952 |
+
### π§ Builtin Voices
|
| 953 |
+
These are pre-configured voices that come with the system:
|
| 954 |
+
{chr(10).join([f"- **{voice_info['name']}**: {voice_info['description']}" for voice_info in voice_library.values() if voice_info['type'] == 'builtin'])}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
|
| 956 |
+
### π― Creating Custom Voices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
|
| 958 |
+
#### π Best Practices:
|
| 959 |
+
1. **Audio Quality**: Use clear, noise-free recordings
|
| 960 |
+
2. **Duration**: 5-30 seconds of natural speech
|
| 961 |
+
3. **Content**: Normal conversational speech works best
|
| 962 |
+
4. **Format**: WAV, MP3, or FLAC files supported
|
| 963 |
+
5. **Voice Consistency**: Use the same speaker throughout
|
| 964 |
|
| 965 |
+
#### π€ Recording Tips:
|
| 966 |
+
- Record in a quiet environment
|
| 967 |
+
- Speak naturally and clearly
|
| 968 |
+
- Avoid background noise
|
| 969 |
+
- Use a decent microphone if possible
|
| 970 |
+
- Read a paragraph of normal text
|
| 971 |
|
| 972 |
+
#### π Voice Management:
|
| 973 |
+
- **Create**: Upload audio + provide name and description
|
| 974 |
+
- **Use**: Select from dropdown in speech generation
|
| 975 |
+
- **Delete**: Remove custom voices you no longer need
|
| 976 |
+
- **Persistent**: Custom voices are saved permanently
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
|
| 978 |
+
### π Usage Workflow:
|
| 979 |
+
1. **Upload Voice Sample** β Create custom voice
|
| 980 |
+
2. **Select Voice** β Choose from library
|
| 981 |
+
3. **Generate Speech** β Use selected voice for TTS
|
| 982 |
+
4. **Manage Library** β Add, delete, organize voices
|
| 983 |
|
| 984 |
+
### π API Integration:
|
| 985 |
+
```python
|
| 986 |
+
# List voices
|
| 987 |
+
GET /api/voices
|
| 988 |
|
| 989 |
+
# Create voice
|
| 990 |
+
POST /api/voices/create
|
|
|
|
|
|
|
| 991 |
|
| 992 |
+
# Generate speech with voice
|
| 993 |
+
POST /api/tts/synthesize
|
| 994 |
+
{{
|
| 995 |
+
"text": "Hello world",
|
| 996 |
+
"voice_id": "your_voice_id"
|
| 997 |
+
}}
|
| 998 |
|
| 999 |
+
# Delete voice
|
| 1000 |
+
DELETE /api/voices/voice_id
|
| 1001 |
+
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1002 |
|
| 1003 |
+
### π‘ Pro Tips:
|
| 1004 |
+
- **Voice Naming**: Use descriptive names like "John_Professional" or "Sarah_Narrator"
|
| 1005 |
+
- **Voice Testing**: Generate short test phrases after creating voices
|
| 1006 |
+
- **Voice Backup**: Custom voices are saved to disk automatically
|
| 1007 |
+
- **Voice Sharing**: Voice IDs can be shared via API
|
| 1008 |
+
""")
|
| 1009 |
|
| 1010 |
+
# Event handlers
|
| 1011 |
generate_btn.click(
|
| 1012 |
fn=generate_speech_ui,
|
| 1013 |
+
inputs=[text_input, voice_selector, exaggeration, temperature, seed, cfg_weight],
|
| 1014 |
outputs=[audio_output, status_text]
|
| 1015 |
)
|
| 1016 |
|
| 1017 |
+
refresh_voices_btn.click(
|
| 1018 |
+
fn=refresh_voice_choices,
|
| 1019 |
+
outputs=[voice_selector]
|
| 1020 |
+
)
|
| 1021 |
+
|
| 1022 |
+
create_voice_btn.click(
|
| 1023 |
+
fn=create_voice_ui,
|
| 1024 |
+
inputs=[new_voice_name, new_voice_description, new_voice_audio],
|
| 1025 |
+
outputs=[create_status, voice_selector]
|
| 1026 |
+
)
|
| 1027 |
+
|
| 1028 |
+
delete_voice_btn.click(
|
| 1029 |
+
fn=delete_voice_ui,
|
| 1030 |
+
inputs=[delete_voice_selector],
|
| 1031 |
+
outputs=[delete_status, voice_selector]
|
| 1032 |
+
)
|
| 1033 |
+
|
| 1034 |
+
# System info with voice library status
|
| 1035 |
model_status = "β
Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "β οΈ Fallback Model (Beep Sounds)"
|
| 1036 |
chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
|
| 1037 |
|
|
|
|
| 1039 |
### π System Status
|
| 1040 |
- **Model**: {model_status}
|
| 1041 |
- **Device**: {DEVICE}
|
|
|
|
| 1042 |
- **ChatterboxTTS**: {chatterbox_status}
|
| 1043 |
+
- **Voice Library**: {len(voice_library)} voices loaded
|
| 1044 |
+
- **Generated Files**: {len(audio_cache)}
|
| 1045 |
+
- **Storage**: `{VOICES_DIR}/` for voices, `{AUDIO_DIR}/` for output
|
| 1046 |
|
| 1047 |
{'''### π Production Ready!
|
| 1048 |
+
Your ChatterboxTTS model is loaded with voice management system.''' if CHATTERBOX_AVAILABLE else '''### β οΈ Action Required
|
| 1049 |
**You're hearing beep sounds because ChatterboxTTS isn't loaded.**
|
| 1050 |
|
| 1051 |
+
Voice management is working, but you need ChatterboxTTS for real synthesis.'''}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1052 |
""")
|
| 1053 |
|
| 1054 |
return demo
|
| 1055 |
|
| 1056 |
# Main execution
|
| 1057 |
if __name__ == "__main__":
|
| 1058 |
+
logger.info("π Starting ChatterboxTTS Voice Management Service...")
|
| 1059 |
|
| 1060 |
# Model status
|
| 1061 |
if CHATTERBOX_AVAILABLE and MODEL:
|
|
|
|
| 1068 |
logger.info(f"Model Status: {model_status}")
|
| 1069 |
logger.info(f"Device: {DEVICE}")
|
| 1070 |
logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
|
| 1071 |
+
logger.info(f"Voice Library: {len(voice_library)} voices loaded")
|
| 1072 |
+
logger.info(f"Custom Voices: {len([v for v in voice_library.values() if v['type'] == 'custom'])}")
|
| 1073 |
|
| 1074 |
if not CHATTERBOX_AVAILABLE:
|
| 1075 |
logger.warning("π¨ IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
|
|
|
|
| 1076 |
|
| 1077 |
if os.getenv("SPACE_ID"):
|
| 1078 |
# Running in Hugging Face Spaces
|
|
|
|
| 1097 |
|
| 1098 |
logger.info("π FastAPI: http://localhost:8000")
|
| 1099 |
logger.info("π API Docs: http://localhost:8000/docs")
|
| 1100 |
+
logger.info("π API Endpoints:")
|
| 1101 |
+
logger.info(" - GET /api/voices")
|
| 1102 |
+
logger.info(" - POST /api/voices/create")
|
| 1103 |
+
logger.info(" - DELETE /api/voices/{voice_id}")
|
| 1104 |
+
logger.info(" - POST /api/tts/synthesize")
|
| 1105 |
|
| 1106 |
# Start Gradio
|
| 1107 |
demo = create_gradio_interface()
|