Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -9,12 +9,13 @@ import uuid
|
|
9 |
import logging
|
10 |
import requests
|
11 |
import io
|
12 |
-
|
|
|
13 |
from pathlib import Path
|
14 |
|
15 |
import gradio as gr
|
16 |
import spaces
|
17 |
-
from fastapi import FastAPI, HTTPException
|
18 |
from fastapi.responses import StreamingResponse
|
19 |
from fastapi.middleware.cors import CORSMiddleware
|
20 |
from pydantic import BaseModel
|
@@ -31,10 +32,148 @@ logger.info(f"π Running on device: {DEVICE}")
|
|
31 |
MODEL = None
|
32 |
CHATTERBOX_AVAILABLE = False
|
33 |
|
34 |
-
# Storage
|
35 |
AUDIO_DIR = "generated_audio"
|
|
|
36 |
os.makedirs(AUDIO_DIR, exist_ok=True)
|
|
|
|
|
|
|
37 |
audio_cache = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def load_chatterbox_model():
|
40 |
"""Try multiple ways to load ChatterboxTTS from Resemble AI"""
|
@@ -81,52 +220,6 @@ def load_chatterbox_model():
|
|
81 |
except Exception as e:
|
82 |
logger.warning(f"Method 3 failed with error: {e}")
|
83 |
|
84 |
-
# Method 4: Try exploring the installed package
|
85 |
-
try:
|
86 |
-
import chatterbox
|
87 |
-
import inspect
|
88 |
-
|
89 |
-
# Log what's available in the chatterbox package
|
90 |
-
logger.info(f"Chatterbox module path: {chatterbox.__file__}")
|
91 |
-
logger.info(f"Chatterbox contents: {dir(chatterbox)}")
|
92 |
-
|
93 |
-
# Try to find ChatterboxTTS class anywhere in the module
|
94 |
-
for name, obj in inspect.getmembers(chatterbox):
|
95 |
-
if name == 'ChatterboxTTS' or (inspect.isclass(obj) and 'TTS' in name):
|
96 |
-
logger.info(f"Found potential TTS class: {name}")
|
97 |
-
MODEL = obj.from_pretrained(DEVICE)
|
98 |
-
CHATTERBOX_AVAILABLE = True
|
99 |
-
return True
|
100 |
-
|
101 |
-
raise ImportError("ChatterboxTTS class not found in chatterbox package")
|
102 |
-
|
103 |
-
except ImportError as e:
|
104 |
-
logger.warning(f"Method 4 failed: {e}")
|
105 |
-
except Exception as e:
|
106 |
-
logger.warning(f"Method 4 failed with error: {e}")
|
107 |
-
|
108 |
-
# Method 5: Check if the GitHub repo was installed correctly
|
109 |
-
try:
|
110 |
-
import pkg_resources
|
111 |
-
try:
|
112 |
-
pkg_resources.get_distribution('chatterbox')
|
113 |
-
logger.info("β
Chatterbox package is installed")
|
114 |
-
except pkg_resources.DistributionNotFound:
|
115 |
-
logger.warning("β Chatterbox package not found in installed packages")
|
116 |
-
|
117 |
-
# Try to import and inspect what we got
|
118 |
-
import chatterbox
|
119 |
-
chatterbox_path = chatterbox.__path__[0] if hasattr(chatterbox, '__path__') else str(chatterbox.__file__)
|
120 |
-
logger.info(f"Chatterbox installed at: {chatterbox_path}")
|
121 |
-
|
122 |
-
# List all available modules/classes
|
123 |
-
import pkgutil
|
124 |
-
for importer, modname, ispkg in pkgutil.walk_packages(chatterbox.__path__, chatterbox.__name__ + "."):
|
125 |
-
logger.info(f"Available module: {modname}")
|
126 |
-
|
127 |
-
except Exception as e:
|
128 |
-
logger.warning(f"Package inspection failed: {e}")
|
129 |
-
|
130 |
# If we get here, the GitHub repo might have a different structure
|
131 |
logger.error("β Could not load ChatterboxTTS from Resemble AI repository")
|
132 |
logger.error("π‘ The GitHub repo might have a different structure than expected")
|
@@ -135,30 +228,6 @@ def load_chatterbox_model():
|
|
135 |
|
136 |
return False
|
137 |
|
138 |
-
def download_audio_from_url(url):
|
139 |
-
"""Download audio from URL and save to temporary file"""
|
140 |
-
try:
|
141 |
-
logger.info(f"π₯ Downloading reference audio from: {url}")
|
142 |
-
response = requests.get(url, timeout=30, headers={
|
143 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
144 |
-
})
|
145 |
-
|
146 |
-
if response.status_code == 200:
|
147 |
-
# Create temporary file
|
148 |
-
temp_file = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
149 |
-
temp_file.write(response.content)
|
150 |
-
temp_file.close()
|
151 |
-
|
152 |
-
logger.info(f"β
Audio downloaded to: {temp_file.name}")
|
153 |
-
return temp_file.name
|
154 |
-
else:
|
155 |
-
logger.error(f"β HTTP {response.status_code} when downloading audio")
|
156 |
-
return None
|
157 |
-
|
158 |
-
except Exception as e:
|
159 |
-
logger.error(f"β Error downloading audio from URL: {e}")
|
160 |
-
return None
|
161 |
-
|
162 |
def get_or_load_model():
|
163 |
"""Load ChatterboxTTS model if not already loaded"""
|
164 |
global MODEL
|
@@ -171,7 +240,6 @@ def get_or_load_model():
|
|
171 |
logger.info("β
ChatterboxTTS model loaded successfully")
|
172 |
else:
|
173 |
logger.error("β Failed to load ChatterboxTTS - using fallback")
|
174 |
-
# Create a better fallback that shows the issue
|
175 |
create_fallback_model()
|
176 |
return MODEL
|
177 |
|
@@ -230,15 +298,29 @@ def generate_id():
|
|
230 |
"""Generate unique ID"""
|
231 |
return str(uuid.uuid4())
|
232 |
|
|
|
|
|
|
|
233 |
# Pydantic models for API
|
234 |
class TTSRequest(BaseModel):
|
235 |
text: str
|
236 |
-
|
237 |
exaggeration: Optional[float] = 0.5
|
238 |
temperature: Optional[float] = 0.8
|
239 |
cfg_weight: Optional[float] = 0.5
|
240 |
seed: Optional[int] = 0
|
241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
class TTSResponse(BaseModel):
|
243 |
success: bool
|
244 |
audio_id: Optional[str] = None
|
@@ -260,14 +342,14 @@ except Exception as e:
|
|
260 |
@spaces.GPU
|
261 |
def generate_tts_audio(
|
262 |
text_input: str,
|
263 |
-
|
264 |
exaggeration_input: float,
|
265 |
temperature_input: float,
|
266 |
seed_num_input: int,
|
267 |
cfgw_input: float
|
268 |
) -> tuple[int, np.ndarray]:
|
269 |
"""
|
270 |
-
Generate TTS audio using ChatterboxTTS model
|
271 |
"""
|
272 |
current_model = get_or_load_model()
|
273 |
|
@@ -278,29 +360,25 @@ def generate_tts_audio(
|
|
278 |
set_seed(int(seed_num_input))
|
279 |
|
280 |
logger.info(f"π΅ Generating audio for: '{text_input[:50]}...'")
|
|
|
281 |
|
282 |
if not CHATTERBOX_AVAILABLE:
|
283 |
logger.warning("π¨ USING FALLBACK - Real ChatterboxTTS not found!")
|
284 |
-
logger.warning("π To fix: Upload your ChatterboxTTS package to this Space")
|
285 |
|
286 |
-
#
|
287 |
-
audio_prompt_path =
|
288 |
temp_audio_file = None
|
289 |
|
290 |
try:
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
audio_prompt_path = None
|
301 |
-
elif audio_prompt_path_input and not os.path.exists(audio_prompt_path_input):
|
302 |
-
logger.warning(f"β οΈ Audio file not found: {audio_prompt_path_input}, proceeding without reference")
|
303 |
-
audio_prompt_path = None
|
304 |
|
305 |
# Generate audio
|
306 |
wav = current_model.generate(
|
@@ -322,8 +400,8 @@ def generate_tts_audio(
|
|
322 |
logger.error(f"β Audio generation failed: {e}")
|
323 |
raise
|
324 |
finally:
|
325 |
-
# Clean up temporary file
|
326 |
-
if temp_audio_file and os.path.exists(temp_audio_file):
|
327 |
try:
|
328 |
os.unlink(temp_audio_file)
|
329 |
logger.info(f"ποΈ Cleaned up temporary file: {temp_audio_file}")
|
@@ -332,9 +410,9 @@ def generate_tts_audio(
|
|
332 |
|
333 |
# FastAPI app for API endpoints
|
334 |
app = FastAPI(
|
335 |
-
title="ChatterboxTTS API",
|
336 |
-
description="
|
337 |
-
version="
|
338 |
)
|
339 |
|
340 |
app.add_middleware(
|
@@ -349,15 +427,18 @@ app.add_middleware(
|
|
349 |
async def root():
|
350 |
"""API status endpoint"""
|
351 |
return {
|
352 |
-
"service": "ChatterboxTTS API",
|
353 |
-
"version": "
|
354 |
"status": "operational" if MODEL else "model_loading",
|
355 |
"model_loaded": MODEL is not None,
|
356 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
357 |
"device": DEVICE,
|
|
|
358 |
"message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
|
359 |
"endpoints": {
|
360 |
"synthesize": "/api/tts/synthesize",
|
|
|
|
|
361 |
"audio": "/api/audio/{audio_id}",
|
362 |
"health": "/health"
|
363 |
}
|
@@ -371,14 +452,105 @@ async def health_check():
|
|
371 |
"model_loaded": MODEL is not None,
|
372 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
373 |
"device": DEVICE,
|
|
|
374 |
"timestamp": time.time(),
|
375 |
"warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
|
376 |
}
|
377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
@app.post("/api/tts/synthesize", response_model=TTSResponse)
|
379 |
async def synthesize_speech(request: TTSRequest):
|
380 |
"""
|
381 |
-
Synthesize speech from text
|
382 |
"""
|
383 |
try:
|
384 |
if MODEL is None:
|
@@ -390,70 +562,55 @@ async def synthesize_speech(request: TTSRequest):
|
|
390 |
if len(request.text) > 500:
|
391 |
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
392 |
|
|
|
|
|
|
|
393 |
start_time = time.time()
|
394 |
|
395 |
-
#
|
396 |
-
|
397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
-
|
400 |
-
temp_audio_file = download_audio_from_url(request.audio_prompt_url)
|
401 |
-
if temp_audio_file:
|
402 |
-
audio_prompt_path = temp_audio_file
|
403 |
-
else:
|
404 |
-
logger.warning("Failed to download reference audio, proceeding without")
|
405 |
-
audio_prompt_path = None
|
406 |
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
audio_prompt_path,
|
412 |
-
request.exaggeration,
|
413 |
-
request.temperature,
|
414 |
-
request.seed,
|
415 |
-
request.cfg_weight
|
416 |
-
)
|
417 |
-
|
418 |
-
generation_time = time.time() - start_time
|
419 |
-
|
420 |
-
# Save audio file
|
421 |
-
audio_id = generate_id()
|
422 |
-
audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
|
423 |
-
sf.write(audio_path, audio_data, sample_rate)
|
424 |
-
|
425 |
-
# Cache audio info
|
426 |
-
audio_cache[audio_id] = {
|
427 |
-
"path": audio_path,
|
428 |
-
"text": request.text,
|
429 |
-
"sample_rate": sample_rate,
|
430 |
-
"duration": len(audio_data) / sample_rate,
|
431 |
-
"generated_at": time.time(),
|
432 |
-
"generation_time": generation_time,
|
433 |
-
"real_chatterbox": CHATTERBOX_AVAILABLE
|
434 |
-
}
|
435 |
-
|
436 |
-
message = "Speech synthesized successfully"
|
437 |
-
if not CHATTERBOX_AVAILABLE:
|
438 |
-
message += " (using fallback - upload ChatterboxTTS for real synthesis)"
|
439 |
-
|
440 |
-
logger.info(f"β
Audio saved: {audio_id} ({generation_time:.2f}s)")
|
441 |
-
|
442 |
-
return TTSResponse(
|
443 |
-
success=True,
|
444 |
-
audio_id=audio_id,
|
445 |
-
message=message,
|
446 |
-
sample_rate=sample_rate,
|
447 |
-
duration=len(audio_data) / sample_rate
|
448 |
-
)
|
449 |
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
457 |
|
458 |
except HTTPException:
|
459 |
raise
|
@@ -463,9 +620,7 @@ async def synthesize_speech(request: TTSRequest):
|
|
463 |
|
464 |
@app.get("/api/audio/{audio_id}")
|
465 |
async def get_audio(audio_id: str):
|
466 |
-
"""
|
467 |
-
Download generated audio file
|
468 |
-
"""
|
469 |
if audio_id not in audio_cache:
|
470 |
raise HTTPException(status_code=404, detail="Audio not found")
|
471 |
|
@@ -489,9 +644,7 @@ async def get_audio(audio_id: str):
|
|
489 |
|
490 |
@app.get("/api/audio/{audio_id}/info")
|
491 |
async def get_audio_info(audio_id: str):
|
492 |
-
"""
|
493 |
-
Get audio file information
|
494 |
-
"""
|
495 |
if audio_id not in audio_cache:
|
496 |
raise HTTPException(status_code=404, detail="Audio not found")
|
497 |
|
@@ -499,14 +652,13 @@ async def get_audio_info(audio_id: str):
|
|
499 |
|
500 |
@app.get("/api/audio")
|
501 |
async def list_audio():
|
502 |
-
"""
|
503 |
-
List all generated audio files
|
504 |
-
"""
|
505 |
return {
|
506 |
"audio_files": [
|
507 |
{
|
508 |
"audio_id": audio_id,
|
509 |
"text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
|
|
|
510 |
"duration": info["duration"],
|
511 |
"generated_at": info["generated_at"],
|
512 |
"real_chatterbox": info.get("real_chatterbox", False)
|
@@ -518,9 +670,135 @@ async def list_audio():
|
|
518 |
|
519 |
# Gradio interface
|
520 |
def create_gradio_interface():
|
521 |
-
"""Create Gradio interface with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
522 |
|
523 |
-
with gr.Blocks(title="ChatterboxTTS", theme=gr.themes.Soft()) as demo:
|
524 |
|
525 |
# Status indicator at the top
|
526 |
if CHATTERBOX_AVAILABLE:
|
@@ -537,141 +815,223 @@ def create_gradio_interface():
|
|
537 |
""")
|
538 |
|
539 |
gr.Markdown("""
|
540 |
-
# π΅ ChatterboxTTS
|
541 |
|
542 |
-
|
543 |
""")
|
544 |
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
559 |
|
560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
561 |
|
562 |
-
|
563 |
-
""
|
564 |
-
|
565 |
-
|
566 |
-
with gr.Column():
|
567 |
-
text_input = gr.Textbox(
|
568 |
-
value="Hello, this is ChatterboxTTS. I can generate natural-sounding speech from any text you provide.",
|
569 |
-
label="Text to synthesize (max 300 characters)",
|
570 |
-
max_lines=5,
|
571 |
-
placeholder="Enter your text here..."
|
572 |
-
)
|
573 |
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
info="URL will be downloaded automatically, or use local file path"
|
579 |
-
)
|
580 |
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
step=0.05,
|
585 |
-
label="Exaggeration",
|
586 |
-
value=0.5,
|
587 |
-
info="Controls expressiveness (0.5 = neutral)"
|
588 |
-
)
|
589 |
-
|
590 |
-
cfg_weight = gr.Slider(
|
591 |
-
0.2, 1,
|
592 |
-
step=0.05,
|
593 |
-
label="CFG Weight",
|
594 |
-
value=0.5,
|
595 |
-
info="Controls pace and clarity"
|
596 |
-
)
|
597 |
|
598 |
-
|
599 |
-
temperature = gr.Slider(
|
600 |
-
0.05, 5,
|
601 |
-
step=0.05,
|
602 |
-
label="Temperature",
|
603 |
-
value=0.8,
|
604 |
-
info="Controls randomness"
|
605 |
-
)
|
606 |
-
|
607 |
-
seed = gr.Number(
|
608 |
-
value=0,
|
609 |
-
label="Seed (0 = random)",
|
610 |
-
info="Set to non-zero for reproducible results"
|
611 |
-
)
|
612 |
|
613 |
-
|
|
|
|
|
|
|
|
|
|
|
614 |
|
615 |
-
|
616 |
-
|
|
|
|
|
|
|
|
|
617 |
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
|
623 |
-
|
624 |
-
def generate_speech_ui(text, prompt_url, exag, temp, seed_val, cfg):
|
625 |
-
"""Generate speech from UI"""
|
626 |
-
try:
|
627 |
-
if not text.strip():
|
628 |
-
return None, "β Please enter some text"
|
629 |
|
630 |
-
|
631 |
-
|
|
|
|
|
|
|
632 |
|
633 |
-
|
|
|
|
|
|
|
634 |
|
635 |
-
#
|
636 |
-
|
637 |
-
text, prompt_url, exag, temp, int(seed_val), cfg
|
638 |
-
)
|
639 |
|
640 |
-
|
641 |
-
|
|
|
|
|
|
|
|
|
642 |
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
β±οΈ Generation time: {generation_time:.2f}s
|
647 |
-
π΅ Audio duration: {duration:.2f}s
|
648 |
-
π Sample rate: {sample_rate} Hz
|
649 |
-
π Audio samples: {len(audio_data):,}
|
650 |
-
"""
|
651 |
-
else:
|
652 |
-
status = f"""β οΈ Fallback audio generated (beep sound)
|
653 |
-
|
654 |
-
π¨ This is NOT real speech synthesis!
|
655 |
-
π¦ Upload ChatterboxTTS package for real synthesis
|
656 |
-
β±οΈ Generation time: {generation_time:.2f}s
|
657 |
-
π΅ Audio duration: {duration:.2f}s
|
658 |
-
|
659 |
-
π‘ To fix: Upload your ChatterboxTTS files to this Space
|
660 |
-
"""
|
661 |
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
|
|
667 |
|
|
|
668 |
generate_btn.click(
|
669 |
fn=generate_speech_ui,
|
670 |
-
inputs=[text_input,
|
671 |
outputs=[audio_output, status_text]
|
672 |
)
|
673 |
|
674 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
675 |
model_status = "β
Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "β οΈ Fallback Model (Beep Sounds)"
|
676 |
chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
|
677 |
|
@@ -679,26 +1039,23 @@ def create_gradio_interface():
|
|
679 |
### π System Status
|
680 |
- **Model**: {model_status}
|
681 |
- **Device**: {DEVICE}
|
682 |
-
- **Generated Files**: {len(audio_cache)}
|
683 |
- **ChatterboxTTS**: {chatterbox_status}
|
|
|
|
|
|
|
684 |
|
685 |
{'''### π Production Ready!
|
686 |
-
Your ChatterboxTTS model is loaded
|
687 |
**You're hearing beep sounds because ChatterboxTTS isn't loaded.**
|
688 |
|
689 |
-
|
690 |
-
1. Upload your ChatterboxTTS package to this Space
|
691 |
-
2. Ensure proper directory structure with `__init__.py` files
|
692 |
-
3. Restart the Space
|
693 |
-
|
694 |
-
The current fallback generates beeps to indicate missing package.'''}
|
695 |
""")
|
696 |
|
697 |
return demo
|
698 |
|
699 |
# Main execution
|
700 |
if __name__ == "__main__":
|
701 |
-
logger.info("π Starting ChatterboxTTS Service...")
|
702 |
|
703 |
# Model status
|
704 |
if CHATTERBOX_AVAILABLE and MODEL:
|
@@ -711,10 +1068,11 @@ if __name__ == "__main__":
|
|
711 |
logger.info(f"Model Status: {model_status}")
|
712 |
logger.info(f"Device: {DEVICE}")
|
713 |
logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
|
|
|
|
|
714 |
|
715 |
if not CHATTERBOX_AVAILABLE:
|
716 |
logger.warning("π¨ IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
|
717 |
-
logger.warning("π Expected location: ./chatterbox/src/chatterbox/tts.py")
|
718 |
|
719 |
if os.getenv("SPACE_ID"):
|
720 |
# Running in Hugging Face Spaces
|
@@ -739,6 +1097,11 @@ if __name__ == "__main__":
|
|
739 |
|
740 |
logger.info("π FastAPI: http://localhost:8000")
|
741 |
logger.info("π API Docs: http://localhost:8000/docs")
|
|
|
|
|
|
|
|
|
|
|
742 |
|
743 |
# Start Gradio
|
744 |
demo = create_gradio_interface()
|
|
|
9 |
import logging
|
10 |
import requests
|
11 |
import io
|
12 |
+
import json
|
13 |
+
from typing import Optional, Dict, Any, List
|
14 |
from pathlib import Path
|
15 |
|
16 |
import gradio as gr
|
17 |
import spaces
|
18 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File
|
19 |
from fastapi.responses import StreamingResponse
|
20 |
from fastapi.middleware.cors import CORSMiddleware
|
21 |
from pydantic import BaseModel
|
|
|
32 |
MODEL = None
|
33 |
CHATTERBOX_AVAILABLE = False
|
34 |
|
35 |
+
# Storage directories
|
36 |
AUDIO_DIR = "generated_audio"
|
37 |
+
VOICES_DIR = "custom_voices"
|
38 |
os.makedirs(AUDIO_DIR, exist_ok=True)
|
39 |
+
os.makedirs(VOICES_DIR, exist_ok=True)
|
40 |
+
|
41 |
+
# Voice storage
|
42 |
audio_cache = {}
|
43 |
+
voice_library = {}
|
44 |
+
|
45 |
+
# Default/Built-in voices
|
46 |
+
BUILTIN_VOICES = {
|
47 |
+
"female_default": {
|
48 |
+
"voice_id": "female_default",
|
49 |
+
"name": "Female Default",
|
50 |
+
"description": "Professional female voice",
|
51 |
+
"audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
|
52 |
+
"type": "builtin",
|
53 |
+
"created_at": "2024-01-01T00:00:00Z"
|
54 |
+
},
|
55 |
+
"male_professional": {
|
56 |
+
"voice_id": "male_professional",
|
57 |
+
"name": "Male Professional",
|
58 |
+
"description": "Confident male voice",
|
59 |
+
"audio_url": "https://storage.googleapis.com/chatterbox-demo-samples/prompts/male_professional.flac",
|
60 |
+
"type": "builtin",
|
61 |
+
"created_at": "2024-01-01T00:00:00Z"
|
62 |
+
}
|
63 |
+
}
|
64 |
+
|
65 |
+
def load_voice_library():
    """Rebuild the in-memory voice library from built-ins plus saved custom voices.

    The global ``voice_library`` is reset to a copy of ``BUILTIN_VOICES`` and
    then extended with the entries stored in ``<VOICES_DIR>/voices.json``, if
    that file exists. Any failure while reading or parsing the file is logged
    and leaves the library containing only the built-in voices.
    """
    global voice_library
    voice_library = BUILTIN_VOICES.copy()

    voices_json_path = os.path.join(VOICES_DIR, "voices.json")
    if not os.path.exists(voices_json_path):
        return

    try:
        with open(voices_json_path, 'r', encoding='utf-8') as f:
            custom_voices = json.load(f)

        # The file is expected to hold a {voice_id: voice_entry} mapping; reject
        # anything else instead of letting dict.update() misinterpret it.
        if not isinstance(custom_voices, dict):
            raise ValueError("voices.json must contain a JSON object")

        voice_library.update(custom_voices)
        logger.info(f"✅ Loaded {len(custom_voices)} custom voices from disk")
    except Exception as e:
        logger.error(f"❌ Error loading voice library: {e}")
|
79 |
+
|
80 |
+
def save_voice_library():
    """Persist the custom (non-builtin) voices to ``<VOICES_DIR>/voices.json``.

    Built-in entries are filtered out because they are re-created from
    ``BUILTIN_VOICES`` on every load. Errors are logged, not raised, so a
    failed save never aborts the calling request.
    """
    try:
        # Only save custom voices (not builtin)
        custom_voices = {
            voice_id: entry
            for voice_id, entry in voice_library.items()
            if entry.get("type") != "builtin"
        }

        voices_json_path = os.path.join(VOICES_DIR, "voices.json")
        # Write to a sibling file first and swap it in, so an interrupted
        # write cannot leave a truncated voices.json behind.
        tmp_json_path = voices_json_path + ".tmp"
        with open(tmp_json_path, 'w', encoding='utf-8') as f:
            json.dump(custom_voices, f, ensure_ascii=False, indent=2)
        os.replace(tmp_json_path, voices_json_path)

        logger.info(f"✅ Saved {len(custom_voices)} custom voices to disk")
    except Exception as e:
        logger.error(f"❌ Error saving voice library: {e}")
|
92 |
+
|
93 |
+
def create_voice_from_audio(audio_file, voice_name, voice_description="Custom voice"):
    """Create a new custom voice from reference audio and register it.

    Args:
        audio_file: One of
            * a ``(sample_rate, audio_data)`` tuple (Gradio audio format),
            * a path to an audio file on disk (``str`` / ``os.PathLike``),
            * raw sample data, written with a default rate of 22050 Hz.
        voice_name: Display name stored in the voice entry.
        voice_description: Free-text description stored in the voice entry.

    Returns:
        ``(voice_id, voice_entry)`` on success, ``(None, None)`` on any error
        (the error is logged).
    """
    try:
        voice_id = f"voice_{int(time.time())}_{uuid.uuid4().hex[:8]}"

        # Save audio file
        audio_filename = f"{voice_id}.wav"
        audio_path = os.path.join(VOICES_DIR, audio_filename)

        # Convert and save audio
        if isinstance(audio_file, tuple):
            # Gradio audio format (sample_rate, audio_data)
            sample_rate, audio_data = audio_file
            sf.write(audio_path, audio_data, sample_rate)
        elif isinstance(audio_file, (str, os.PathLike)):
            # A path was passed (e.g. an uploaded file spooled to disk): decode
            # it and re-encode as WAV. Passing the path itself to sf.write as
            # "data" would fail.
            # NOTE(review): assumes `sf` is the soundfile module, whose read()
            # returns (data, samplerate) - confirm against the file's imports.
            audio_data, sample_rate = sf.read(audio_file)
            sf.write(audio_path, audio_data, sample_rate)
        else:
            # Raw sample data without a known rate
            sf.write(audio_path, audio_file, 22050)  # Default sample rate

        # Create voice entry
        voice_entry = {
            "voice_id": voice_id,
            "name": voice_name,
            "description": voice_description,
            "audio_path": audio_path,
            "type": "custom",
            # The trailing "Z" denotes UTC, so format UTC rather than local time.
            "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        }

        # Add to voice library
        voice_library[voice_id] = voice_entry
        save_voice_library()

        logger.info(f"✅ Created voice: {voice_name} ({voice_id})")
        return voice_id, voice_entry

    except Exception as e:
        logger.error(f"❌ Error creating voice: {e}")
        return None, None
|
131 |
+
|
132 |
+
def download_audio_from_url(url):
    """Download audio from a URL into a temporary ``.flac`` file.

    Args:
        url: HTTP(S) location of the reference audio.

    Returns:
        Path of the temporary file on success, or ``None`` when the server
        does not answer with HTTP 200 or any error occurs (both are logged).
        The file is created with ``delete=False``, so the caller is
        responsible for removing it.
    """
    try:
        logger.info(f"📥 Downloading reference audio from: {url}")
        response = requests.get(url, timeout=30, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        if response.status_code != 200:
            logger.error(f"❌ HTTP {response.status_code} when downloading audio")
            return None

        # The context manager closes the handle even if the write fails.
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_file:
            temp_file.write(response.content)

        logger.info(f"✅ Audio downloaded to: {temp_file.name}")
        return temp_file.name

    except Exception as e:
        logger.error(f"❌ Error downloading audio from URL: {e}")
        return None
|
155 |
+
|
156 |
+
def get_voice_audio_path(voice_id):
    """Resolve a voice ID to a local audio file path.

    Custom voices return their stored ``audio_path`` when that file exists.
    Built-in voices are fetched through ``download_audio_from_url`` and the
    value that helper returns is passed straight back.

    Returns:
        A filesystem path, or ``None`` when the voice is unknown, its file is
        missing, or the entry has neither a usable path nor a URL.
    """
    voice_info = voice_library.get(voice_id)
    if voice_info is None:
        return None

    voice_type = voice_info.get("type")

    # Custom voice backed by a local file
    if voice_type == "custom" and "audio_path" in voice_info:
        audio_path = voice_info["audio_path"]
        if os.path.exists(audio_path):
            return audio_path
        logger.warning(f"⚠️ Voice audio file not found: {audio_path}")
        return None

    # Built-in voice backed by a remote URL
    if voice_type == "builtin" and "audio_url" in voice_info:
        return download_audio_from_url(voice_info["audio_url"])

    return None
|
177 |
|
178 |
def load_chatterbox_model():
|
179 |
"""Try multiple ways to load ChatterboxTTS from Resemble AI"""
|
|
|
220 |
except Exception as e:
|
221 |
logger.warning(f"Method 3 failed with error: {e}")
|
222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
# If we get here, the GitHub repo might have a different structure
|
224 |
logger.error("β Could not load ChatterboxTTS from Resemble AI repository")
|
225 |
logger.error("π‘ The GitHub repo might have a different structure than expected")
|
|
|
228 |
|
229 |
return False
|
230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
def get_or_load_model():
|
232 |
"""Load ChatterboxTTS model if not already loaded"""
|
233 |
global MODEL
|
|
|
240 |
logger.info("β
ChatterboxTTS model loaded successfully")
|
241 |
else:
|
242 |
logger.error("β Failed to load ChatterboxTTS - using fallback")
|
|
|
243 |
create_fallback_model()
|
244 |
return MODEL
|
245 |
|
|
|
298 |
"""Generate unique ID"""
|
299 |
return str(uuid.uuid4())
|
300 |
|
301 |
+
# Load voice library at startup
|
302 |
+
load_voice_library()
|
303 |
+
|
304 |
# Pydantic models for API
|
305 |
class TTSRequest(BaseModel):
    """Request body for the speech synthesis endpoint."""

    # Text to be spoken.
    text: str
    # Key into the voice library; defaults to the built-in female voice.
    voice_id: Optional[str] = "female_default"
    # Generation parameters forwarded to the TTS call.
    exaggeration: Optional[float] = 0.5
    temperature: Optional[float] = 0.8
    cfg_weight: Optional[float] = 0.5
    seed: Optional[int] = 0
|
312 |
|
313 |
+
class VoiceCreateRequest(BaseModel):
    """Metadata supplied when registering a new custom voice."""

    # Display name of the new voice.
    voice_name: str
    # Optional free-text description.
    voice_description: Optional[str] = "Custom voice"
|
316 |
+
|
317 |
+
class VoiceInfo(BaseModel):
    """Public description of one voice-library entry."""

    # Identifier used as the key in the voice library.
    voice_id: str
    # Human-readable name and description.
    name: str
    description: str
    # "builtin" or "custom", as stored on the library entry.
    type: str
    # Creation timestamp string copied from the library entry.
    created_at: str
|
323 |
+
|
324 |
class TTSResponse(BaseModel):
|
325 |
success: bool
|
326 |
audio_id: Optional[str] = None
|
|
|
342 |
@spaces.GPU
|
343 |
def generate_tts_audio(
|
344 |
text_input: str,
|
345 |
+
voice_id: str,
|
346 |
exaggeration_input: float,
|
347 |
temperature_input: float,
|
348 |
seed_num_input: int,
|
349 |
cfgw_input: float
|
350 |
) -> tuple[int, np.ndarray]:
|
351 |
"""
|
352 |
+
Generate TTS audio using ChatterboxTTS model with voice ID
|
353 |
"""
|
354 |
current_model = get_or_load_model()
|
355 |
|
|
|
360 |
set_seed(int(seed_num_input))
|
361 |
|
362 |
logger.info(f"π΅ Generating audio for: '{text_input[:50]}...'")
|
363 |
+
logger.info(f"π Using voice: {voice_id}")
|
364 |
|
365 |
if not CHATTERBOX_AVAILABLE:
|
366 |
logger.warning("π¨ USING FALLBACK - Real ChatterboxTTS not found!")
|
|
|
367 |
|
368 |
+
# Get audio path for the voice
|
369 |
+
audio_prompt_path = get_voice_audio_path(voice_id)
|
370 |
temp_audio_file = None
|
371 |
|
372 |
try:
|
373 |
+
if audio_prompt_path and audio_prompt_path.startswith('/tmp/'):
|
374 |
+
# It's a temporary file from URL download
|
375 |
+
temp_audio_file = audio_prompt_path
|
376 |
+
|
377 |
+
if audio_prompt_path:
|
378 |
+
voice_name = voice_library.get(voice_id, {}).get("name", voice_id)
|
379 |
+
logger.info(f"β
Using voice '{voice_name}' audio: {audio_prompt_path}")
|
380 |
+
else:
|
381 |
+
logger.warning(f"β οΈ Could not load audio for voice {voice_id}, using default")
|
|
|
|
|
|
|
|
|
382 |
|
383 |
# Generate audio
|
384 |
wav = current_model.generate(
|
|
|
400 |
logger.error(f"β Audio generation failed: {e}")
|
401 |
raise
|
402 |
finally:
|
403 |
+
# Clean up temporary file (only if it's a downloaded URL)
|
404 |
+
if temp_audio_file and temp_audio_file.startswith('/tmp/') and os.path.exists(temp_audio_file):
|
405 |
try:
|
406 |
os.unlink(temp_audio_file)
|
407 |
logger.info(f"ποΈ Cleaned up temporary file: {temp_audio_file}")
|
|
|
410 |
|
411 |
# FastAPI app for API endpoints
|
412 |
app = FastAPI(
|
413 |
+
title="ChatterboxTTS Voice Manager API",
|
414 |
+
description="Advanced text-to-speech with voice cloning and management",
|
415 |
+
version="2.0.0"
|
416 |
)
|
417 |
|
418 |
app.add_middleware(
|
|
|
427 |
async def root():
|
428 |
"""API status endpoint"""
|
429 |
return {
|
430 |
+
"service": "ChatterboxTTS Voice Manager API",
|
431 |
+
"version": "2.0.0",
|
432 |
"status": "operational" if MODEL else "model_loading",
|
433 |
"model_loaded": MODEL is not None,
|
434 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
435 |
"device": DEVICE,
|
436 |
+
"voices_available": len(voice_library),
|
437 |
"message": "Real ChatterboxTTS loaded" if CHATTERBOX_AVAILABLE else "Using fallback - upload ChatterboxTTS package",
|
438 |
"endpoints": {
|
439 |
"synthesize": "/api/tts/synthesize",
|
440 |
+
"voices": "/api/voices",
|
441 |
+
"create_voice": "/api/voices/create",
|
442 |
"audio": "/api/audio/{audio_id}",
|
443 |
"health": "/health"
|
444 |
}
|
|
|
452 |
"model_loaded": MODEL is not None,
|
453 |
"real_chatterbox": CHATTERBOX_AVAILABLE,
|
454 |
"device": DEVICE,
|
455 |
+
"voices_total": len(voice_library),
|
456 |
"timestamp": time.time(),
|
457 |
"warning": None if CHATTERBOX_AVAILABLE else "Using fallback model - upload ChatterboxTTS for production"
|
458 |
}
|
459 |
|
460 |
+
@app.get("/api/voices")
async def get_voices():
    """Return every voice in the library together with per-type counts."""
    voices = [
        VoiceInfo(
            voice_id=voice_id,
            name=entry["name"],
            description=entry["description"],
            type=entry["type"],
            created_at=entry["created_at"]
        )
        for voice_id, entry in voice_library.items()
    ]

    # Tally the two known voice types for the summary fields.
    builtin_count = sum(1 for voice in voices if voice.type == "builtin")
    custom_count = sum(1 for voice in voices if voice.type == "custom")

    return {
        "voices": voices,
        "total": len(voices),
        "builtin": builtin_count,
        "custom": custom_count
    }
|
479 |
+
|
480 |
+
@app.post("/api/voices/create")
async def create_voice_api(
    voice_name: str,
    voice_description: str = "Custom voice",
    audio_file: UploadFile = File(...)
):
    """Create a new voice from uploaded audio.

    The upload is spooled to a temporary file, handed to
    ``create_voice_from_audio`` and the temporary file is removed again
    whether or not creation succeeds.

    Raises:
        HTTPException: 500 when the voice could not be created or an
            unexpected error occurs.
    """
    temp_path = None
    try:
        # Read uploaded file
        audio_data = await audio_file.read()

        # Save to temporary file for processing
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(audio_data)

        # Create voice
        voice_id, voice_entry = create_voice_from_audio(
            temp_path,
            voice_name,
            voice_description
        )

        if not voice_id:
            raise HTTPException(status_code=500, detail="Failed to create voice")

        return {
            "success": True,
            "voice_id": voice_id,
            "message": f"Voice '{voice_name}' created successfully",
            "voice_info": voice_entry
        }

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged instead of being
        # re-wrapped by the generic handler below.
        raise
    except Exception as e:
        logger.error(f"❌ Voice creation failed: {e}")
        raise HTTPException(status_code=500, detail=f"Voice creation failed: {str(e)}")
    finally:
        # Cleanup temp file on every path, including failures.
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {temp_path}: {cleanup_error}")
|
519 |
+
|
520 |
+
@app.delete("/api/voices/{voice_id}")
async def delete_voice(voice_id: str):
    """Remove a custom voice from the library along with its audio sample.

    Responds with 404 when the id is not in ``voice_library``, 400 when the
    entry's type is ``"builtin"``, and 500 when removing the file or saving
    the library raises.
    """
    if voice_id not in voice_library:
        raise HTTPException(status_code=404, detail="Voice not found")

    entry = voice_library[voice_id]

    is_builtin = entry.get("type") == "builtin"
    if is_builtin:
        raise HTTPException(status_code=400, detail="Cannot delete builtin voices")

    try:
        # Drop the stored sample first, if the entry points at one that exists.
        if "audio_path" in entry:
            sample_path = entry["audio_path"]
            if os.path.exists(sample_path):
                os.unlink(sample_path)

        # Remember the name before the entry disappears, then persist.
        removed_name = entry["name"]
        del voice_library[voice_id]
        save_voice_library()

        outcome = {
            "success": True,
            "message": f"Voice '{removed_name}' deleted successfully"
        }
        return outcome

    except Exception as e:
        logger.error(f"β Voice deletion failed: {e}")
        raise HTTPException(status_code=500, detail=f"Voice deletion failed: {str(e)}")
|
549 |
+
|
550 |
@app.post("/api/tts/synthesize", response_model=TTSResponse)
|
551 |
async def synthesize_speech(request: TTSRequest):
|
552 |
"""
|
553 |
+
Synthesize speech from text using voice ID
|
554 |
"""
|
555 |
try:
|
556 |
if MODEL is None:
|
|
|
562 |
if len(request.text) > 500:
|
563 |
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
564 |
|
565 |
+
if request.voice_id not in voice_library:
|
566 |
+
raise HTTPException(status_code=404, detail=f"Voice '{request.voice_id}' not found")
|
567 |
+
|
568 |
start_time = time.time()
|
569 |
|
570 |
+
# Generate audio using voice ID
|
571 |
+
sample_rate, audio_data = generate_tts_audio(
|
572 |
+
request.text,
|
573 |
+
request.voice_id,
|
574 |
+
request.exaggeration,
|
575 |
+
request.temperature,
|
576 |
+
request.seed,
|
577 |
+
request.cfg_weight
|
578 |
+
)
|
579 |
|
580 |
+
generation_time = time.time() - start_time
|
|
|
|
|
|
|
|
|
|
|
|
|
581 |
|
582 |
+
# Save audio file
|
583 |
+
audio_id = generate_id()
|
584 |
+
audio_path = os.path.join(AUDIO_DIR, f"{audio_id}.wav")
|
585 |
+
sf.write(audio_path, audio_data, sample_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
586 |
|
587 |
+
# Cache audio info
|
588 |
+
voice_name = voice_library[request.voice_id]["name"]
|
589 |
+
audio_cache[audio_id] = {
|
590 |
+
"path": audio_path,
|
591 |
+
"text": request.text,
|
592 |
+
"voice_id": request.voice_id,
|
593 |
+
"voice_name": voice_name,
|
594 |
+
"sample_rate": sample_rate,
|
595 |
+
"duration": len(audio_data) / sample_rate,
|
596 |
+
"generated_at": time.time(),
|
597 |
+
"generation_time": generation_time,
|
598 |
+
"real_chatterbox": CHATTERBOX_AVAILABLE
|
599 |
+
}
|
600 |
+
|
601 |
+
message = f"Speech synthesized successfully using voice '{voice_name}'"
|
602 |
+
if not CHATTERBOX_AVAILABLE:
|
603 |
+
message += " (using fallback - upload ChatterboxTTS for real synthesis)"
|
604 |
+
|
605 |
+
logger.info(f"β
Audio saved: {audio_id} ({generation_time:.2f}s) with voice '{voice_name}'")
|
606 |
+
|
607 |
+
return TTSResponse(
|
608 |
+
success=True,
|
609 |
+
audio_id=audio_id,
|
610 |
+
message=message,
|
611 |
+
sample_rate=sample_rate,
|
612 |
+
duration=len(audio_data) / sample_rate
|
613 |
+
)
|
614 |
|
615 |
except HTTPException:
|
616 |
raise
|
|
|
620 |
|
621 |
@app.get("/api/audio/{audio_id}")
|
622 |
async def get_audio(audio_id: str):
|
623 |
+
"""Download generated audio file"""
|
|
|
|
|
624 |
if audio_id not in audio_cache:
|
625 |
raise HTTPException(status_code=404, detail="Audio not found")
|
626 |
|
|
|
644 |
|
645 |
@app.get("/api/audio/{audio_id}/info")
|
646 |
async def get_audio_info(audio_id: str):
|
647 |
+
"""Get audio file information"""
|
|
|
|
|
648 |
if audio_id not in audio_cache:
|
649 |
raise HTTPException(status_code=404, detail="Audio not found")
|
650 |
|
|
|
652 |
|
653 |
@app.get("/api/audio")
|
654 |
async def list_audio():
|
655 |
+
"""List all generated audio files"""
|
|
|
|
|
656 |
return {
|
657 |
"audio_files": [
|
658 |
{
|
659 |
"audio_id": audio_id,
|
660 |
"text": info["text"][:50] + "..." if len(info["text"]) > 50 else info["text"],
|
661 |
+
"voice_name": info.get("voice_name", "Unknown"),
|
662 |
"duration": info["duration"],
|
663 |
"generated_at": info["generated_at"],
|
664 |
"real_chatterbox": info.get("real_chatterbox", False)
|
|
|
670 |
|
671 |
# Gradio interface
|
672 |
def create_gradio_interface():
|
673 |
+
"""Create Gradio interface with voice management"""
|
674 |
+
|
675 |
+
def get_voice_choices():
    """Build the ``(label, voice_id)`` pairs offered by the voice dropdown.

    The label is a badge (one for builtin voices, another for everything
    else) followed by the voice's name and description.
    """
    def label_for(entry):
        # Pick the badge from the entry's type, then append name/description.
        badge = "π§" if entry["type"] == "builtin" else "π"
        return f"{badge} {entry['name']} - {entry['description']}"

    return [(label_for(entry), key) for key, entry in voice_library.items()]
|
682 |
+
|
683 |
+
def refresh_voice_choices():
    """Return a Gradio update that reloads the dropdown choices from the library."""
    current_choices = get_voice_choices()
    return gr.update(choices=current_choices)
|
686 |
+
|
687 |
+
def create_voice_ui(voice_name, voice_description, audio_file):
|
688 |
+
"""Create voice from UI"""
|
689 |
+
try:
|
690 |
+
if not voice_name.strip():
|
691 |
+
return "β Please enter a voice name", gr.update()
|
692 |
+
|
693 |
+
if audio_file is None:
|
694 |
+
return "β Please upload an audio file", gr.update()
|
695 |
+
|
696 |
+
voice_id, voice_entry = create_voice_from_audio(
|
697 |
+
audio_file,
|
698 |
+
voice_name.strip(),
|
699 |
+
voice_description.strip() or "Custom voice"
|
700 |
+
)
|
701 |
+
|
702 |
+
if voice_id:
|
703 |
+
updated_choices = get_voice_choices()
|
704 |
+
return (
|
705 |
+
f"β
Voice '{voice_name}' created successfully!\n"
|
706 |
+
f"π Voice ID: {voice_id}\n"
|
707 |
+
f"π Audio saved and ready to use\n"
|
708 |
+
f"π Available in voice selection dropdown",
|
709 |
+
gr.update(choices=updated_choices, value=voice_id)
|
710 |
+
)
|
711 |
+
else:
|
712 |
+
return "β Failed to create voice", gr.update()
|
713 |
+
|
714 |
+
except Exception as e:
|
715 |
+
logger.error(f"UI voice creation failed: {e}")
|
716 |
+
return f"β Voice creation failed: {str(e)}", gr.update()
|
717 |
+
|
718 |
+
def generate_speech_ui(text, voice_id, exag, temp, seed_val, cfg):
|
719 |
+
"""Generate speech from UI using voice ID"""
|
720 |
+
try:
|
721 |
+
if not text.strip():
|
722 |
+
return None, "β Please enter some text"
|
723 |
+
|
724 |
+
if len(text) > 300:
|
725 |
+
return None, "β Text too long (max 300 characters)"
|
726 |
+
|
727 |
+
if not voice_id or voice_id not in voice_library:
|
728 |
+
return None, "β Please select a valid voice"
|
729 |
+
|
730 |
+
start_time = time.time()
|
731 |
+
|
732 |
+
# Generate audio using voice ID
|
733 |
+
sample_rate, audio_data = generate_tts_audio(
|
734 |
+
text, voice_id, exag, temp, int(seed_val), cfg
|
735 |
+
)
|
736 |
+
|
737 |
+
generation_time = time.time() - start_time
|
738 |
+
duration = len(audio_data) / sample_rate
|
739 |
+
|
740 |
+
voice_name = voice_library[voice_id]["name"]
|
741 |
+
voice_type = voice_library[voice_id]["type"]
|
742 |
+
|
743 |
+
if CHATTERBOX_AVAILABLE:
|
744 |
+
status = f"""β
Real ChatterboxTTS synthesis completed!
|
745 |
+
|
746 |
+
π Voice: {voice_name} ({voice_type})
|
747 |
+
β±οΈ Generation time: {generation_time:.2f}s
|
748 |
+
π΅ Audio duration: {duration:.2f}s
|
749 |
+
π Sample rate: {sample_rate} Hz
|
750 |
+
π Audio samples: {len(audio_data):,}
|
751 |
+
"""
|
752 |
+
else:
|
753 |
+
status = f"""β οΈ Fallback audio generated (beep sound)
|
754 |
+
|
755 |
+
π¨ This is NOT real speech synthesis!
|
756 |
+
π Voice: {voice_name} ({voice_type})
|
757 |
+
π¦ Upload ChatterboxTTS package for real synthesis
|
758 |
+
β±οΈ Generation time: {generation_time:.2f}s
|
759 |
+
π΅ Audio duration: {duration:.2f}s
|
760 |
+
|
761 |
+
π‘ To fix: Upload your ChatterboxTTS files to this Space
|
762 |
+
"""
|
763 |
+
|
764 |
+
return (sample_rate, audio_data), status
|
765 |
+
|
766 |
+
except Exception as e:
|
767 |
+
logger.error(f"UI generation failed: {e}")
|
768 |
+
return None, f"β Generation failed: {str(e)}"
|
769 |
+
|
770 |
+
def delete_voice_ui(voice_id):
|
771 |
+
"""Delete voice from UI"""
|
772 |
+
try:
|
773 |
+
if not voice_id or voice_id not in voice_library:
|
774 |
+
return "β Please select a voice to delete", gr.update()
|
775 |
+
|
776 |
+
voice_info = voice_library[voice_id]
|
777 |
+
|
778 |
+
if voice_info.get("type") == "builtin":
|
779 |
+
return "β Cannot delete builtin voices", gr.update()
|
780 |
+
|
781 |
+
voice_name = voice_info["name"]
|
782 |
+
|
783 |
+
# Delete audio file
|
784 |
+
if "audio_path" in voice_info and os.path.exists(voice_info["audio_path"]):
|
785 |
+
os.unlink(voice_info["audio_path"])
|
786 |
+
|
787 |
+
# Remove from library
|
788 |
+
del voice_library[voice_id]
|
789 |
+
save_voice_library()
|
790 |
+
|
791 |
+
updated_choices = get_voice_choices()
|
792 |
+
return (
|
793 |
+
f"β
Voice '{voice_name}' deleted successfully",
|
794 |
+
gr.update(choices=updated_choices, value=updated_choices[0][1] if updated_choices else None)
|
795 |
+
)
|
796 |
+
|
797 |
+
except Exception as e:
|
798 |
+
logger.error(f"UI voice deletion failed: {e}")
|
799 |
+
return f"β Voice deletion failed: {str(e)}", gr.update()
|
800 |
|
801 |
+
with gr.Blocks(title="ChatterboxTTS Voice Manager", theme=gr.themes.Soft()) as demo:
|
802 |
|
803 |
# Status indicator at the top
|
804 |
if CHATTERBOX_AVAILABLE:
|
|
|
815 |
""")
|
816 |
|
817 |
gr.Markdown("""
|
818 |
+
# π΅ ChatterboxTTS Voice Manager
|
819 |
|
820 |
+
**Advanced text-to-speech with custom voice cloning and voice library management**
|
821 |
""")
|
822 |
|
823 |
+
with gr.Tabs():
|
824 |
+
# Text-to-Speech Tab
|
825 |
+
with gr.TabItem("π΅ Generate Speech"):
|
826 |
+
with gr.Row():
|
827 |
+
with gr.Column():
|
828 |
+
text_input = gr.Textbox(
|
829 |
+
value="Hello, this is ChatterboxTTS with custom voice cloning. I can speak in any voice you train me with!",
|
830 |
+
label="Text to synthesize (max 300 characters)",
|
831 |
+
max_lines=5,
|
832 |
+
placeholder="Enter your text here..."
|
833 |
+
)
|
834 |
+
|
835 |
+
voice_selector = gr.Dropdown(
|
836 |
+
label="π Select Voice",
|
837 |
+
choices=get_voice_choices(),
|
838 |
+
value=list(voice_library.keys())[0] if voice_library else None,
|
839 |
+
interactive=True,
|
840 |
+
info="Choose from builtin voices (π§) or your custom voices (π)"
|
841 |
+
)
|
842 |
+
|
843 |
+
with gr.Row():
|
844 |
+
generate_btn = gr.Button("π΅ Generate Speech", variant="primary")
|
845 |
+
refresh_voices_btn = gr.Button("π Refresh Voices", size="sm")
|
846 |
+
|
847 |
+
with gr.Row():
|
848 |
+
exaggeration = gr.Slider(
|
849 |
+
0.25, 2,
|
850 |
+
step=0.05,
|
851 |
+
label="Exaggeration",
|
852 |
+
value=0.5,
|
853 |
+
info="Controls expressiveness (0.5 = neutral)"
|
854 |
+
)
|
855 |
+
|
856 |
+
cfg_weight = gr.Slider(
|
857 |
+
0.2, 1,
|
858 |
+
step=0.05,
|
859 |
+
label="CFG Weight",
|
860 |
+
value=0.5,
|
861 |
+
info="Controls pace and clarity"
|
862 |
+
)
|
863 |
+
|
864 |
+
with gr.Accordion("Advanced Settings", open=False):
|
865 |
+
temperature = gr.Slider(
|
866 |
+
0.05, 5,
|
867 |
+
step=0.05,
|
868 |
+
label="Temperature",
|
869 |
+
value=0.8,
|
870 |
+
info="Controls randomness"
|
871 |
+
)
|
872 |
+
|
873 |
+
seed = gr.Number(
|
874 |
+
value=0,
|
875 |
+
label="Seed (0 = random)",
|
876 |
+
info="Set to non-zero for reproducible results"
|
877 |
+
)
|
878 |
+
|
879 |
+
with gr.Column():
|
880 |
+
audio_output = gr.Audio(label="π Generated Speech")
|
881 |
+
|
882 |
+
status_text = gr.Textbox(
|
883 |
+
label="π Generation Status",
|
884 |
+
interactive=False,
|
885 |
+
lines=8,
|
886 |
+
placeholder="Select a voice and click 'Generate Speech' to start..."
|
887 |
+
)
|
888 |
|
889 |
+
# Voice Management Tab
|
890 |
+
with gr.TabItem("π Voice Library"):
|
891 |
+
with gr.Row():
|
892 |
+
with gr.Column():
|
893 |
+
gr.Markdown("### π Available Voices")
|
894 |
+
|
895 |
+
voices_display = gr.HTML(
|
896 |
+
value=f"""
|
897 |
+
<div style="max-height: 300px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
|
898 |
+
{''.join([f"<p><strong>{voice_info['name']}</strong> ({voice_info['type']})<br><small>{voice_info['description']}</small></p>" for voice_info in voice_library.values()])}
|
899 |
+
</div>
|
900 |
+
"""
|
901 |
+
)
|
902 |
+
|
903 |
+
gr.Markdown("### ποΈ Delete Voice")
|
904 |
+
delete_voice_selector = gr.Dropdown(
|
905 |
+
label="Select voice to delete",
|
906 |
+
choices=[(f"{info['name']} ({info['type']})", vid) for vid, info in voice_library.items() if info['type'] == 'custom'],
|
907 |
+
value=None
|
908 |
+
)
|
909 |
+
|
910 |
+
delete_voice_btn = gr.Button("ποΈ Delete Selected Voice", variant="stop")
|
911 |
+
delete_status = gr.Textbox(label="Delete Status", interactive=False)
|
912 |
+
|
913 |
+
with gr.Column():
|
914 |
+
gr.Markdown("### β Create New Voice")
|
915 |
+
|
916 |
+
new_voice_name = gr.Textbox(
|
917 |
+
label="Voice Name",
|
918 |
+
placeholder="e.g., 'John's Voice', 'Narrator Voice'",
|
919 |
+
value=""
|
920 |
+
)
|
921 |
+
|
922 |
+
new_voice_description = gr.Textbox(
|
923 |
+
label="Voice Description",
|
924 |
+
placeholder="e.g., 'Professional male voice', 'Warm female narrator'",
|
925 |
+
value=""
|
926 |
+
)
|
927 |
+
|
928 |
+
new_voice_audio = gr.Audio(
|
929 |
+
label="Upload Voice Sample",
|
930 |
+
type="numpy",
|
931 |
+
info="Upload 5-30 seconds of clear speech"
|
932 |
+
)
|
933 |
+
|
934 |
+
create_voice_btn = gr.Button("π― Create Voice", variant="primary")
|
935 |
+
|
936 |
+
create_status = gr.Textbox(
|
937 |
+
label="π Creation Status",
|
938 |
+
interactive=False,
|
939 |
+
lines=6
|
940 |
+
)
|
941 |
|
942 |
+
# Voice Library Info Tab
|
943 |
+
with gr.TabItem("π Voice Guide"):
|
944 |
+
gr.Markdown(f"""
|
945 |
+
## π Voice Library Management
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
946 |
|
947 |
+
### π Current Library Status
|
948 |
+
- **Total Voices**: {len(voice_library)}
|
949 |
+
- **Builtin Voices**: {len([v for v in voice_library.values() if v['type'] == 'builtin'])}
|
950 |
+
- **Custom Voices**: {len([v for v in voice_library.values() if v['type'] == 'custom'])}
|
|
|
|
|
951 |
|
952 |
+
### π§ Builtin Voices
|
953 |
+
These are pre-configured voices that come with the system:
|
954 |
+
{chr(10).join([f"- **{voice_info['name']}**: {voice_info['description']}" for voice_info in voice_library.values() if voice_info['type'] == 'builtin'])}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
955 |
|
956 |
+
### π― Creating Custom Voices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
957 |
|
958 |
+
#### π Best Practices:
|
959 |
+
1. **Audio Quality**: Use clear, noise-free recordings
|
960 |
+
2. **Duration**: 5-30 seconds of natural speech
|
961 |
+
3. **Content**: Normal conversational speech works best
|
962 |
+
4. **Format**: WAV, MP3, or FLAC files supported
|
963 |
+
5. **Voice Consistency**: Use the same speaker throughout
|
964 |
|
965 |
+
#### π€ Recording Tips:
|
966 |
+
- Record in a quiet environment
|
967 |
+
- Speak naturally and clearly
|
968 |
+
- Avoid background noise
|
969 |
+
- Use a decent microphone if possible
|
970 |
+
- Read a paragraph of normal text
|
971 |
|
972 |
+
#### π Voice Management:
|
973 |
+
- **Create**: Upload audio + provide name and description
|
974 |
+
- **Use**: Select from dropdown in speech generation
|
975 |
+
- **Delete**: Remove custom voices you no longer need
|
976 |
+
- **Persistent**: Custom voices are saved permanently
|
|
|
|
|
|
|
|
|
|
|
|
|
977 |
|
978 |
+
### π Usage Workflow:
|
979 |
+
1. **Upload Voice Sample** β Create custom voice
|
980 |
+
2. **Select Voice** β Choose from library
|
981 |
+
3. **Generate Speech** β Use selected voice for TTS
|
982 |
+
4. **Manage Library** β Add, delete, organize voices
|
983 |
|
984 |
+
### π API Integration:
|
985 |
+
```python
|
986 |
+
# List voices
|
987 |
+
GET /api/voices
|
988 |
|
989 |
+
# Create voice
|
990 |
+
POST /api/voices/create
|
|
|
|
|
991 |
|
992 |
+
# Generate speech with voice
|
993 |
+
POST /api/tts/synthesize
|
994 |
+
{{
|
995 |
+
"text": "Hello world",
|
996 |
+
"voice_id": "your_voice_id"
|
997 |
+
}}
|
998 |
|
999 |
+
# Delete voice
|
1000 |
+
DELETE /api/voices/voice_id
|
1001 |
+
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1002 |
|
1003 |
+
### π‘ Pro Tips:
|
1004 |
+
- **Voice Naming**: Use descriptive names like "John_Professional" or "Sarah_Narrator"
|
1005 |
+
- **Voice Testing**: Generate short test phrases after creating voices
|
1006 |
+
- **Voice Backup**: Custom voices are saved to disk automatically
|
1007 |
+
- **Voice Sharing**: Voice IDs can be shared via API
|
1008 |
+
""")
|
1009 |
|
1010 |
+
# Event handlers
|
1011 |
generate_btn.click(
|
1012 |
fn=generate_speech_ui,
|
1013 |
+
inputs=[text_input, voice_selector, exaggeration, temperature, seed, cfg_weight],
|
1014 |
outputs=[audio_output, status_text]
|
1015 |
)
|
1016 |
|
1017 |
+
refresh_voices_btn.click(
|
1018 |
+
fn=refresh_voice_choices,
|
1019 |
+
outputs=[voice_selector]
|
1020 |
+
)
|
1021 |
+
|
1022 |
+
create_voice_btn.click(
|
1023 |
+
fn=create_voice_ui,
|
1024 |
+
inputs=[new_voice_name, new_voice_description, new_voice_audio],
|
1025 |
+
outputs=[create_status, voice_selector]
|
1026 |
+
)
|
1027 |
+
|
1028 |
+
delete_voice_btn.click(
|
1029 |
+
fn=delete_voice_ui,
|
1030 |
+
inputs=[delete_voice_selector],
|
1031 |
+
outputs=[delete_status, voice_selector]
|
1032 |
+
)
|
1033 |
+
|
1034 |
+
# System info with voice library status
|
1035 |
model_status = "β
Real ChatterboxTTS" if CHATTERBOX_AVAILABLE else "β οΈ Fallback Model (Beep Sounds)"
|
1036 |
chatterbox_status = "Available" if CHATTERBOX_AVAILABLE else "Missing - Upload Package"
|
1037 |
|
|
|
1039 |
### π System Status
|
1040 |
- **Model**: {model_status}
|
1041 |
- **Device**: {DEVICE}
|
|
|
1042 |
- **ChatterboxTTS**: {chatterbox_status}
|
1043 |
+
- **Voice Library**: {len(voice_library)} voices loaded
|
1044 |
+
- **Generated Files**: {len(audio_cache)}
|
1045 |
+
- **Storage**: `{VOICES_DIR}/` for voices, `{AUDIO_DIR}/` for output
|
1046 |
|
1047 |
{'''### π Production Ready!
|
1048 |
+
Your ChatterboxTTS model is loaded with voice management system.''' if CHATTERBOX_AVAILABLE else '''### β οΈ Action Required
|
1049 |
**You're hearing beep sounds because ChatterboxTTS isn't loaded.**
|
1050 |
|
1051 |
+
Voice management is working, but you need ChatterboxTTS for real synthesis.'''}
|
|
|
|
|
|
|
|
|
|
|
1052 |
""")
|
1053 |
|
1054 |
return demo
|
1055 |
|
1056 |
# Main execution
|
1057 |
if __name__ == "__main__":
|
1058 |
+
logger.info("π Starting ChatterboxTTS Voice Management Service...")
|
1059 |
|
1060 |
# Model status
|
1061 |
if CHATTERBOX_AVAILABLE and MODEL:
|
|
|
1068 |
logger.info(f"Model Status: {model_status}")
|
1069 |
logger.info(f"Device: {DEVICE}")
|
1070 |
logger.info(f"ChatterboxTTS Available: {CHATTERBOX_AVAILABLE}")
|
1071 |
+
logger.info(f"Voice Library: {len(voice_library)} voices loaded")
|
1072 |
+
logger.info(f"Custom Voices: {len([v for v in voice_library.values() if v['type'] == 'custom'])}")
|
1073 |
|
1074 |
if not CHATTERBOX_AVAILABLE:
|
1075 |
logger.warning("π¨ IMPORTANT: Upload your ChatterboxTTS package to enable real synthesis!")
|
|
|
1076 |
|
1077 |
if os.getenv("SPACE_ID"):
|
1078 |
# Running in Hugging Face Spaces
|
|
|
1097 |
|
1098 |
logger.info("π FastAPI: http://localhost:8000")
|
1099 |
logger.info("π API Docs: http://localhost:8000/docs")
|
1100 |
+
logger.info("π API Endpoints:")
|
1101 |
+
logger.info(" - GET /api/voices")
|
1102 |
+
logger.info(" - POST /api/voices/create")
|
1103 |
+
logger.info(" - DELETE /api/voices/{voice_id}")
|
1104 |
+
logger.info(" - POST /api/tts/synthesize")
|
1105 |
|
1106 |
# Start Gradio
|
1107 |
demo = create_gradio_interface()
|