# NOTE: The lines originally here were web file-viewer residue, not module
# content: a Hugging Face Spaces status banner ("Running"), the caption
# "File size: 5,673 Bytes", a per-line commit-hash gutter (f476c20, eb861f7,
# 5e3b5d8, c25a325, 05d082e) and a 1-150 line-number gutter.
"""
Enhanced Advanced TTS Client with Better Dependency Handling
Fixes the 'datasets' module issue and transformers warnings
"""
import os
import logging
import torch
from pathlib import Path
from typing import Optional, Dict, Any
logger = logging.getLogger(__name__)


class AdvancedTTSClient:
    """
    Enhanced Advanced TTS Client with robust dependency handling.

    On construction the client records the torch device ("cuda" when
    available, otherwise "cpu") and probes whether the optional
    ``transformers`` and ``datasets`` packages can be imported.  Models are
    loaded lazily by :meth:`load_models` or on the first call to
    :meth:`text_to_speech`.

    Attributes set in ``__init__``:
        device: "cuda" or "cpu".  It is only recorded and reported; the
            loaded model is not moved to this device by this class.
        models_loaded: True once :meth:`load_models` has succeeded.
        transformers_available / datasets_available: import-probe results.
        models: dict holding the loaded ``'processor'`` and ``'model'``.
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.models_loaded = False
        self.transformers_available = False
        self.datasets_available = False
        self.models = {}
        logger.info(f"Advanced TTS Client initialized on device: {self.device}")
        # Check for required dependencies
        self._check_dependencies()

    @staticmethod
    def _probe_library(module_name: str, label: str) -> bool:
        """Try to import *module_name*; log the outcome and return success.

        Args:
            module_name: Importable module name, e.g. ``"transformers"``.
            label: Human-readable name used in the log messages.

        Returns:
            True if the import succeeded, False if it raised ImportError.
        """
        # Local import keeps this block independent of the file's import list.
        import importlib

        try:
            importlib.import_module(module_name)
        except ImportError:
            logger.warning(f"WARNING: {label} library not available")
            return False
        logger.info(f"SUCCESS: {label} library available")
        return True

    def _check_dependencies(self):
        """Check if required dependencies are available and record the result."""
        self.transformers_available = self._probe_library("transformers", "Transformers")
        self.datasets_available = self._probe_library("datasets", "Datasets")
        logger.info(f"Transformers available: {self.transformers_available}")
        logger.info(f"Datasets available: {self.datasets_available}")

    async def load_models(self) -> bool:
        """
        Load advanced TTS models if dependencies are available.

        Both ``transformers`` and ``datasets`` must have been detected by the
        dependency check, otherwise nothing is loaded.  The ``from_pretrained``
        calls below are ordinary synchronous calls, so this coroutine does not
        yield to the event loop while they run.

        Returns:
            True when the processor and model were loaded and stored in
            ``self.models``; False when a dependency is missing or loading
            raised (the error is logged, not propagated).
        """
        if not self.transformers_available:
            logger.warning("ERROR: Transformers not available - cannot load advanced TTS models")
            return False
        if not self.datasets_available:
            logger.warning("ERROR: Datasets not available - cannot load advanced TTS models")
            return False

        try:
            logger.info("[PROCESS] Loading advanced TTS models...")
            # Import here to avoid import errors if not available
            from transformers import AutoProcessor, AutoModel

            # Load SpeechT5 TTS model
            logger.info("Loading SpeechT5 TTS model...")
            processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
            model = AutoModel.from_pretrained("microsoft/speecht5_tts")
            self.models = {
                'processor': processor,
                'model': model
            }
            self.models_loaded = True
            logger.info("SUCCESS: Advanced TTS models loaded successfully")
            return True
        except Exception as e:
            # Best-effort by design: callers get False and fall back.
            logger.error(f"ERROR: Failed to load advanced TTS models: {e}")
            return False

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """
        Generate speech from text using advanced TTS.

        The current implementation is a placeholder: after making sure the
        models are loaded it writes a 440 Hz sine tone whose length is
        0.1 s per input character, instead of running the models.

        Args:
            text: Text to synthesize; only its length affects the output.
            voice_id: Accepted for interface compatibility; currently unused.

        Returns:
            Path of a newly created ``.wav`` file.  The file is created with
            ``delete=False``, so the caller is responsible for removing it.

        Raises:
            RuntimeError: If the models are not loaded and cannot be loaded.
            Exception: Any error raised during generation is logged and
                re-raised unchanged.
        """
        if not self.models_loaded:
            logger.warning("WARNING: Advanced TTS models not loaded, attempting to load...")
            success = await self.load_models()
            if not success:
                raise RuntimeError("Advanced TTS models not available")

        try:
            logger.info(f"Generating speech: {text[:50]}...")
            # For now, create a simple placeholder audio file
            # In production, this would use the loaded models
            import tempfile
            import numpy as np
            import soundfile as sf

            # Generate a simple tone as placeholder
            sample_rate = 16000
            duration = len(text) * 0.1  # Rough estimate
            t = np.linspace(0, duration, int(sample_rate * duration), False)
            audio = np.sin(440 * 2 * np.pi * t) * 0.3  # Simple sine wave

            # Reserve a unique path and close our handle BEFORE soundfile
            # opens it: writing to a path that is still held open fails on
            # Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
                output_path = temp_file.name

            try:
                sf.write(output_path, audio, sample_rate)
            except Exception:
                # Do not leave an orphaned temp file behind on failure.
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
                raise

            logger.info(f"SUCCESS: Advanced TTS audio generated: {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"ERROR: Advanced TTS generation failed: {e}")
            raise

    async def get_available_voices(self) -> Dict[str, str]:
        """Get available voice configurations as a static id -> label mapping."""
        return {
            "21m00Tcm4TlvDq8ikWAM": "Female (Neural)",
            "pNInz6obpgDQGcFmaJgB": "Male (Neural)",
            "EXAVITQu4vr4xnSDxMaL": "Female (Expressive)",
            "ErXwobaYiN019PkySvjV": "Male (Professional)",
            "TxGEqnHWrfGW9XjX": "Male (Deep Neural)",
            "yoZ06aMxZJJ28mfd3POQ": "Unisex (Friendly)",
            "AZnzlk1XvdvUeBnXmlld": "Female (Strong)"
        }

    def get_model_info(self) -> Dict[str, Any]:
        """Get model information and status.

        Returns:
            A dict with the load state, the dependency-probe results, the
            device string, two derived availability flags and a status label
            ("Advanced TTS Ready" once models are loaded, else
            "Fallback Mode").
        """
        return {
            "models_loaded": self.models_loaded,
            "transformers_available": self.transformers_available,
            "datasets_available": self.datasets_available,
            "device": self.device,
            "vits_available": self.transformers_available,
            "speecht5_available": self.transformers_available and self.datasets_available,
            "status": "Advanced TTS Ready" if self.models_loaded else "Fallback Mode"
        }
# Export for backwards compatibility: limits `from <module> import *`
# to the client class.
__all__ = ['AdvancedTTSClient']
# (stray "|" table delimiter left by the web extraction was here)