Spaces:
Sleeping
Sleeping
| import logging | |
| from typing import Optional, Generator, Tuple, List, Dict, Any | |
| import numpy as np | |
| # Import the base class and dummy implementation | |
| from utils.tts_simplified import TTSBase, DummyTTS | |
| # Import the specific TTS implementations | |
| from utils.tts_kokoro_simplified import KokoroTTS, KOKORO_AVAILABLE | |
| from utils.tts_dia_simplified import DiaTTS, DIA_AVAILABLE | |
| from utils.tts_cosyvoice2_simplified import CosyVoice2TTS, COSYVOICE2_AVAILABLE | |
| # Configure logging | |
| logger = logging.getLogger(__name__) | |
def get_available_engines() -> List[str]:
    """Report which TTS engines can currently be selected.

    Returns:
        List[str]: Engine names in the order 'kokoro', 'dia', 'cosyvoice2',
            each present only when its ``*_AVAILABLE`` flag is truthy,
            always followed by 'dummy'.
    """
    # Pair every optional engine name with the availability flag imported
    # from its implementation module.
    optional_engines = (
        ('kokoro', KOKORO_AVAILABLE),
        ('dia', DIA_AVAILABLE),
        ('cosyvoice2', COSYVOICE2_AVAILABLE),
    )
    names = [name for name, flag in optional_engines if flag]
    # The dummy engine has no availability flag; it is listed unconditionally.
    return names + ['dummy']
def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
    """Build a TTS engine, honouring an explicit request when possible.

    Args:
        engine_type (str, optional): One of 'kokoro', 'dia', 'cosyvoice2' or
            'dummy'. When None, or when the requested engine cannot be
            used, the first usable engine in the priority order
            cosyvoice2 > kokoro > dia > dummy is chosen instead.
        lang_code (str): Language code passed to the engine constructor.

    Returns:
        TTSBase: A newly constructed engine instance.
    """
    usable = get_available_engines()
    logger.info(f"Available TTS engines: {usable}")

    # engine name -> (availability flag, label used in the log line, class)
    registry = {
        'kokoro': (KOKORO_AVAILABLE, "Kokoro", KokoroTTS),
        'dia': (DIA_AVAILABLE, "Dia", DiaTTS),
        'cosyvoice2': (COSYVOICE2_AVAILABLE, "CosyVoice2", CosyVoice2TTS),
        'dummy': (True, "Dummy", DummyTTS),
    }

    if engine_type is not None:
        # Unknown names are treated the same as known-but-unavailable ones.
        enabled, label, factory = registry.get(engine_type, (False, None, None))
        if enabled:
            logger.info(f"Creating {label} TTS engine")
            return factory(lang_code)
        logger.warning(f"Requested engine '{engine_type}' is not available")

    # Nothing usable was requested explicitly: walk the priority list and
    # take the first engine reported as available.
    for candidate in ('cosyvoice2', 'kokoro', 'dia', 'dummy'):
        if candidate not in usable:
            continue
        logger.info(f"Using best available engine: {candidate}")
        return registry[candidate][2](lang_code)

    # Defensive last resort, reached only if the availability list contained
    # none of the prioritised names.
    logger.warning("No TTS engines available, falling back to dummy engine")
    return DummyTTS(lang_code)
def generate_speech(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
                    voice: str = 'default', speed: float = 1.0) -> Optional[str]:
    """Synthesize ``text`` with the requested (or best available) TTS engine.

    A new engine is obtained from ``get_tts_engine`` on every call and its
    ``generate_speech`` method is invoked with ``text``, ``voice`` and
    ``speed``.

    Args:
        text (str): Text to synthesize.
        engine_type (str, optional): Engine name forwarded to
            ``get_tts_engine``; None selects the best available engine.
        lang_code (str): Language code forwarded to ``get_tts_engine``.
        voice (str): Voice ID forwarded to the engine.
        speed (float): Speech speed multiplier forwarded to the engine.

    Returns:
        Optional[str]: Whatever the engine returns; per the annotation this
            is expected to be the path of the generated audio file, or None
            when generation fails.
    """
    return get_tts_engine(engine_type, lang_code).generate_speech(text, voice, speed)
def generate_speech_stream(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
                           voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
    """Stream synthesized audio for ``text`` from the selected TTS engine.

    Because this is a generator function, the engine is only obtained from
    ``get_tts_engine`` once iteration starts. Everything produced by the
    engine's ``generate_speech_stream`` is then re-yielded unchanged.

    Args:
        text (str): Text to synthesize.
        engine_type (str, optional): Engine name forwarded to
            ``get_tts_engine``; None selects the best available engine.
        lang_code (str): Language code forwarded to ``get_tts_engine``.
        voice (str): Voice ID forwarded to the engine.
        speed (float): Speech speed multiplier forwarded to the engine.

    Yields:
        tuple: (sample_rate, audio_data) pairs, one per segment, as produced
            by the underlying engine.
    """
    # Delegate with ``yield from`` so send/throw/close reach the engine's
    # own generator directly.
    yield from get_tts_engine(engine_type, lang_code).generate_speech_stream(text, voice, speed)