Spaces:
Build error
Build error
File size: 4,642 Bytes
c72d839 7495571 aaa0814 7495571 b2b15db 9c8546d 3ed3b5a c72d839 60bd17d 7495571 60bd17d 7495571 3ed3b5a 60bd17d 7495571 60bd17d 9740afc 60bd17d 7495571 60bd17d 7495571 e734196 7495571 e734196 7495571 e734196 7495571 e734196 7495571 e734196 7495571 3ed3b5a 60bd17d 7495571 60bd17d 7495571 60bd17d 7495571 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import logging
from typing import Optional, Generator, Tuple, List, Dict, Any
import numpy as np
# Import the base class and dummy implementation
from utils.tts_base import TTSBase
from utils.tts_dummy import DummyTTS
# Import the specific TTS implementations
from utils.tts_kokoro import KokoroTTS, KOKORO_AVAILABLE
from utils.tts_dia import DiaTTS, DIA_AVAILABLE
from utils.tts_cosyvoice2 import CosyVoice2TTS, COSYVOICE2_AVAILABLE
# Configure logging
logger = logging.getLogger(__name__)
def get_available_engines() -> List[str]:
    """Report which TTS engines can be used in this environment.

    The optional engines are listed in a fixed order (kokoro, dia,
    cosyvoice2) and each is included only when its module-level
    availability flag is truthy. The dummy engine is appended last
    unconditionally, so the result is never empty.

    Returns:
        List[str]: Names of the usable engines, always ending with 'dummy'.
    """
    # (engine name, availability flag) in the order they should be reported
    optional_engines = (
        ('kokoro', KOKORO_AVAILABLE),
        ('dia', DIA_AVAILABLE),
        ('cosyvoice2', COSYVOICE2_AVAILABLE),
    )
    detected = [name for name, usable in optional_engines if usable]
    # Dummy needs no availability flag; it is always offered
    return detected + ['dummy']
def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
    """Build a TTS engine instance, honouring an explicit request when possible.

    Args:
        engine_type (str, optional): Requested engine ('kokoro', 'dia',
            'cosyvoice2', 'dummy'). When None, or when the requested engine
            cannot be used, the first usable engine in priority order
            (cosyvoice2, kokoro, dia, dummy) is chosen instead.
        lang_code (str): Language code passed to the engine constructor.

    Returns:
        TTSBase: The constructed engine instance.
    """
    available_engines = get_available_engines()
    logger.info(f"Available TTS engines: {available_engines}")

    # One row per engine: (name, usable flag, class, log message on creation).
    # Row order mirrors the order in which an explicit request is matched.
    engine_rows = (
        ('kokoro', KOKORO_AVAILABLE, KokoroTTS, "Creating Kokoro TTS engine"),
        ('dia', DIA_AVAILABLE, DiaTTS, "Creating Dia TTS engine"),
        ('cosyvoice2', COSYVOICE2_AVAILABLE, CosyVoice2TTS, "Creating CosyVoice2 TTS engine"),
        ('dummy', True, DummyTTS, "Creating Dummy TTS engine"),
    )

    # An explicit request wins if that engine is usable
    if engine_type is not None:
        for name, usable, engine_cls, creation_message in engine_rows:
            if engine_type == name and usable:
                logger.info(creation_message)
                return engine_cls(lang_code)
        # Unknown name, or a known engine whose flag is off
        logger.warning(f"Requested engine '{engine_type}' is not available")

    # No request, or the request could not be satisfied:
    # walk the priority list and take the first usable engine
    constructors = {name: engine_cls for name, _, engine_cls, _ in engine_rows}
    for engine in ('cosyvoice2', 'kokoro', 'dia', 'dummy'):
        if engine not in available_engines:
            continue
        logger.info(f"Using best available engine: {engine}")
        return constructors[engine](lang_code)

    # Defensive last resort when nothing in the priority list was available
    logger.warning("No TTS engines available, falling back to dummy engine")
    return DummyTTS(lang_code)
def generate_speech(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
                    voice: str = 'default', speed: float = 1.0) -> Optional[str]:
    """Synthesize speech for ``text`` with the requested or best available engine.

    An engine is obtained from ``get_tts_engine(engine_type, lang_code)`` and
    its ``generate_speech`` method is called with the text, voice and speed.

    Args:
        text (str): Input text to synthesize.
        engine_type (str, optional): Engine to use; None selects automatically.
        lang_code (str): Language code handed to the engine.
        voice (str): Voice ID to use.
        speed (float): Speech speed multiplier.

    Returns:
        Optional[str]: Path to the generated audio file, or None if
        generation fails.
    """
    return get_tts_engine(engine_type, lang_code).generate_speech(text, voice, speed)
def generate_speech_stream(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
                           voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
    """Stream synthesized speech for ``text`` from the requested or best available engine.

    This is a generator: the engine is only created once iteration starts,
    after which everything is delegated to the engine's own
    ``generate_speech_stream``.

    Args:
        text (str): Input text to synthesize.
        engine_type (str, optional): Engine to use; None selects automatically.
        lang_code (str): Language code handed to the engine.
        voice (str): Voice ID to use.
        speed (float): Speech speed multiplier.

    Yields:
        tuple: (sample_rate, audio_data) pairs for each segment.
    """
    segment_stream = get_tts_engine(engine_type, lang_code).generate_speech_stream(
        text, voice, speed
    )
    yield from segment_stream