File size: 4,642 Bytes
c72d839
7495571
 
 
 
aaa0814
 
7495571
 
b2b15db
 
 
9c8546d
3ed3b5a
c72d839
 
60bd17d
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60bd17d
 
7495571
 
3ed3b5a
60bd17d
7495571
 
60bd17d
9740afc
60bd17d
7495571
60bd17d
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e734196
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e734196
 
7495571
 
 
 
 
e734196
 
7495571
e734196
7495571
 
e734196
7495571
 
 
 
3ed3b5a
60bd17d
 
7495571
 
60bd17d
 
 
7495571
 
60bd17d
7495571
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import logging
from typing import Optional, Generator, Tuple, List, Dict, Any
import numpy as np

# Import the base class and dummy implementation
from utils.tts_base import TTSBase
from utils.tts_dummy import DummyTTS

# Import the specific TTS implementations
from utils.tts_kokoro import KokoroTTS, KOKORO_AVAILABLE
from utils.tts_dia import DiaTTS, DIA_AVAILABLE
from utils.tts_cosyvoice2 import CosyVoice2TTS, COSYVOICE2_AVAILABLE

# Configure logging
logger = logging.getLogger(__name__)


def get_available_engines() -> List[str]:
    """Report which TTS engine names can currently be requested.

    Each optional backend is included only when its module-level
    ``*_AVAILABLE`` flag is truthy. The ``'dummy'`` engine is appended
    unconditionally, so the result is never empty.

    Returns:
        List[str]: Engine names in the order 'kokoro', 'dia', 'cosyvoice2'
            (each only if available), followed by 'dummy'.
    """
    # Pair every optional backend with the flag exported by its module.
    optional_backends = (
        ('kokoro', KOKORO_AVAILABLE),
        ('dia', DIA_AVAILABLE),
        ('cosyvoice2', COSYVOICE2_AVAILABLE),
    )
    engines = [name for name, is_available in optional_backends if is_available]

    # The dummy engine has no availability flag; it is always offered.
    engines.append('dummy')
    return engines


def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
    """Build a TTS engine, honouring an explicit request when possible.

    Args:
        engine_type (str, optional): Name of the engine to create
            ('kokoro', 'dia', 'cosyvoice2' or 'dummy'). When None, or when
            the named engine is unknown or unavailable, the first available
            engine in priority order is used instead.
        lang_code (str): Language code passed to the engine constructor.

    Returns:
        TTSBase: A newly constructed engine instance.
    """
    available_engines = get_available_engines()
    logger.info(f"Available TTS engines: {available_engines}")

    # One row per engine, listed in fallback priority order:
    # (name, availability flag, engine class, log line for explicit requests).
    registry = (
        ('cosyvoice2', COSYVOICE2_AVAILABLE, CosyVoice2TTS, "Creating CosyVoice2 TTS engine"),
        ('kokoro', KOKORO_AVAILABLE, KokoroTTS, "Creating Kokoro TTS engine"),
        ('dia', DIA_AVAILABLE, DiaTTS, "Creating Dia TTS engine"),
        ('dummy', True, DummyTTS, "Creating Dummy TTS engine"),
    )

    # Explicit request: construct it only if the name matches a usable engine.
    if engine_type is not None:
        for name, usable, engine_cls, creation_message in registry:
            if engine_type == name and usable:
                logger.info(creation_message)
                return engine_cls(lang_code)
        # Unknown or unavailable name: warn, then fall through to auto-selection.
        logger.warning(f"Requested engine '{engine_type}' is not available")

    # Auto-selection: first registry entry that is reported as available.
    for name, _usable, engine_cls, _message in registry:
        if name in available_engines:
            logger.info(f"Using best available engine: {name}")
            return engine_cls(lang_code)

    # Safety net in case the availability list ever comes back without 'dummy'.
    logger.warning("No TTS engines available, falling back to dummy engine")
    return DummyTTS(lang_code)


def generate_speech(
    text: str,
    engine_type: Optional[str] = None,
    lang_code: str = 'z',
    voice: str = 'default',
    speed: float = 1.0,
) -> Optional[str]:
    """Synthesize ``text`` with the requested (or best available) TTS engine.

    A fresh engine is obtained from ``get_tts_engine`` on every call, and the
    synthesis is delegated to that engine's ``generate_speech`` method.

    Args:
        text (str): Input text to synthesize.
        engine_type (str, optional): Engine name; None selects automatically.
        lang_code (str): Language code handed to the engine.
        voice (str): Voice ID to use.
        speed (float): Speech speed multiplier.

    Returns:
        Optional[str]: Path to the generated audio file, or None if
            generation fails (as reported by the engine).
    """
    tts = get_tts_engine(engine_type, lang_code)
    audio_path = tts.generate_speech(text, voice, speed)
    return audio_path


def generate_speech_stream(
    text: str,
    engine_type: Optional[str] = None,
    lang_code: str = 'z',
    voice: str = 'default',
    speed: float = 1.0,
) -> Generator[Tuple[int, np.ndarray], None, None]:
    """Stream synthesized audio for ``text`` segment by segment.

    Because this is a generator, the engine is only created once iteration
    starts. All items come straight from the engine's own
    ``generate_speech_stream`` method.

    Args:
        text (str): Input text to synthesize.
        engine_type (str, optional): Engine name; None selects automatically.
        lang_code (str): Language code handed to the engine.
        voice (str): Voice ID to use.
        speed (float): Speech speed multiplier.

    Yields:
        tuple: (sample_rate, audio_data) pairs for each segment.
    """
    tts = get_tts_engine(engine_type, lang_code)
    segments = tts.generate_speech_stream(text, voice, speed)
    # Delegate fully so send/throw/close reach the engine's generator.
    yield from segments