File size: 5,063 Bytes
58d9769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import logging
from typing import List, Tuple, Generator, Optional
import numpy as np

from utils.tts_base import TTSEngineBase, DummyTTSEngine
from utils.tts_engines import create_engine

# Module-level logger (no handlers or levels are configured here)
logger = logging.getLogger(__name__)

class CascadingTTSEngine(TTSEngineBase):
    """Cascading TTS engine implementation

    This engine tries multiple TTS engines in order until one succeeds.
    It provides a fallback mechanism to maximize the chances of getting
    quality speech output. Each candidate engine is built on demand through
    ``create_engine``; when every candidate fails, ``DummyTTSEngine`` is used
    as the last resort.
    """

    def __init__(self, engine_types: List[str], lang_code: str = 'z'):
        """Initialize the cascading TTS engine

        Args:
            engine_types (List[str]): List of engine types to try in order
            lang_code (str): Language code for the engines
        """
        super().__init__(lang_code)
        self.engine_types = engine_types
        # Stored explicitly so the methods below do not rely on the base
        # class keeping the value under this attribute name.
        self.lang_code = lang_code
        logger.info("Initialized cascading TTS engine with engines: %s", engine_types)

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech by trying multiple engines in order

        An engine is considered to have failed when it raises or when it
        returns ``None``; in both cases the next engine is tried.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Returns:
            str: Path to the generated audio file
        """
        logger.info("Generating speech with cascading engine for text length: %s", len(text))

        # Try each engine in order
        for engine_type in self.engine_types:
            try:
                logger.info("Trying TTS engine: %s", engine_type)
                engine = create_engine(engine_type, self.lang_code)
                result = engine.generate_speech(text, voice, speed)
            except Exception as e:
                # Broad catch is deliberate: any engine failure must fall
                # through to the next candidate. The traceback is logged so
                # the failure is still diagnosable.
                logger.exception("Error with TTS engine %s: %s", engine_type, e)
                logger.error("Error type: %s", type(e).__name__)
                logger.warning("Trying next TTS engine")
                continue

            # If the engine returned a valid result, return it
            if result is not None:
                logger.info("Successfully generated speech with %s", engine_type)
                return result

            logger.warning("TTS engine %s failed to generate speech, trying next engine", engine_type)

        # If all engines failed, fall back to dummy engine
        logger.warning("All TTS engines failed, falling back to dummy engine")
        return DummyTTSEngine(self.lang_code).generate_speech(text, voice, speed)

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream by trying multiple engines in order

        An engine is selected once it produces a first chunk that is not
        ``None``. Engines that raise before producing any chunk, or that
        produce nothing, are skipped. If the selected engine raises after
        chunks have already been yielded, the stream ends at that point:
        restarting with another engine would synthesize the text from the
        beginning again and hand the consumer duplicated audio.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info("Generating speech stream with cascading engine for text length: %s", len(text))

        # Try each engine in order
        for engine_type in self.engine_types:
            # Becomes True once a chunk from this engine is about to reach
            # the consumer; after that point falling back is no longer safe.
            started = False
            try:
                logger.info("Trying TTS engine for streaming: %s", engine_type)
                engine = create_engine(engine_type, self.lang_code)

                # Create a generator for the current engine
                generator = engine.generate_speech_stream(text, voice, speed)

                # Try to get the first chunk to verify the engine works
                first_chunk = next(generator, None)
                if first_chunk is None:
                    logger.warning("TTS engine %s failed to generate speech stream, trying next engine", engine_type)
                    continue

                # Engine produced a valid first chunk, commit to this engine
                logger.info("Successfully started speech stream with %s", engine_type)
                started = True
                yield first_chunk

                # Yield the rest of the chunks from this engine; ``yield from``
                # also forwards close() to the engine's generator.
                yield from generator

                # Successfully streamed all chunks, return
                return
            except Exception as e:
                logger.exception("Error with TTS engine %s streaming: %s", engine_type, e)
                logger.error("Error type: %s", type(e).__name__)
                if started:
                    # Audio has already been emitted; another engine would
                    # replay the text from the start, so stop here instead.
                    logger.error(
                        "TTS engine %s failed mid-stream after audio was already emitted; "
                        "ending stream instead of restarting with another engine",
                        engine_type,
                    )
                    return
                logger.warning("Trying next TTS engine for streaming")

        # If all engines failed, fall back to dummy engine
        logger.warning("All TTS engines failed for streaming, falling back to dummy engine")
        yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)