Spaces:
Running
Running
import logging | |
from typing import List, Tuple, Generator, Optional | |
import numpy as np | |
from utils.tts_base import TTSEngineBase, DummyTTSEngine | |
from utils.tts_engines import create_engine | |
# Module-level logger (handlers and levels are configured by the application)
logger = logging.getLogger(__name__) | |
class CascadingTTSEngine(TTSEngineBase):
    """Cascading TTS engine implementation.

    Tries a list of TTS engines in priority order until one of them
    produces output. If every engine fails, the request is handed to
    ``DummyTTSEngine`` so callers always get a result.
    """

    def __init__(self, engine_types: List[str], lang_code: str = 'z'):
        """Initialize the cascading TTS engine.

        Args:
            engine_types (List[str]): Engine type names to try, in order.
                Each name is passed to ``create_engine`` on every request.
            lang_code (str): Language code forwarded to every engine.
        """
        super().__init__(lang_code)
        self.engine_types = engine_types
        self.lang_code = lang_code
        logger.info("Initialized cascading TTS engine with engines: %s", engine_types)

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech by trying multiple engines in order.

        An engine counts as failed when it raises or returns ``None``; in
        both cases the next engine in ``self.engine_types`` is tried.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Returns:
            str: The first non-``None`` result returned by an engine, or
            the dummy engine's result when every engine failed.
        """
        logger.info("Generating speech with cascading engine for text length: %s", len(text))

        for engine_type in self.engine_types:
            logger.info("Trying TTS engine: %s", engine_type)
            try:
                engine = create_engine(engine_type, self.lang_code)
                result = engine.generate_speech(text, voice, speed)
            except Exception as e:
                # Deliberate best-effort: any engine failure moves on to the
                # next engine. logger.exception keeps the traceback (and
                # therefore the exception type) in the log.
                logger.exception("Error with TTS engine %s: %s", engine_type, e)
                logger.warning("Trying next TTS engine")
                continue

            if result is not None:
                logger.info("Successfully generated speech with %s", engine_type)
                return result

            logger.warning(
                "TTS engine %s failed to generate speech, trying next engine", engine_type
            )

        # Every configured engine raised or returned None.
        logger.warning("All TTS engines failed, falling back to dummy engine")
        return DummyTTSEngine(self.lang_code).generate_speech(text, voice, speed)

    def generate_speech_stream(
        self, text: str, voice: str = 'af_heart', speed: float = 1.0
    ) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate a speech stream by trying multiple engines in order.

        Each engine is probed by pulling its first chunk. An engine that
        raises before producing a chunk, or whose first item is ``None`` /
        missing, is skipped. Once an engine has started producing audio the
        stream is committed to it: if it fails later, the error is logged
        and the stream ends. Cascading at that point would re-synthesize
        the text from the beginning and hand the consumer duplicated audio.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment.
        """
        logger.info(
            "Generating speech stream with cascading engine for text length: %s", len(text)
        )

        for engine_type in self.engine_types:
            logger.info("Trying TTS engine for streaming: %s", engine_type)
            # Flips to True just before the first chunk is handed to the consumer.
            output_started = False
            try:
                engine = create_engine(engine_type, self.lang_code)
                stream = engine.generate_speech_stream(text, voice, speed)

                # Pull one chunk up front to verify the engine actually works.
                first_chunk = next(stream, None)
                if first_chunk is not None:
                    logger.info("Successfully started speech stream with %s", engine_type)
                    output_started = True
                    yield first_chunk
                    yield from stream
                    # Successfully streamed all chunks.
                    return
            except Exception as e:
                logger.exception(
                    "Error with TTS engine %s streaming: %s", engine_type, e
                )
                if output_started:
                    # Audio was already delivered; restarting with another
                    # engine would replay the text from the start.
                    logger.warning(
                        "TTS engine %s failed mid-stream, ending stream without fallback",
                        engine_type,
                    )
                    return
                logger.warning("Trying next TTS engine for streaming")
                continue

            logger.warning(
                "TTS engine %s failed to generate speech stream, trying next engine",
                engine_type,
            )

        # No engine managed to produce a first chunk.
        logger.warning("All TTS engines failed for streaming, falling back to dummy engine")
        yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)