# Spaces: DroolingPanda / teachingAssistant
# Build error
# File size: 4,642 Bytes

import logging
from typing import Optional, Generator, Tuple, List, Dict, Any
import numpy as np

# Import the base class and dummy implementation
from utils.tts_base import TTSBase
from utils.tts_dummy import DummyTTS

# Import the specific TTS implementations
from utils.tts_kokoro import KokoroTTS, KOKORO_AVAILABLE
from utils.tts_dia import DiaTTS, DIA_AVAILABLE
from utils.tts_cosyvoice2 import CosyVoice2TTS, COSYVOICE2_AVAILABLE

# Configure logging
logger = logging.getLogger(__name__)


def get_available_engines() -> List[str]:
    """Report which TTS engines can currently be used.

    Each optional engine is listed only when its availability flag is
    truthy. The dummy engine is appended unconditionally, so the result
    is never empty.

    Returns:
        List[str]: Engine names in the order 'kokoro', 'dia', 'cosyvoice2'
            (each only when available), followed by 'dummy'.
    """
    # Pair every optional engine name with its availability flag
    optional_engines = (
        ('kokoro', KOKORO_AVAILABLE),
        ('dia', DIA_AVAILABLE),
        ('cosyvoice2', COSYVOICE2_AVAILABLE),
    )
    engines = [name for name, usable in optional_engines if usable]

    # Dummy has no availability flag; it is always offered last
    engines.append('dummy')
    return engines


def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
    """Build a TTS engine, honouring an explicit request when possible.

    An explicitly requested engine is created if it is available ('dummy'
    is always accepted). If no engine is requested, or the requested one
    cannot be used, the first available engine in the priority order
    cosyvoice2 > kokoro > dia > dummy is created instead.

    Args:
        engine_type (str, optional): Engine to create ('kokoro', 'dia',
            'cosyvoice2', 'dummy'). None selects the best available engine.
        lang_code (str): Language code passed to the engine constructor

    Returns:
        TTSBase: An instance of a TTS engine
    """
    usable_engines = get_available_engines()
    logger.info(f"Available TTS engines: {usable_engines}")

    if engine_type is not None:
        # One row per engine: (name, availability, constructor, log message).
        # Rows are checked in this order with plain equality comparisons.
        requestable = (
            ('kokoro', KOKORO_AVAILABLE, KokoroTTS, "Creating Kokoro TTS engine"),
            ('dia', DIA_AVAILABLE, DiaTTS, "Creating Dia TTS engine"),
            ('cosyvoice2', COSYVOICE2_AVAILABLE, CosyVoice2TTS, "Creating CosyVoice2 TTS engine"),
            ('dummy', True, DummyTTS, "Creating Dummy TTS engine"),
        )
        for name, usable, factory, message in requestable:
            if engine_type == name and usable:
                logger.info(message)
                return factory(lang_code)

        # Unknown or unavailable request: warn, then fall through to the
        # priority-based selection below
        logger.warning(f"Requested engine '{engine_type}' is not available")

    # Highest-priority engine first
    by_priority = (
        ('cosyvoice2', CosyVoice2TTS),
        ('kokoro', KokoroTTS),
        ('dia', DiaTTS),
        ('dummy', DummyTTS),
    )
    for candidate, factory in by_priority:
        if candidate in usable_engines:
            logger.info(f"Using best available engine: {candidate}")
            return factory(lang_code)

    # Safety net in case none of the prioritised names was reported available
    logger.warning("No TTS engines available, falling back to dummy engine")
    return DummyTTS(lang_code)


def generate_speech(text: str, engine_type: Optional[str] = None, lang_code: str = 'z', 
                   voice: str = 'default', speed: float = 1.0) -> Optional[str]:
    """Synthesize speech for ``text`` with a freshly selected TTS engine.

    The engine is obtained from ``get_tts_engine(engine_type, lang_code)``
    and its ``generate_speech`` result is returned unchanged.

    Args:
        text (str): Input text to synthesize
        engine_type (str, optional): Engine to use; None selects the best
            available engine
        lang_code (str): Language code used to create the engine
        voice (str): Voice ID forwarded to the engine
        speed (float): Speech speed multiplier forwarded to the engine

    Returns:
        Optional[str]: Path to the generated audio file, or None if
            generation fails
    """
    # text, voice and speed are forwarded positionally, in that order
    return get_tts_engine(engine_type, lang_code).generate_speech(text, voice, speed)


def generate_speech_stream(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
                          voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
    """Stream synthesized speech for ``text`` from a freshly selected TTS engine.

    This is a generator: the engine is only selected once iteration
    starts, and every item produced by the engine's
    ``generate_speech_stream`` is passed through as-is.

    Args:
        text (str): Input text to synthesize
        engine_type (str, optional): Engine to use; None selects the best
            available engine
        lang_code (str): Language code used to create the engine
        voice (str): Voice ID forwarded to the engine
        speed (float): Speech speed multiplier forwarded to the engine

    Yields:
        tuple: (sample_rate, audio_data) pairs for each segment
    """
    # Delegate directly to the engine's own stream
    yield from get_tts_engine(engine_type, lang_code).generate_speech_stream(text, voice, speed)