Spaces:

DroolingPanda
/

teachingAssistant

Build error

File size: 7,881 Bytes

import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Configure logging
logger = logging.getLogger(__name__)

# Flag to track CosyVoice2 availability; flipped to True only when every
# optional dependency below imports cleanly.
COSYVOICE2_AVAILABLE = False
# Sample rate (Hz) passed along with generated audio.
DEFAULT_SAMPLE_RATE = 24000

# Try to import CosyVoice2 dependencies
try:
    import torch
    import torchaudio
    # Import CosyVoice2 from the correct package
    # Based on https://github.com/FunAudioLLM/CosyVoice
    from cosyvoice.cli.cosyvoice import CosyVoice
except ImportError as e:
    # ModuleNotFoundError is a subclass of ImportError, so this single handler
    # covers both "module missing" and "module present but failed to import".
    COSYVOICE2_AVAILABLE = False
    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
else:
    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")


def _get_model(model_dir: str = 'pretrained_models/CosyVoice-300M'):
    """Construct the CosyVoice2 model

    Every call builds a new model instance; this function does no caching,
    so callers that want lazy, one-time loading must keep the returned object.

    Args:
        model_dir (str): Directory of the pretrained model handed to the
            ``CosyVoice`` constructor. Defaults to the path this module has
            always used.

    Returns:
        CosyVoice or None: The loaded model, or None if the engine is
        unavailable or the model could not be constructed
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None

    try:
        # Imported here so the name is bound even if the module-level probe
        # is changed; availability was already confirmed above.
        from cosyvoice.cli.cosyvoice import CosyVoice

        model = CosyVoice(model_dir)
    except ImportError as e:
        logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
        return None
    except FileNotFoundError as e:
        logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
        return None
    except Exception as e:
        # Boundary handler: model construction can fail in many library-specific
        # ways, so keep the traceback to make the failure diagnosable.
        logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}", exc_info=True)
        return None

    logger.info("CosyVoice2 model successfully loaded")
    return model


class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation

    This engine uses the CosyVoice2 model for TTS generation. The model is
    loaded on first use and kept on the instance. Both public entry points
    share one inference path and report failure by returning/yielding nothing
    rather than raising.
    """

    # Speaker ID given to ``inference_sft`` (and reused as the third argument
    # of the ``inference_zero_shot`` fallback), and the fallback's prompt text.
    _SFT_SPEAKER = '中文女'
    _ZERO_SHOT_PROMPT_TEXT = '请输入提示文本'

    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine

        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        self.model = None

    def _ensure_model(self) -> bool:
        """Ensure the model is loaded

        Returns:
            bool: True if model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()

        return self.model is not None

    def _cosy_sample_rate(self) -> int:
        """Return the sample rate to use for generated audio

        Returns:
            int: The model's own ``sample_rate`` attribute when it has one,
            otherwise ``DEFAULT_SAMPLE_RATE``
        """
        rate = getattr(self.model, 'sample_rate', None)
        return int(rate) if rate else DEFAULT_SAMPLE_RATE

    @staticmethod
    def _chunks_to_array(result) -> Optional[np.ndarray]:
        """Normalize a model inference result into one 1-D numpy array

        Accepts a single tensor/array, a dict carrying the audio under
        ``'tts_speech'``, or any iterable (including a generator) of those.
        The upstream CosyVoice README shows the inference methods yielding
        ``{'tts_speech': tensor}`` dicts, which is why iterables are consumed
        here instead of being treated as audio themselves.

        Args:
            result: Raw return value of a model inference call

        Returns:
            Optional[np.ndarray]: Flattened audio samples, or None if the
            result contained no audio
        """
        import torch

        def to_1d(chunk):
            if isinstance(chunk, dict):
                chunk = chunk.get('tts_speech')
            if chunk is None:
                return None
            if isinstance(chunk, torch.Tensor):
                chunk = chunk.detach().cpu().numpy()
            # Flatten so a (1, N) tensor is written as N mono frames instead
            # of one frame with N channels.
            # NOTE(review): assumes mono output - confirm for the model in use.
            return np.asarray(chunk).reshape(-1)

        if result is None:
            return None
        if isinstance(result, (torch.Tensor, np.ndarray, dict)):
            return to_1d(result)

        pieces = [piece for piece in map(to_1d, result) if piece is not None and piece.size]
        return np.concatenate(pieces) if pieces else None

    def _run_inference(self, text: str) -> Optional[np.ndarray]:
        """Synthesize ``text`` and return the audio samples

        Tries ``inference_sft`` first and falls back to
        ``inference_zero_shot`` if that raises. The result is consumed inside
        the ``try`` so errors raised lazily by a generator are caught too.

        Args:
            text (str): Input text to synthesize

        Returns:
            Optional[np.ndarray]: 1-D audio samples, or None if both inference
            attempts failed or the model produced no audio (reason is logged)
        """
        try:
            audio = self._chunks_to_array(
                self.model.inference_sft(text, self._SFT_SPEAKER)
            )
        except Exception as api_error:
            # Try alternative API if the first one fails
            # NOTE(review): upstream documents the third argument of
            # inference_zero_shot as prompt audio, not a speaker ID - confirm
            # this fallback can actually succeed.
            try:
                audio = self._chunks_to_array(
                    self.model.inference_zero_shot(
                        text, self._ZERO_SHOT_PROMPT_TEXT, self._SFT_SPEAKER
                    )
                )
            except Exception as alt_error:
                logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
                logger.error(f"CosyVoice2 fallback inference also failed: {str(alt_error)}")
                return None

        if audio is None:
            logger.error("CosyVoice2 model returned None for audio output")
            return None

        logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(audio)})")
        return audio

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using CosyVoice2 TTS engine

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (currently ignored by this engine)
            speed (float): Speech speed multiplier (currently ignored by this engine)

        Returns:
            Optional[str]: Path to the generated audio file or None if generation fails
        """
        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return None

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return None

        try:
            # Generate unique output path
            output_path = self._generate_output_path(prefix="cosyvoice2")

            audio = self._run_inference(text)
            if audio is None:
                return None

            sf.write(output_path, audio, self._cosy_sample_rate())
            logger.info(f"CosyVoice2 audio generation complete: {output_path}")
            return output_path
        except Exception as e:
            # Boundary handler: callers expect None on any failure, not an exception.
            logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
            return None

    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using CosyVoice2 TTS engine

        The whole utterance is synthesized first and yielded as a single
        segment; nothing is yielded on failure.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (currently ignored by this engine)
            speed (float): Speech speed multiplier (currently ignored by this engine)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return

        try:
            audio = self._run_inference(text)
            sample_rate = self._cosy_sample_rate()
        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
            return

        if audio is not None:
            yield sample_rate, audio