File size: 7,881 Bytes
7495571
 
 
 
 
aaa0814
7495571
 
 
 
 
 
 
 
 
 
 
0c2d9e7
 
 
 
7495571
 
0c2d9e7
 
 
7495571
0c2d9e7
7495571
 
 
 
 
0c2d9e7
7495571
 
 
 
 
 
0c2d9e7
7495571
 
0c2d9e7
 
 
 
 
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c2d9e7
7495571
 
0c2d9e7
7495571
 
0c2d9e7
7495571
 
 
 
 
0c2d9e7
7495571
 
0c2d9e7
7495571
 
 
 
 
0c2d9e7
7495571
0c2d9e7
7495571
 
0c2d9e7
7495571
 
 
 
0c2d9e7
7495571
 
 
 
0c2d9e7
7495571
 
e22e786
 
0c2d9e7
7495571
 
e22e786
 
0c2d9e7
7495571
 
0c2d9e7
7495571
 
0c2d9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7495571
 
 
 
 
 
e22e786
 
0c2d9e7
7495571
 
e22e786
0c2d9e7
7495571
 
0c2d9e7
7495571
 
 
 
0c2d9e7
7495571
 
 
 
0c2d9e7
7495571
 
e22e786
7495571
0c2d9e7
7495571
 
e22e786
7495571
0c2d9e7
7495571
 
0c2d9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7495571
 
 
 
e22e786
 
0c2d9e7
7495571
 
e22e786
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Fallback sample rate (Hz) used when writing/yielding generated audio.
DEFAULT_SAMPLE_RATE = 24000

# Availability flag: stays False unless every optional dependency below
# imports successfully.
COSYVOICE2_AVAILABLE = False

# Probe the optional CosyVoice dependencies once at import time.
# ModuleNotFoundError is a subclass of ImportError, so a single handler covers
# both "package missing" and "package present but failing to import".
try:
    import torch
    import torchaudio
    # Import CosyVoice from the upstream package
    # Based on https://github.com/FunAudioLLM/CosyVoice
    from cosyvoice.cli.cosyvoice import CosyVoice
except ImportError as e:
    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
else:
    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")


def _get_model(model_dir: str = 'pretrained_models/CosyVoice-300M'):
    """Load and return a CosyVoice model instance.

    Args:
        model_dir (str): Directory of the pretrained checkpoint handed to the
            ``CosyVoice`` constructor. Defaults to the path this module has
            always used, so calling ``_get_model()`` behaves as before.

    Returns:
        The constructed ``CosyVoice`` object, or None when the engine is
        flagged unavailable or construction fails. Failures are logged and
        never raised to the caller.
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None

    try:
        # Imported here so this function does not rely on the module-level
        # name being bound.
        from cosyvoice.cli.cosyvoice import CosyVoice

        model = CosyVoice(model_dir)
    except ImportError as e:
        logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
        return None
    except FileNotFoundError as e:
        logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
        return None
    except Exception as e:
        # Deliberate catch-all: model construction can fail in many ways and
        # callers only expect None. Keep the traceback for diagnosis.
        logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}", exc_info=True)
        return None

    logger.info("CosyVoice2 model successfully loaded")
    return model


class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation

    The model is loaded on first use via ``_get_model()`` and cached on the
    instance. Both public entry points share one inference path
    (``_synthesize_chunks``) so file output and streaming stay consistent.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine

        Args:
            lang_code (str): Language code forwarded to the base class
        """
        super().__init__(lang_code)
        # Populated lazily by _ensure_model().
        self.model = None

    def _ensure_model(self):
        """Ensure the model is loaded

        Returns:
            bool: True if model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()

        return self.model is not None

    def _sample_rate(self) -> int:
        """Return the sample rate to label generated audio with.

        Uses the loaded model's ``sample_rate`` attribute when it has a truthy
        one, otherwise falls back to ``DEFAULT_SAMPLE_RATE``.
        """
        rate = getattr(self.model, 'sample_rate', None)
        return int(rate) if rate else DEFAULT_SAMPLE_RATE

    @staticmethod
    def _to_waveform(item):
        """Convert one model output item into a numpy waveform.

        Accepts a torch tensor, a numpy array (or array-like), or a dict that
        carries the audio under the ``'tts_speech'`` key.

        Returns:
            np.ndarray or None: The waveform, or None if the item holds no audio
        """
        # NOTE(review): upstream CosyVoice examples read the audio from
        # result['tts_speech'] - confirm against the installed version.
        if isinstance(item, dict):
            item = item.get('tts_speech')
        if item is None:
            return None

        import torch

        if isinstance(item, torch.Tensor):
            item = item.detach().cpu().numpy()

        waveform = np.asarray(item)
        # soundfile expects frames on the first axis; drop a leading singleton
        # (channel/batch) axis so a (1, samples) array is written as mono.
        if waveform.ndim == 2 and waveform.shape[0] == 1:
            waveform = waveform[0]
        return waveform

    def _collect_chunks(self, raw_output):
        """Normalize a raw inference result into a list of waveforms.

        An iterator/generator result is consumed item by item; any other
        non-None result is treated as a single item. Items without audio are
        dropped, so an empty list means the model produced nothing.
        """
        if raw_output is None:
            return []
        if hasattr(raw_output, '__next__'):
            items = raw_output
        else:
            items = (raw_output,)
        waveforms = (self._to_waveform(item) for item in items)
        return [w for w in waveforms if w is not None]

    def _synthesize_chunks(self, text: str):
        """Run inference for ``text`` and return the audio as numpy chunks.

        Tries ``inference_sft`` first and ``inference_zero_shot`` second. The
        result is fully consumed inside each attempt so that errors raised
        lazily by a generator still trigger the fallback.

        Returns:
            list or None: List of waveforms (possibly empty when the model
            returned no audio), or None if both attempts raised; the failure
            is logged here.
        """
        try:
            return self._collect_chunks(self.model.inference_sft(text, '中文女'))
        except Exception as api_error:
            logger.warning(f"CosyVoice2 inference_sft failed, trying inference_zero_shot: {str(api_error)}")
            try:
                # NOTE(review): upstream appears to expect prompt audio as the
                # third argument rather than a speaker id - confirm. The call
                # is intentionally left as it was.
                return self._collect_chunks(
                    self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
                )
            except Exception as alt_error:
                logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
                logger.error(f"CosyVoice2 fallback inference failed: {str(alt_error)}")
                return None

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using CosyVoice2 TTS engine

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (currently ignored by this engine)
            speed (float): Speech speed multiplier (currently ignored by this engine)

        Returns:
            Optional[str]: Path to the generated audio file or None if generation fails
        """
        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return None

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return None

        try:
            # Generate unique output path
            output_path = self._generate_output_path(prefix="cosyvoice2")

            chunks = self._synthesize_chunks(text)
            if chunks is None:
                # Both inference attempts failed; already logged.
                return None
            if not chunks:
                logger.error("CosyVoice2 model returned None for audio output")
                return None

            # Multi-segment results are joined into one waveform for the file.
            audio = chunks[0] if len(chunks) == 1 else np.concatenate(chunks)
            logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(audio)})")
            sf.write(output_path, audio, self._sample_rate())
            logger.info(f"CosyVoice2 audio generation complete: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
            return None

    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using CosyVoice2 TTS engine

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (currently ignored by this engine)
            speed (float): Speech speed multiplier (currently ignored by this engine)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return

        try:
            chunks = self._synthesize_chunks(text)
            sample_rate = self._sample_rate()
        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
            return

        if chunks is None:
            # Both inference attempts failed; already logged.
            return
        if not chunks:
            logger.error("CosyVoice2 model returned None for audio output")
            return

        for chunk in chunks:
            logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(chunk)})")
            yield sample_rate, chunk