File size: 16,483 Bytes
3ed3b5a
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
 
 
a316f58
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a316f58
 
 
 
 
 
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
 
3ed3b5a
 
 
58d9769
 
3ed3b5a
 
 
 
58d9769
 
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
cb90410
 
 
 
58d9769
 
cb90410
3ed3b5a
 
 
 
 
 
 
a316f58
 
58d9769
 
a316f58
cb90410
 
58d9769
 
a316f58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
a316f58
 
 
 
 
 
 
 
 
58d9769
a316f58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
 
3ed3b5a
 
 
58d9769
 
3ed3b5a
 
 
 
58d9769
 
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb90410
 
 
 
 
 
 
 
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb90410
 
 
 
 
 
 
 
3ed3b5a
 
 
 
cb90410
3ed3b5a
 
 
 
 
cb90410
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6034fea
 
 
3ed3b5a
 
 
 
 
 
 
 
 
 
6034fea
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6034fea
 
 
 
 
3ed3b5a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
import logging
import time
import os
import numpy as np
import soundfile as sf
from typing import Dict, List, Optional, Tuple, Generator, Any, Union

from utils.tts_base import TTSEngineBase, DummyTTSEngine

# Configure logging
logger = logging.getLogger(__name__)

# Flags to track TTS engine availability.
# The local engines start out unavailable and are switched on only when their
# imports below succeed; the hosted "Space" engines are hard-coded to True.
KOKORO_AVAILABLE = False
KOKORO_SPACE_AVAILABLE = True
DIA_AVAILABLE = False
DIA_SPACE_AVAILABLE = True

# Try to import Kokoro
try:
    from kokoro import KPipeline
    KOKORO_AVAILABLE = True
    logger.info("Kokoro TTS engine is available")
except AttributeError as e:
    # Specifically catch the EspeakWrapper.set_data_path error
    if "EspeakWrapper" in str(e) and "set_data_path" in str(e):
        logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue, falling back to Kokoro FastAPI server")
    else:
        # Re-raise if it's a different error
        logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
        raise
except ImportError:
    logger.warning("Kokoro TTS engine is not available")

# Try to import Dia dependencies to check availability
try:
    import torch
    from dia.model import Dia
    DIA_AVAILABLE = True
    logger.info("Dia TTS engine is available")
except ModuleNotFoundError as e:
    # ModuleNotFoundError is a subclass of ImportError, so this handler has to
    # come first; listed after `except ImportError` it would never run.
    # DIA_AVAILABLE is still False here because it is only set after both
    # imports succeed.
    if "dac" in str(e):
        logger.warning("Dia TTS engine is not available due to missing 'dac' module")
    else:
        logger.warning(f"Dia TTS engine is not available: {str(e)}")
except ImportError:
    # Any other import failure (e.g. a name missing from an installed module)
    logger.warning("Dia TTS engine is not available")


class KokoroTTSEngine(TTSEngineBase):
    """Kokoro TTS engine implementation

    This engine uses the Kokoro library (``KPipeline``) for TTS generation.
    """

    def __init__(self, lang_code: str = 'z'):
        """Create the Kokoro pipeline for the given language.

        Args:
            lang_code (str): Language code forwarded to the base class and to ``KPipeline``

        Raises:
            Exception: Whatever ``KPipeline`` raises is logged and re-raised unchanged
        """
        super().__init__(lang_code)
        try:
            self.pipeline = KPipeline(lang_code=lang_code)
            logger.info("Kokoro TTS engine successfully initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Kokoro pipeline: {str(e)}")
            logger.error(f"Error type: {type(e).__name__}")
            raise

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Generate speech using Kokoro TTS engine

        All audio segments produced by the pipeline are joined and written to a
        single file, so longer inputs are not cut off after the first segment.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Returns:
            Optional[str]: Path to the generated audio file, or None if the
            pipeline produced no audio
        """
        logger.info(f"Generating speech with Kokoro for text length: {len(text)}")

        # Generate unique output path
        output_path = self._generate_output_path()

        # Collect every segment the pipeline yields (skipping empty results)
        # instead of stopping at the first one.
        segments = [
            np.asarray(audio)
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed)
            if audio is not None
        ]

        if not segments:
            # Nothing was synthesized, so there is no file to point the caller at
            logger.warning("Kokoro produced no audio; no file was written")
            return None

        logger.info(f"Saving Kokoro audio to {output_path}")
        sf.write(output_path, np.concatenate(segments), 24000)

        logger.info(f"Kokoro audio generation complete: {output_path}")
        return output_path

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using Kokoro TTS engine

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment; the sample
            rate is always 24000
        """
        logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")

        # Hand each segment to the caller as soon as the pipeline produces it
        generator = self.pipeline(text, voice=voice, speed=speed)
        for _, _, audio in generator:
            yield 24000, audio


class KokoroSpaceTTSEngine(TTSEngineBase):
    """TTS engine backed by the hosted Kokoro Space.

    Speech is generated remotely through a ``gradio_client`` connection to the
    "Remsky/Kokoro-TTS-Zero" Space instead of a local Kokoro pipeline.
    """

    def __init__(self, lang_code: str = 'z'):
        """Connect to the Kokoro Space.

        Args:
            lang_code (str): Language code forwarded to the base class

        Raises:
            Exception: Any import or connection error is logged and re-raised
        """
        super().__init__(lang_code)
        try:
            # Imported lazily so gradio_client is only needed when this engine is used
            from gradio_client import Client
            self.client = Client("Remsky/Kokoro-TTS-Zero")
        except Exception as init_error:
            logger.error(f"Failed to initialize Kokoro Space client: {str(init_error)}")
            logger.error(f"Error type: {type(init_error).__name__}")
            raise
        logger.info("Kokoro Space TTS engine successfully initialized")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Synthesize ``text`` remotely and return the resulting audio path.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use; the default 'af_heart' is replaced by 'af_nova'
            speed (float): Speech speed multiplier

        Returns:
            Optional[str]: The path returned by the Space when it is a string
            naming an existing file; None for any other result or on error
        """
        logger.info(f"Generating speech with Kokoro Space for text length: {len(text)}")
        # Log at most the first 50 characters of the text
        if len(text) > 50:
            logger.info(f"Text to generate speech on is: {text[:50]}...")
        else:
            logger.info(f"Text to generate speech on is: {text}")

        # The generated path is not used below (the Space's own file is
        # returned); the call is kept as in the original flow.
        self._generate_output_path()

        try:
            # Swap the default voice for 'af_nova' on this backend
            remote_voice = voice if voice != 'af_heart' else 'af_nova'

            response = self.client.predict(
                text=text,
                voice_names=remote_voice,
                speed=speed,
                api_name="/generate_speech_from_ui"
            )
            logger.info(f"Received audio from Kokoro FastAPI server: {response}")

            # Only a string naming an existing file counts as success
            if not (isinstance(response, str) and os.path.exists(response)):
                logger.warning("Unexpected result from Kokoro Space")
                return None
            return response

        except Exception as request_error:
            logger.error(f"Failed to generate speech from Kokoro FastAPI server: {str(request_error)}")
            logger.error(f"Error type: {type(request_error).__name__}")
            logger.info("Kokoro Space TTS engine failed")
            return None


class DiaTTSEngine(TTSEngineBase):
    """TTS engine that delegates to the local Dia helpers in ``utils.tts_dia``.

    Nothing is loaded at construction time; the Dia helper module is imported
    on the first call to ``generate_speech``.
    """

    def __init__(self, lang_code: str = 'z'):
        """Record the language code; no Dia resources are created here.

        Args:
            lang_code (str): Language code forwarded to the base class
        """
        super().__init__(lang_code)
        logger.info("Dia TTS engine initialized (lazy loading)")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Synthesize ``text`` with Dia and return the audio file path.

        Args:
            text (str): Input text to synthesize
            voice (str): Accepted for interface compatibility; not passed to Dia
            speed (float): Accepted for interface compatibility; not passed to Dia

        Returns:
            Optional[str]: Path reported by the Dia helper, or None when Dia is
            unavailable or generation fails

        Raises:
            ModuleNotFoundError: When a module other than 'dac' is missing
        """
        logger.info(f"Generating speech with Dia for text length: {len(text)}")

        try:
            # Deferred import to avoid circular imports
            from utils.tts_dia import DIA_AVAILABLE as dia_ready
            from utils.tts_dia import generate_speech as synthesize_with_dia

            if not dia_ready:
                logger.warning("Dia TTS engine is not available")
                return None

            logger.info("Successfully imported Dia speech generation function")

            # The Dia helper is driven by the language code, not by voice/speed
            audio_path = synthesize_with_dia(text, language=self.lang_code)
            logger.info(f"Generated audio with Dia: {audio_path}")
            return audio_path
        except ModuleNotFoundError as missing_module:
            # Only the known missing-'dac' case is treated as a soft failure
            if "dac" not in str(missing_module):
                raise
            logger.warning("Dia TTS engine failed due to missing 'dac' module")
            return None
        except Exception as generation_error:
            logger.error(f"Error generating speech with Dia: {str(generation_error)}", exc_info=True)
            logger.warning("Dia TTS engine failed")
            return None


class DiaSpaceTTSEngine(TTSEngineBase):
    """Dia Space TTS engine implementation

    This engine uses the Dia TTS Server API (via ``utils.tts_dia_space``) for
    speech generation.
    """

    def __init__(self, lang_code: str = 'z'):
        """Create the Dia Space client.

        Args:
            lang_code (str): Language code forwarded to the base class

        Raises:
            Exception: Any import or client-creation error is logged and re-raised
        """
        super().__init__(lang_code)
        try:
            # Import here to avoid circular imports
            from utils.tts_dia_space import _get_client
            self.client = _get_client()
            logger.info("Dia Space TTS engine successfully initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Dia Space client: {str(e)}")
            logger.error(f"Error type: {type(e).__name__}")
            raise

    def generate_speech(self, text: str, voice: str = 'S1', speed: float = 1.0, response_format: str = 'wav') -> Optional[str]:
        """Generate speech using Dia Space TTS engine

        Args:
            text (str): Input text to synthesize
            voice (str): Voice mode to use ('S1', 'S2', 'dialogue', or filename for clone)
            speed (float): Speech speed multiplier
            response_format (str): Audio format ('wav', 'mp3', 'opus')

        Returns:
            Optional[str]: Path to the generated audio file or None if generation fails
        """
        logger.info(f"Generating speech with Dia Space for text length: {len(text)}")

        try:
            # Import here to avoid circular imports
            from utils.tts_dia_space import _call_dia_api, _generate_output_path

            # Call the Dia Space API
            audio_data = _call_dia_api(text, voice, response_format, speed)

            # Save the returned audio data to a file
            output_path = _generate_output_path(prefix="dia_space", extension=response_format)
            with open(output_path, 'wb') as f:
                f.write(audio_data)

            logger.info(f"Generated audio with Dia Space: {output_path}")
            return output_path

        except ImportError as import_err:
            # Must precede the generic handler: ImportError is an Exception
            # subclass and would otherwise never reach this branch.
            logger.error(f"Dia TTS generation failed due to import error: {str(import_err)}")
            logger.error("Dia Space TTS engine failed")
            return None

        except Exception as e:
            # Best-effort engine: log with traceback and report failure as None
            logger.error(f"Failed to generate speech from Dia Space API: {str(e)}", exc_info=True)
            logger.error(f"Error type: {type(e).__name__}")
            logger.info("Dia Space TTS engine failed")
            return None

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using Dia TTS engine

        NOTE(review): this method drives the local Dia model from
        ``utils.tts_dia`` rather than the Dia Space API used by
        ``generate_speech`` - confirm that this is intended.

        Whenever Dia is unavailable, returns no audio, or raises, the stream
        falls back to ``DummyTTSEngine.generate_speech_stream``.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (only forwarded to the dummy fallback)
            speed (float): Speech speed multiplier (only forwarded to the dummy fallback)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with Dia for text length: {len(text)}")

        try:
            # Import required modules
            from utils.tts_dia import _get_model, DEFAULT_SAMPLE_RATE, DIA_AVAILABLE

            # Check if Dia is available
            if not DIA_AVAILABLE:
                logger.warning("Dia TTS engine is not available, falling back to dummy audio stream")
                yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)
                return

            import torch

            # Get the Dia model
            model = _get_model()

            # Generate the whole utterance in one call
            with torch.inference_mode():
                output_audio_np = model.generate(
                    text,
                    max_tokens=None,
                    cfg_scale=3.0,
                    temperature=1.3,
                    top_p=0.95,
                    cfg_filter_top_k=35,
                    use_torch_compile=False,
                    verbose=False
                )

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
                yield DEFAULT_SAMPLE_RATE, output_audio_np
            else:
                logger.warning("Dia model returned None for audio output")
                logger.warning("Falling back to dummy audio stream")
                yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)

        except ModuleNotFoundError as e:
            # Listed before ImportError because it is the more specific subclass
            if "dac" in str(e):
                logger.warning("Dia TTS streaming failed due to missing 'dac' module, falling back to dummy audio stream")
            else:
                logger.error(f"Module not found error in Dia TTS streaming: {str(e)}")
            yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)

        except ImportError as import_err:
            logger.error(f"Dia TTS streaming failed due to import error: {str(import_err)}")
            logger.error("Falling back to dummy audio stream")
            yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)

        except Exception as dia_error:
            logger.error(f"Dia TTS streaming failed: {str(dia_error)}", exc_info=True)
            logger.error(f"Error type: {type(dia_error).__name__}")
            logger.error("Falling back to dummy audio stream")
            yield from DummyTTSEngine(self.lang_code).generate_speech_stream(text, voice, speed)


def get_available_engines() -> List[str]:
    """List the names of the TTS engines that can currently be created.

    The module-level availability flags are read at call time. The order is
    always kokoro, kokoro_space, dia, dia_space, dummy, with unavailable
    engines left out.

    Returns:
        List[str]: Available engine names; 'dummy' is always included last
    """
    candidates = (
        ('kokoro', KOKORO_AVAILABLE),
        ('kokoro_space', KOKORO_SPACE_AVAILABLE),
        ('dia', DIA_AVAILABLE),
        ('dia_space', DIA_SPACE_AVAILABLE),
        # Dummy is always available
        ('dummy', True),
    )
    return [engine_name for engine_name, usable in candidates if usable]


def create_engine(engine_type: str, lang_code: str = 'z') -> TTSEngineBase:
    """Instantiate the TTS engine registered under ``engine_type``.

    Args:
        engine_type (str): One of 'kokoro', 'kokoro_space', 'dia', 'dia_space', 'dummy'
        lang_code (str): Language code passed to the engine constructor

    Returns:
        TTSEngineBase: A new instance of the requested engine

    Raises:
        ValueError: If the engine type is unknown, or is known but flagged as
            unavailable
    """
    # (name, availability flag, engine class, error message when unavailable)
    registry = (
        ('kokoro', KOKORO_AVAILABLE, KokoroTTSEngine,
         "Kokoro TTS engine is not available"),
        ('kokoro_space', KOKORO_SPACE_AVAILABLE, KokoroSpaceTTSEngine,
         "Kokoro Space TTS engine is not available"),
        ('dia', DIA_AVAILABLE, DiaTTSEngine,
         "Dia TTS engine is not available"),
        ('dia_space', DIA_SPACE_AVAILABLE, DiaSpaceTTSEngine,
         "Dia Space TTS engine is not available"),
        # The dummy engine has no availability flag; it can always be built
        ('dummy', True, DummyTTSEngine, None),
    )

    for name, usable, engine_cls, unavailable_message in registry:
        if engine_type == name:
            if not usable:
                raise ValueError(unavailable_message)
            return engine_cls(lang_code)

    raise ValueError(f"Unsupported TTS engine type: {engine_type}")