File size: 4,815 Bytes
60bd17d
 
 
7495571
 
aaa0814
60bd17d
 
 
 
7495571
 
60bd17d
7495571
 
 
 
 
 
 
 
 
 
60bd17d
 
 
7495571
60bd17d
 
7495571
60bd17d
 
7495571
60bd17d
7495571
 
 
60bd17d
 
7495571
 
 
60bd17d
7495571
 
60bd17d
 
7495571
 
60bd17d
7495571
60bd17d
 
7495571
 
60bd17d
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e22e786
 
7495571
 
 
e22e786
 
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e22e786
7495571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e22e786
7495571
 
 
 
e22e786
7495571
 
 
 
 
 
 
 
 
e22e786
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Module-level logger shared by everything in this file
logger = logging.getLogger(__name__)

# Kokoro is an optional dependency: assume it is missing until the import
# below succeeds, so the rest of the module can check this flag.
KOKORO_AVAILABLE = False

try:
    from kokoro import KPipeline
except ImportError:
    # Package not installed; the flag stays False
    logger.warning("Kokoro TTS engine is not available")
except Exception as import_failure:
    # Package is present but failed while importing
    KOKORO_AVAILABLE = False
    logger.error(f"Kokoro import failed with unexpected error: {str(import_failure)}")
else:
    KOKORO_AVAILABLE = True
    logger.info("Kokoro TTS engine is available")


def _get_pipeline(lang_code: str = 'z'):
    """Build a Kokoro pipeline for the requested language
    
    A new ``KPipeline`` is constructed on every call; callers that want to
    reuse it are expected to keep the returned object themselves.
    
    Args:
        lang_code (str): Language code forwarded to ``KPipeline``
        
    Returns:
        KPipeline or None: The constructed pipeline, or None when Kokoro is
        not available or construction fails (the failure is logged)
    """
    kokoro_pipeline = None
    
    if KOKORO_AVAILABLE:
        try:
            kokoro_pipeline = KPipeline(lang_code=lang_code)
            logger.info("Kokoro pipeline successfully loaded")
        except Exception as init_error:
            logger.error(f"Failed to initialize Kokoro pipeline: {str(init_error)}")
            # Discard a possibly half-initialized result
            kokoro_pipeline = None
    else:
        logger.warning("Kokoro TTS engine is not available")
    
    return kokoro_pipeline


class KokoroTTS(TTSBase):
    """Kokoro TTS engine implementation
    
    This engine uses the Kokoro library for TTS generation. The pipeline is
    created on first use and then cached on the instance.
    """
    
    # Sample rate (Hz) used when writing files and reported to stream consumers
    SAMPLE_RATE = 24000
    
    def __init__(self, lang_code: str = 'z'):
        """Initialize the Kokoro TTS engine
        
        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        # Created lazily by _ensure_pipeline() so construction stays cheap
        self.pipeline = None
    
    def _ensure_pipeline(self):
        """Ensure the pipeline is loaded
        
        Reads ``self.lang_code`` (presumably set by ``TTSBase.__init__`` --
        confirm against the base class) to build the pipeline on first use.
        
        Returns:
            bool: True if pipeline is available, False otherwise
        """
        if self.pipeline is None:
            self.pipeline = _get_pipeline(self.lang_code)
        
        return self.pipeline is not None
    
    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Generate speech using Kokoro TTS engine
        
        The pipeline may split the input into several segments; all of them
        are concatenated in order and written to a single audio file.
        
        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (not validated here)
            
        Returns:
            Optional[str]: Path to the generated audio file, or None if Kokoro
            is unavailable, no audio was produced, or generation fails
        """
        logger.info(f"Generating speech with Kokoro for text length: {len(text)}")
        
        # Check if Kokoro is available
        if not KOKORO_AVAILABLE:
            logger.error("Kokoro TTS engine is not available")
            return None
        
        # Ensure pipeline is loaded
        if not self._ensure_pipeline():
            logger.error("Failed to load Kokoro pipeline")
            return None
        
        try:
            # Generate unique output path
            output_path = self._generate_output_path(prefix="kokoro")
            
            # Collect every segment; stopping after the first one would
            # truncate any text the pipeline splits into multiple chunks.
            segments = [
                np.atleast_1d(np.asarray(audio))
                for _, _, audio in self.pipeline(text, voice=voice, speed=speed)
                if audio is not None
            ]
            
            # Never return a path to a file that was not written
            if not segments:
                logger.error("Kokoro produced no audio for the given text")
                return None
            
            logger.info(f"Saving Kokoro audio to {output_path}")
            sf.write(output_path, np.concatenate(segments), self.SAMPLE_RATE)
            
            logger.info(f"Kokoro audio generation complete: {output_path}")
            return output_path
        except Exception as e:
            # Best-effort API: report the failure and signal it with None
            logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
            return None
    
    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using Kokoro TTS engine
        
        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier
            
        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment; the
            stream simply ends early if Kokoro is unavailable or an error
            occurs (the error is logged)
        """
        logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")
        
        # Check if Kokoro is available
        if not KOKORO_AVAILABLE:
            logger.error("Kokoro TTS engine is not available")
            return
        
        # Ensure pipeline is loaded
        if not self._ensure_pipeline():
            logger.error("Failed to load Kokoro pipeline")
            return
        
        try:
            # Generate speech stream, forwarding each segment as it is produced
            generator = self.pipeline(text, voice=voice, speed=speed)
            for _, _, audio in generator:
                yield self.SAMPLE_RATE, audio
        except Exception as e:
            logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
            return