File size: 5,577 Bytes
9c8546d
c72d839
9740afc
 
9c8546d
c72d839
 
7eff88c
 
 
 
 
 
 
 
 
 
 
 
 
9c8546d
9740afc
 
 
 
 
 
 
c72d839
7eff88c
 
 
 
 
 
c72d839
9740afc
 
9c8546d
9740afc
 
 
 
 
 
 
 
c72d839
9c8546d
c72d839
9740afc
 
c72d839
9740afc
c72d839
9740afc
7eff88c
 
 
 
 
 
 
 
 
 
 
 
 
9740afc
 
 
 
 
 
 
c72d839
 
 
 
 
 
 
933cc7f
9740afc
 
 
 
 
 
 
 
 
 
 
 
7eff88c
 
 
 
 
 
 
 
 
 
 
 
 
 
9740afc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933cc7f
9740afc
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import logging
import time
import soundfile as sf

logger = logging.getLogger(__name__)

# Kokoro is imported defensively: one known failure mode (an AttributeError
# mentioning EspeakWrapper.set_data_path) is tolerated and recorded in
# KOKORO_AVAILABLE; every other error still aborts the import of this module.
try:
    from kokoro import KPipeline
except AttributeError as import_error:
    # Only the EspeakWrapper.set_data_path failure is swallowed
    if not all(
        marker in str(import_error)
        for marker in ("EspeakWrapper", "set_data_path")
    ):
        # Anything else is unexpected, so let it propagate unchanged
        raise
    logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue")
    KOKORO_AVAILABLE = False
else:
    KOKORO_AVAILABLE = True

class TTSEngine:
    """Text-to-speech engine backed by a Kokoro ``KPipeline``.

    When Kokoro could not be imported (``KOKORO_AVAILABLE`` is False) the
    engine keeps ``self.pipeline`` as ``None`` and produces sine-wave
    placeholder audio instead of real speech.
    """

    # Sample rate (Hz) used for every buffer this class writes or yields.
    SAMPLE_RATE = 24000
    # Directory (relative to the working directory) receiving generated WAVs.
    OUTPUT_DIR = "temp/outputs"

    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                           'j' for Japanese, 'z' for Mandarin Chinese)
        """
        # NOTE(review): the default language is 'z' while the default voice of
        # the generate methods is 'af_heart' -- confirm this pairing is intended.
        logger.info("Initializing TTS Engine")
        if not KOKORO_AVAILABLE:
            logger.warning("Using dummy TTS implementation as Kokoro is not available")
            self.pipeline = None
        else:
            self.pipeline = KPipeline(lang_code=lang_code)
            logger.info("TTS engine initialized with Kokoro")

    @staticmethod
    def _dummy_tone(frequency, duration, sample_rate=24000):
        """Return a sine wave of *frequency* Hz lasting *duration* seconds."""
        # numpy is only needed for placeholder audio, so it is imported lazily
        import numpy as np

        t = np.linspace(0, duration, int(sample_rate * duration), False)
        return np.sin(2 * np.pi * frequency * t) * 0.3

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech from text using Kokoro

        Only the first segment yielded by the pipeline is written; use
        ``generate_speech_stream`` to receive every segment.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier, forwarded to the pipeline
                unvalidated

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: If the pipeline yields no audio segment at all.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        logger.info("Generating speech for text length: %d", len(text))

        try:
            # Create output directory if it doesn't exist
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)

            # Nanosecond resolution: a whole-second timestamp made two calls
            # within the same second overwrite each other's file.
            output_path = f"{self.OUTPUT_DIR}/output_{time.time_ns()}.wav"

            if self.pipeline is None:
                # Kokoro unavailable: write a 3 second 440 Hz placeholder tone
                tone = self._dummy_tone(440, 3.0, self.SAMPLE_RATE)
                logger.info("Saving dummy audio to %s", output_path)
                sf.write(output_path, tone, self.SAMPLE_RATE)
                logger.info("Dummy audio generation complete: %s", output_path)
                return output_path

            # Only the first generated segment is kept (single-segment
            # behaviour inherited from the original implementation).
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                logger.info("Saving audio to %s", output_path)
                sf.write(output_path, audio, self.SAMPLE_RATE)
                break
            else:
                # Nothing was yielded, so no file exists; do not hand back a
                # path that was never written.
                raise RuntimeError("TTS pipeline produced no audio for the given text")

            logger.info("Audio generation complete: %s", output_path)
            return output_path

        except Exception as e:
            logger.error("TTS generation failed: %s", e, exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier, forwarded to the pipeline
                unvalidated

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        try:
            if self.pipeline is None:
                # Kokoro unavailable: three 1 second tones at 440/660/880 Hz
                for i in range(3):
                    yield self.SAMPLE_RATE, self._dummy_tone(
                        440 + (i * 220), 1.0, self.SAMPLE_RATE
                    )
                return

            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                yield self.SAMPLE_RATE, audio

        except Exception as e:
            logger.error("TTS streaming failed: %s", e, exc_info=True)
            raise

# Engines built when Streamlit is not installed, keyed by language code, so
# repeated calls reuse the already-initialized engine.
_ENGINE_CACHE = {}


def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    With Streamlit installed the engine is cached through
    ``st.cache_resource``; otherwise a module-level dictionary is used. In
    both cases there is one engine per ``lang_code``.

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance
    """
    # Keep the try body to the import alone so that an ImportError raised
    # while constructing the engine is not mistaken for "Streamlit missing".
    try:
        import streamlit as st
    except ImportError:
        st = None

    if st is None:
        if lang_code not in _ENGINE_CACHE:
            _ENGINE_CACHE[lang_code] = TTSEngine(lang_code)
        return _ENGINE_CACHE[lang_code]

    # The language code is passed as an argument so it becomes part of the
    # cache key; a zero-argument cached function would return the engine of
    # whichever language was requested first.
    @st.cache_resource
    def _get_engine(code):
        return TTSEngine(code)

    return _get_engine(lang_code)

def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Synthesize *text* with the shared TTS engine.

    Convenience wrapper that obtains an engine from ``get_tts_engine()``
    (default language code) and delegates to its ``generate_speech`` method.

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    return get_tts_engine().generate_speech(text, voice, speed)