"""Text-to-speech helpers built on the Kokoro pipeline.

When Kokoro cannot be imported because of the known
``EspeakWrapper.set_data_path`` problem, the engine falls back to generating
plain sine tones so that callers still receive audio of the expected form.
"""
import logging
import os
import time
import uuid

import soundfile as sf

logger = logging.getLogger(__name__)

# Sample rate (Hz) used for every file written and every chunk yielded here.
SAMPLE_RATE = 24000

# Directory (relative to the current working directory) for generated files.
OUTPUT_DIR = "temp/outputs"

# Wrap the problematic import in a try-except block
try:
    from kokoro import KPipeline
    KOKORO_AVAILABLE = True
except AttributeError as e:
    # Specifically catch the EspeakWrapper.set_data_path error
    if "EspeakWrapper" in str(e) and "set_data_path" in str(e):
        logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue")
        KOKORO_AVAILABLE = False
    else:
        # Re-raise if it's a different error
        raise


def _sine_tone(frequency, duration):
    """Return a sine tone at SAMPLE_RATE, scaled to 0.3 amplitude.

    Args:
        frequency (float): Tone frequency in Hz
        duration (float): Tone length in seconds

    Returns:
        numpy.ndarray: One-dimensional array of samples
    """
    # Imported lazily: only the dummy (Kokoro-unavailable) path needs numpy here.
    import numpy as np

    t = np.linspace(0, duration, int(SAMPLE_RATE * duration), False)
    return np.sin(2 * np.pi * frequency * t) * 0.3


class TTSEngine:
    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
        """
        logger.info("Initializing TTS Engine")

        if not KOKORO_AVAILABLE:
            logger.warning("Using dummy TTS implementation as Kokoro is not available")
            self.pipeline = None
        else:
            self.pipeline = KPipeline(lang_code=lang_code)
            logger.info("TTS engine initialized with Kokoro")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech from text using Kokoro

        Only the first segment produced by the pipeline is written to disk.
        NOTE(review): if the pipeline splits the input into several segments,
        the remaining ones are discarded -- confirm this is still intended.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0); not validated here

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: If the pipeline yields no audio for the given text
            Exception: Any other failure is logged and re-raised unchanged
        """
        logger.info("Generating speech for text length: %s", len(text))

        try:
            # Create output directory if it doesn't exist
            os.makedirs(OUTPUT_DIR, exist_ok=True)

            # A whole-second timestamp alone collides when two requests arrive
            # within the same second, so a random suffix keeps paths unique.
            output_path = (
                f"{OUTPUT_DIR}/output_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
            )

            if not KOKORO_AVAILABLE:
                # Generate a simple sine wave (440 Hz, 3 seconds) as dummy audio
                tone = _sine_tone(440, 3.0)

                logger.info("Saving dummy audio to %s", output_path)
                sf.write(output_path, tone, SAMPLE_RATE)
                logger.info("Dummy audio generation complete: %s", output_path)
                return output_path

            # Get the first generated segment
            # We only take the first segment since the original code handled single segments
            segments = self.pipeline(text, voice=voice, speed=speed)
            first = next(iter(segments), None)
            if first is None:
                # Without this check the caller would receive a path to a file
                # that was never written.
                raise RuntimeError("Kokoro produced no audio for the given text")

            _, _, audio = first
            logger.info("Saving audio to %s", output_path)
            sf.write(output_path, audio, SAMPLE_RATE)

            logger.info("Audio generation complete: %s", output_path)
            return output_path

        except Exception as e:
            logger.error("TTS generation failed: %s", e, exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0); not validated here

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        try:
            if not KOKORO_AVAILABLE:
                # Generate 3 one-second dummy chunks, each at a different
                # frequency (440, 660, 880 Hz)
                for i in range(3):
                    yield SAMPLE_RATE, _sine_tone(440 + (i * 220), 1.0)
                return

            generator = self.pipeline(text, voice=voice, speed=speed)
            for _, _, audio in generator:
                yield SAMPLE_RATE, audio

        except Exception as e:
            logger.error("TTS streaming failed: %s", e, exc_info=True)
            raise


# Initialize TTS engine with cache decorator if using Streamlit
def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    When Streamlit is importable the engine is created through
    ``st.cache_resource``; otherwise a new engine is built on every call.

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance
    """
    # Keep the try body to the import alone so that an ImportError raised
    # while constructing the engine is not mistaken for a missing Streamlit.
    try:
        import streamlit as st
    except ImportError:
        return TTSEngine(lang_code)

    # The language code is passed as an argument so that it is part of the
    # cache key; a closure variable would be ignored and the first engine
    # created would be returned for every language.
    @st.cache_resource
    def _get_engine(code):
        return TTSEngine(code)

    return _get_engine(lang_code)


def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Public interface for TTS generation

    Uses the engine returned by ``get_tts_engine()`` with its default
    language code.

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    engine = get_tts_engine()
    return engine.generate_speech(text, voice, speed)