Spaces:
Sleeping
Sleeping
import os
import logging
import time

import soundfile as sf

logger = logging.getLogger(__name__)

# Kokoro is an optional dependency: when it cannot be imported, the engine
# below degrades to a sine-wave dummy implementation instead of crashing.
try:
    from kokoro import KPipeline
    KOKORO_AVAILABLE = True
except ImportError:
    # Package not installed at all. Previously only AttributeError was
    # handled, so this common case crashed despite the dummy fallback.
    logger.warning("Kokoro package not available; using dummy TTS")
    KOKORO_AVAILABLE = False
except AttributeError as e:
    # Specifically catch the EspeakWrapper.set_data_path error raised by
    # incompatible espeak wrapper versions during kokoro's import.
    if "EspeakWrapper" in str(e) and "set_data_path" in str(e):
        logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue")
        KOKORO_AVAILABLE = False
    else:
        # Re-raise if it's a different error
        raise
class TTSEngine:
    """Text-to-speech engine backed by Kokoro.

    When Kokoro is unavailable (see KOKORO_AVAILABLE), the engine produces
    sine-wave placeholder audio so the rest of the application keeps working.
    """

    # Output sample rate in Hz used for both Kokoro and dummy audio.
    # NOTE(review): 24000 matches the hard-coded rate in the original code;
    # presumably the Kokoro pipeline emits 24 kHz — confirm against its docs.
    SAMPLE_RATE = 24000

    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
        """
        logger.info("Initializing TTS Engine")
        if not KOKORO_AVAILABLE:
            logger.warning("Using dummy TTS implementation as Kokoro is not available")
            self.pipeline = None
        else:
            self.pipeline = KPipeline(lang_code=lang_code)
            logger.info("TTS engine initialized with Kokoro")

    def _dummy_tone(self, duration: float, frequency: float = 440.0):
        """Return `duration` seconds of a quiet sine tone (fallback audio).

        Args:
            duration (float): Length of the tone in seconds.
            frequency (float): Tone frequency in Hz.

        Returns:
            numpy.ndarray: 1-D float array at SAMPLE_RATE, amplitude 0.3.
        """
        import numpy as np
        t = np.linspace(0, duration, int(self.SAMPLE_RATE * duration), False)
        return np.sin(2 * np.pi * frequency * t) * 0.3

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech from text using Kokoro

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: If the Kokoro pipeline yields no audio segments.
        """
        logger.info(f"Generating speech for text length: {len(text)}")
        try:
            # Create output directory if it doesn't exist
            os.makedirs("temp/outputs", exist_ok=True)
            # Millisecond resolution: second-resolution timestamps caused two
            # requests within the same second to overwrite each other's file.
            output_path = f"temp/outputs/output_{int(time.time() * 1000)}.wav"

            if not KOKORO_AVAILABLE:
                logger.info(f"Saving dummy audio to {output_path}")
                sf.write(output_path, self._dummy_tone(3.0), self.SAMPLE_RATE)
                logger.info(f"Dummy audio generation complete: {output_path}")
                return output_path

            # Only the first generated segment is kept, matching the original
            # single-segment behaviour.
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                logger.info(f"Saving audio to {output_path}")
                sf.write(output_path, audio, self.SAMPLE_RATE)
                break
            else:
                # Without this guard, an empty generator would silently return
                # a path to a file that was never written.
                raise RuntimeError("Kokoro pipeline produced no audio segments")

            logger.info(f"Audio generation complete: {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        try:
            if not KOKORO_AVAILABLE:
                # Three one-second placeholder chunks at rising pitches
                # (440, 660, 880 Hz) so the stream is audibly distinct.
                for i in range(3):
                    yield self.SAMPLE_RATE, self._dummy_tone(1.0, 440 + i * 220)
                return
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                yield self.SAMPLE_RATE, audio
        except Exception as e:
            logger.error(f"TTS streaming failed: {str(e)}", exc_info=True)
            raise
# Engines cached per lang_code: building a Kokoro pipeline is expensive, and
# the previous streamlit-only wrapper never actually applied a cache
# decorator, so every call constructed a brand-new engine.
_ENGINE_CACHE = {}


def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance, reused across calls
            for the same lang_code.
    """
    if lang_code not in _ENGINE_CACHE:
        _ENGINE_CACHE[lang_code] = TTSEngine(lang_code)
    return _ENGINE_CACHE[lang_code]
def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Public interface for TTS generation

    Thin module-level wrapper that delegates to the shared engine.

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    return get_tts_engine().generate_speech(text, voice, speed)