Spaces:
Running
Running
import os | |
import time | |
import logging | |
import numpy as np | |
import soundfile as sf | |
from typing import Optional, Tuple, Generator | |
# Sample rate (Hz) reported alongside every audio segment this module streams
DEFAULT_SAMPLE_RATE = 24_000

# Most recently created Kokoro pipeline; stays None until one is first requested
_pipeline = None

# Emit INFO-level records and expose a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Pipelines that have already been built, keyed by the language code they were
# constructed with. Keeping one entry per language prevents a request for one
# language from being served by a pipeline initialised for another.
_pipelines_by_lang = {}


def _get_pipeline(lang_code: str = 'z'):
    """Return the Kokoro pipeline for ``lang_code``, building it on first use.

    A pipeline is constructed at most once per language code and then reused.
    Previously a single pipeline was cached regardless of ``lang_code``, so
    every call after the first received the pipeline of whichever language
    happened to be requested first.

    Args:
        lang_code (str): Language code passed to ``KPipeline``.

    Returns:
        The ``KPipeline`` instance built for ``lang_code``.

    Raises:
        ImportError: If the ``kokoro`` package cannot be imported.
        Exception: Any error raised while constructing the pipeline is logged
            and re-raised unchanged.
    """
    global _pipeline

    cached = _pipelines_by_lang.get(lang_code)
    if cached is not None:
        return cached

    logger.info("Loading Kokoro pipeline...")
    try:
        # Deferred import: Kokoro is only loaded once a pipeline is requested
        from kokoro import KPipeline

        logger.info(f"Initializing Kokoro pipeline with language code: {lang_code}")
        pipeline = KPipeline(lang_code=lang_code)

        logger.info("Kokoro pipeline loaded successfully")
        logger.info(f"Pipeline type: {type(pipeline).__name__}")
    except ImportError as import_err:
        logger.error(f"Import error loading Kokoro pipeline: {import_err}")
        logger.error("This may indicate missing dependencies")
        raise
    except Exception as e:
        logger.error(f"Error loading Kokoro pipeline: {e}", exc_info=True)
        logger.error(f"Error type: {type(e).__name__}")
        raise

    # Only cache after a fully successful construction, so a failed load is
    # retried on the next call instead of being remembered.
    _pipelines_by_lang[lang_code] = pipeline
    # Keep the legacy module-level global pointing at the most recently built
    # pipeline for any code that still reads it directly.
    _pipeline = pipeline
    return pipeline
def generate_speech(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> str:
    """Synthesize ``text`` with Kokoro and return the resulting audio file path.

    Legacy entry point kept for backward compatibility; it simply delegates
    to ``KokoroTTSEngine``. New code should use the factory-based engines
    directly.

    Args:
        text (str): Text to synthesize.
        language (str): Language code ('a' for US English, 'b' for British
            English, 'j' for Japanese, 'z' for Mandarin Chinese).
        voice (str): Voice ID, e.g. 'af_heart' or 'af_bella'.
        speed (float): Speech speed multiplier (documented range 0.5 to 2.0;
            not validated here).

    Returns:
        str: Path to the generated audio file. If the Kokoro engine raises,
        the error is logged and the path produced by ``DummyTTSEngine`` is
        returned instead.
    """
    logger.info(f"Legacy Kokoro generate_speech called with text length: {len(text)}")

    # Delegate to the factory-pattern engine implementation
    from utils.tts_engines import KokoroTTSEngine

    try:
        return KokoroTTSEngine(language).generate_speech(text, voice, speed)
    except Exception as e:
        logger.error(f"Error in legacy Kokoro generate_speech: {str(e)}", exc_info=True)

        # Best-effort fallback so callers still receive an audio path
        from utils.tts_base import DummyTTSEngine

        return DummyTTSEngine().generate_speech(text)
def generate_speech_stream(text: str, language: str = "z", voice: str = "af_heart", speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
    """Stream synthesized speech for ``text`` segment by segment.

    Args:
        text (str): Text to synthesize.
        language (str): Language code used to obtain the Kokoro pipeline.
        voice (str): Voice ID forwarded to the pipeline.
        speed (float): Speech speed multiplier forwarded to the pipeline.

    Yields:
        tuple: ``(DEFAULT_SAMPLE_RATE, audio)`` for every segment the pipeline
        produces. If obtaining or running the pipeline raises, the error is
        logged and the segments of ``DummyTTSEngine.generate_speech_stream``
        are yielded instead.
    """
    logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")

    try:
        segments = _get_pipeline(language)(text, voice=voice, speed=speed)
        # Each pipeline result unpacks into three items; only the third
        # (the audio) is forwarded to the caller.
        for _first, _second, samples in segments:
            yield DEFAULT_SAMPLE_RATE, samples
    except Exception as e:
        logger.error(f"Error in Kokoro generate_speech_stream: {str(e)}", exc_info=True)

        # Best-effort fallback to the dummy engine's stream
        from utils.tts_base import DummyTTSEngine

        yield from DummyTTSEngine().generate_speech_stream(text)