teachingAssistant / utils /tts_kokoro.py
Michael Hu
fix path
b2b15db
raw
history blame
5.52 kB
import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple
from utils.tts import TTSBase, DummyTTS
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

# Kokoro is an optional dependency. Probe for it once at import time and
# record the outcome in KOKORO_AVAILABLE so the rest of the module can
# degrade gracefully instead of failing on import.
try:
    from kokoro import KPipeline
    KOKORO_AVAILABLE = True
    logger.info("Kokoro TTS engine is available")
except ImportError:
    # The package is simply not installed.
    KOKORO_AVAILABLE = False
    logger.warning("Kokoro TTS engine is not available")
except Exception as e:
    # The package exists but blew up while importing; treat it as
    # unavailable as well.
    KOKORO_AVAILABLE = False
    logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
def _get_pipeline(lang_code: str = 'z'):
    """Create a Kokoro pipeline for ``lang_code``.

    Args:
        lang_code (str): Language code forwarded to ``KPipeline``.

    Returns:
        KPipeline or None: A new pipeline, or None when Kokoro is not
        available or the pipeline could not be constructed.
    """
    loaded = None
    if KOKORO_AVAILABLE:
        try:
            loaded = KPipeline(lang_code=lang_code)
            logger.info("Kokoro pipeline successfully loaded")
        except Exception as exc:
            # Construction problems are logged, never propagated; the
            # caller only ever sees None.
            logger.error(f"Failed to initialize Kokoro pipeline: {str(exc)}")
            loaded = None
    else:
        logger.warning("Kokoro TTS engine is not available")
    return loaded
class KokoroTTS(TTSBase):
    """Kokoro TTS engine implementation.

    Wraps a lazily created Kokoro ``KPipeline``. Whenever Kokoro cannot be
    imported, the pipeline cannot be built, or synthesis fails, the request
    is delegated to ``DummyTTS`` instead of raising.
    """

    # Sample rate (Hz) used when writing files and reported alongside every
    # streamed segment.
    _SAMPLE_RATE = 24000

    def __init__(self, lang_code: str = 'z'):
        """Initialize the Kokoro TTS engine.

        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        # Created on first use by _ensure_pipeline().
        self.pipeline = None

    def _ensure_pipeline(self):
        """Ensure the pipeline is loaded.

        Returns:
            bool: True if pipeline is available, False otherwise
        """
        if self.pipeline is None:
            self.pipeline = _get_pipeline(self.lang_code)
        return self.pipeline is not None

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Generate speech using Kokoro TTS engine.

        Every segment produced by the pipeline is collected and written to
        a single audio file, so the output covers the whole input text
        rather than only its first segment.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Returns:
            Optional[str]: Path to the generated audio file, or whatever the
            DummyTTS fallback returns when Kokoro cannot produce audio
        """
        logger.info(f"Generating speech with Kokoro for text length: {len(text)}")

        if not KOKORO_AVAILABLE:
            logger.warning("Kokoro TTS engine is not available, falling back to dummy TTS")
        elif not self._ensure_pipeline():
            logger.warning("Failed to load Kokoro pipeline, falling back to dummy TTS")
        else:
            try:
                output_path = self._generate_output_path(prefix="kokoro")

                # The pipeline may yield several segments for one input;
                # keep all of them. Segments without audio are skipped.
                segments = [
                    np.atleast_1d(np.asarray(audio))
                    for _, _, audio in self.pipeline(text, voice=voice, speed=speed)
                    if audio is not None
                ]

                if segments:
                    logger.info(f"Saving Kokoro audio to {output_path}")
                    sf.write(output_path, np.concatenate(segments), self._SAMPLE_RATE)
                    logger.info(f"Kokoro audio generation complete: {output_path}")
                    return output_path

                # Nothing was synthesized, so no file exists at output_path;
                # returning it would hand the caller a dangling path.
                logger.warning("Kokoro produced no audio, falling back to dummy TTS")
            except Exception as e:
                logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
                logger.warning("Kokoro TTS engine failed, falling back to dummy TTS")

        # Single fallback exit shared by every failure path above.
        return DummyTTS(self.lang_code).generate_speech(text, voice, speed)

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using Kokoro TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with Kokoro for text length: {len(text)}")

        if not KOKORO_AVAILABLE:
            logger.warning("Kokoro TTS engine is not available, falling back to dummy TTS")
        elif not self._ensure_pipeline():
            logger.warning("Failed to load Kokoro pipeline, falling back to dummy TTS")
        else:
            try:
                for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                    yield self._SAMPLE_RATE, audio
                return
            except Exception as e:
                # NOTE(review): if the failure happens after some segments
                # were already yielded, the dummy stream for the full text
                # is still appended (unchanged legacy behaviour).
                logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
                logger.warning("Kokoro TTS engine failed, falling back to dummy TTS")

        # Single fallback exit shared by every failure path above.
        yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)