# Michael Hu
# update tts
# 9740afc
# raw
# history blame
# 3.72 kB
import logging
import os
import time
import uuid

import soundfile as sf
from kokoro import KPipeline
logger = logging.getLogger(__name__)
class TTSEngine:
    """Text-to-speech engine backed by a Kokoro ``KPipeline``.

    Attributes:
        SAMPLE_RATE: Sample rate in Hz used when writing WAV files and
            reported alongside every streamed segment.
        OUTPUT_DIR: Directory (relative to the working directory) that
            receives generated WAV files.
        pipeline: The ``KPipeline`` instance created in ``__init__``.
    """

    SAMPLE_RATE = 24000
    OUTPUT_DIR = "temp/outputs"

    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
        """
        logger.info("Initializing TTS Engine")
        self.pipeline = KPipeline(lang_code=lang_code)
        logger.info("TTS engine initialized")

    def _new_output_path(self) -> str:
        """Return a fresh WAV path under ``OUTPUT_DIR``.

        The timestamp only has one-second resolution, so a random suffix is
        appended to keep two requests made within the same second from
        overwriting each other's file.
        """
        stamp = int(time.time())
        suffix = uuid.uuid4().hex[:8]
        return f"{self.OUTPUT_DIR}/output_{stamp}_{suffix}.wav"

    @staticmethod
    def _first_audio(segments):
        """Return the audio of the first ``(_, _, audio)`` segment.

        Raises:
            ValueError: If ``segments`` yields nothing.
        """
        for _, _, audio in segments:
            return audio
        raise ValueError("TTS pipeline produced no audio for the given text")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech from text using Kokoro and save it as a WAV file.

        Only the first segment produced by the pipeline is written; any
        further segments are not synthesized or saved.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0); the value is
                passed to the pipeline as-is and not validated here

        Returns:
            str: Path to the generated audio file

        Raises:
            ValueError: If the pipeline yields no audio for ``text``.
            Exception: Any error raised by the pipeline or while writing the
                file is logged and re-raised unchanged.
        """
        logger.info("Generating speech for text length: %d", len(text))
        try:
            # Create output directory if it doesn't exist
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)
            output_path = self._new_output_path()

            # Take just the first generated segment; failing loudly when there
            # is none avoids returning a path to a file that was never written.
            audio = self._first_audio(self.pipeline(text, voice=voice, speed=speed))

            logger.info("Saving audio to %s", output_path)
            sf.write(output_path, audio, self.SAMPLE_RATE)
            logger.info("Audio generation complete: %s", output_path)
            return output_path
        except Exception as e:
            logger.error("TTS generation failed: %s", e, exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment as it is produced.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0); not validated here

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment

        Raises:
            Exception: Any error raised while iterating the pipeline is
                logged and re-raised unchanged.
        """
        try:
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                yield self.SAMPLE_RATE, audio
        except Exception as e:
            logger.error("TTS streaming failed: %s", e, exc_info=True)
            raise
# Initialize TTS engine with cache decorator if using Streamlit
def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    When Streamlit can be imported, the engine is created through an
    ``st.cache_resource``-decorated factory; otherwise a new ``TTSEngine`` is
    built on every call.

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance
    """
    # Keep the try body to the import alone so that an ImportError raised
    # while constructing the engine is not mistaken for "Streamlit missing"
    # and silently retried without the cache.
    try:
        import streamlit as st
    except ImportError:
        return TTSEngine(lang_code)

    # lang_code is passed as an argument rather than captured by closure:
    # the cache entry is selected from the call arguments, so a zero-argument
    # factory would hand back the first engine created for every language.
    @st.cache_resource
    def _get_engine(code):
        return TTSEngine(code)

    return _get_engine(lang_code)
def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Synthesize ``text`` to an audio file via the shared TTS engine.

    Convenience wrapper: obtains an engine from ``get_tts_engine()`` using its
    default language code and delegates to that engine's ``generate_speech``.

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    return get_tts_engine().generate_speech(text, voice, speed)