Spaces:
Running
Running
File size: 5,577 Bytes
9c8546d c72d839 9740afc 9c8546d c72d839 7eff88c 9c8546d 9740afc c72d839 7eff88c c72d839 9740afc 9c8546d 9740afc c72d839 9c8546d c72d839 9740afc c72d839 9740afc c72d839 9740afc 7eff88c 9740afc c72d839 933cc7f 9740afc 7eff88c 9740afc 933cc7f 9740afc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import os
import logging
import time
import soundfile as sf
logger = logging.getLogger(__name__)

# Kokoro is imported defensively: one known failure mode (an AttributeError
# mentioning EspeakWrapper.set_data_path) is tolerated and recorded in
# KOKORO_AVAILABLE so the rest of the module can fall back to dummy audio.
try:
    from kokoro import KPipeline
except AttributeError as import_error:
    details = str(import_error)
    # Anything other than the known EspeakWrapper.set_data_path problem is
    # unexpected and must propagate to the importer.
    if "EspeakWrapper" not in details or "set_data_path" not in details:
        raise
    logger.warning("Kokoro import failed due to EspeakWrapper.set_data_path issue")
    KOKORO_AVAILABLE = False
else:
    # Import succeeded, so the real pipeline can be used.
    KOKORO_AVAILABLE = True
class TTSEngine:
    """Text-to-speech engine backed by Kokoro, with a sine-wave fallback.

    When no Kokoro pipeline could be created (``self.pipeline`` is ``None``),
    both generation methods emit placeholder tones instead of speech so that
    callers keep working.
    """

    # Sample rate (Hz) used for every file written and every chunk yielded.
    SAMPLE_RATE = 24000
    # Directory that receives generated WAV files; created on demand.
    OUTPUT_DIR = "temp/outputs"

    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
        """
        logger.info("Initializing TTS Engine")
        if not KOKORO_AVAILABLE:
            logger.warning("Using dummy TTS implementation as Kokoro is not available")
            self.pipeline = None
        else:
            self.pipeline = KPipeline(lang_code=lang_code)
            logger.info("TTS engine initialized with Kokoro")

    def _dummy_tone(self, frequency, duration):
        """Return a sine tone of ``frequency`` Hz lasting ``duration`` seconds.

        The tone is sampled at ``SAMPLE_RATE`` and scaled to an amplitude of 0.3.
        """
        import numpy as np

        sample_count = int(self.SAMPLE_RATE * duration)
        t = np.linspace(0, duration, sample_count, False)
        return np.sin(2 * np.pi * frequency * t) * 0.3

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Generate speech from text using Kokoro and save it as one WAV file

        Every segment produced by the pipeline is written to the file, so
        text that Kokoro splits into several segments is not truncated.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: If the pipeline produced no audio for ``text``.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        logger.info(f"Generating speech for text length: {len(text)}")
        try:
            # Create output directory if it doesn't exist
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)
            # Nanosecond timestamp: two calls within the same second must not
            # overwrite each other's output file.
            output_path = f"{self.OUTPUT_DIR}/output_{time.time_ns()}.wav"

            if self.pipeline is None:
                # No Kokoro pipeline: write a 3 second 440 Hz placeholder tone
                tone = self._dummy_tone(440, 3.0)
                logger.info(f"Saving dummy audio to {output_path}")
                sf.write(output_path, tone, self.SAMPLE_RATE)
                logger.info(f"Dummy audio generation complete: {output_path}")
                return output_path

            # Collect every segment; segments without audio are skipped.
            segments = [
                audio
                for _, _, audio in self.pipeline(text, voice=voice, speed=speed)
                if audio is not None
            ]
            if not segments:
                # Returning a path to a file that was never written would
                # only move the failure to the caller.
                raise RuntimeError("Kokoro produced no audio for the given text")

            if len(segments) == 1:
                audio = segments[0]
            else:
                import numpy as np

                # np.asarray is the same conversion sf.write applies itself.
                audio = np.concatenate([np.asarray(segment) for segment in segments])

            logger.info(f"Saving audio to {output_path}")
            sf.write(output_path, audio, self.SAMPLE_RATE)
            logger.info(f"Audio generation complete: {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (0.5 to 2.0)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment

        Raises:
            Exception: Any failure during generation is logged and re-raised.
        """
        try:
            if self.pipeline is None:
                # No Kokoro pipeline: emit three 1 second placeholder tones,
                # each 220 Hz higher than the previous one.
                for i in range(3):
                    yield self.SAMPLE_RATE, self._dummy_tone(440 + (i * 220), 1.0)
                return

            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                # Skip segments without audio instead of yielding None.
                if audio is not None:
                    yield self.SAMPLE_RATE, audio
        except Exception as e:
            logger.error(f"TTS streaming failed: {str(e)}", exc_info=True)
            raise
# Initialize TTS engine with cache decorator if using Streamlit
def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    When Streamlit can be imported, the engine is created through
    ``st.cache_resource``. Without Streamlit a fresh engine is built on
    every call.

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance
    """
    # Only the Streamlit import is guarded, so an ImportError raised while
    # building the engine is not mistaken for "Streamlit is not installed".
    try:
        import streamlit as st
    except ImportError:
        return TTSEngine(lang_code)

    # lang_code is passed as an argument so it becomes part of the cache key;
    # a zero-argument cached function would return the first language's
    # engine for every later request.
    @st.cache_resource
    def _get_engine(code):
        return TTSEngine(code)

    return _get_engine(lang_code)
def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Synthesize ``text`` to an audio file via the shared TTS engine

    Convenience wrapper: obtains the engine from ``get_tts_engine()`` with
    its default language code and delegates to the engine's
    ``generate_speech`` method.

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    return get_tts_engine().generate_speech(text, voice, speed)