Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

Michael Hu

refactor tts module

7495571 3 months ago

4.65 kB

	import logging
	from typing import Optional, Generator, Tuple, List, Dict, Any
	import numpy as np

	# Import the base class and dummy implementation
	from utils.tts_simplified import TTSBase, DummyTTS

	# Import the specific TTS implementations
	from utils.tts_kokoro_simplified import KokoroTTS, KOKORO_AVAILABLE
	from utils.tts_dia_simplified import DiaTTS, DIA_AVAILABLE
	from utils.tts_cosyvoice2_simplified import CosyVoice2TTS, COSYVOICE2_AVAILABLE

	# Configure logging
	logger = logging.getLogger(__name__)


	def get_available_engines() -> List[str]:
	    """Report which TTS engines can currently be used.

	    Each optional engine is listed only when its module-level availability
	    flag is truthy. The 'dummy' engine is unconditional and always comes last.

	    Returns:
	        List[str]: Engine names in the order 'kokoro', 'dia', 'cosyvoice2',
	            'dummy', with unavailable optional engines omitted.
	    """
	    # (engine name, availability flag) pairs, in reporting order.
	    optional_engines = (
	        ('kokoro', KOKORO_AVAILABLE),
	        ('dia', DIA_AVAILABLE),
	        ('cosyvoice2', COSYVOICE2_AVAILABLE),
	    )
	    engine_names = [name for name, is_usable in optional_engines if is_usable]

	    # The dummy engine has no availability flag, so it is always offered.
	    engine_names.append('dummy')
	    return engine_names


	def get_tts_engine(engine_type: Optional[str] = None, lang_code: str = 'z') -> TTSBase:
	    """Build a TTS engine, honouring an explicit request when possible.

	    Args:
	        engine_type (str, optional): Engine to create ('kokoro', 'dia',
	            'cosyvoice2' or 'dummy'). When None, or when the requested
	            engine cannot be created, the highest-priority available
	            engine is used instead.
	        lang_code (str): Language code passed to the engine constructor.

	    Returns:
	        TTSBase: The constructed engine instance.
	    """
	    detected = get_available_engines()
	    logger.info(f"Available TTS engines: {detected}")

	    # One row per engine: name, constructor, usable flag, and the log line
	    # emitted when that engine is created by explicit request.
	    catalog = (
	        ('kokoro', KokoroTTS, KOKORO_AVAILABLE, "Creating Kokoro TTS engine"),
	        ('dia', DiaTTS, DIA_AVAILABLE, "Creating Dia TTS engine"),
	        ('cosyvoice2', CosyVoice2TTS, COSYVOICE2_AVAILABLE, "Creating CosyVoice2 TTS engine"),
	        ('dummy', DummyTTS, True, "Creating Dummy TTS engine"),
	    )

	    if engine_type is not None:
	        for name, factory, usable, announcement in catalog:
	            if engine_type == name and usable:
	                logger.info(announcement)
	                return factory(lang_code)
	        # Either an unknown name or a known engine whose flag is falsy;
	        # warn and continue to the priority-based selection below.
	        logger.warning(f"Requested engine '{engine_type}' is not available")

	    # Priority-based selection: first engine in this order that was detected.
	    factory_for = {name: factory for name, factory, _usable, _announcement in catalog}
	    for candidate in ('cosyvoice2', 'kokoro', 'dia', 'dummy'):
	        if candidate not in detected:
	            continue
	        logger.info(f"Using best available engine: {candidate}")
	        return factory_for[candidate](lang_code)

	    # Last resort when nothing in the priority list was detected.
	    logger.warning("No TTS engines available, falling back to dummy engine")
	    return DummyTTS(lang_code)


	def generate_speech(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
	                    voice: str = 'default', speed: float = 1.0) -> Optional[str]:
	    """Synthesize speech for ``text`` with the requested or best available engine.

	    The engine is obtained from ``get_tts_engine(engine_type, lang_code)`` and
	    its ``generate_speech`` method is called with ``text``, ``voice`` and
	    ``speed`` (positionally, in that order).

	    Args:
	        text (str): Input text to synthesize
	        engine_type (str, optional): Type of engine to use
	        lang_code (str): Language code
	        voice (str): Voice ID to use
	        speed (float): Speech speed multiplier

	    Returns:
	        Optional[str]: Path to the generated audio file or None if generation fails
	    """
	    return get_tts_engine(engine_type, lang_code).generate_speech(text, voice, speed)


	def generate_speech_stream(text: str, engine_type: Optional[str] = None, lang_code: str = 'z',
	                           voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
	    """Stream synthesized speech from the requested or best available engine.

	    This is a generator function, so the engine is only created once
	    iteration starts. Everything the engine's ``generate_speech_stream``
	    yields is passed through unchanged.

	    Args:
	        text (str): Input text to synthesize
	        engine_type (str, optional): Type of engine to use
	        lang_code (str): Language code
	        voice (str): Voice ID to use
	        speed (float): Speech speed multiplier

	    Yields:
	        tuple: (sample_rate, audio_data) pairs for each segment
	    """
	    segment_stream = get_tts_engine(engine_type, lang_code).generate_speech_stream(text, voice, speed)
	    # Delegate with ``yield from`` so the caller drives the engine's generator directly.
	    yield from segment_stream