Spaces:

DroolingPanda
/

teachingAssistant

Running

Michael Hu

create fallback flow for tts engines

58d9769 2 months ago

5.49 kB

	import os
	import time
	import logging
	import soundfile as sf
	import numpy as np
	from abc import ABC, abstractmethod
	from typing import Tuple, Generator, Optional

	# Module-level logger, named after this module
	logger = logging.getLogger(__name__)

	class TTSEngineBase(ABC):
	    """Abstract base class shared by every TTS engine.

	    Subclasses must implement :meth:`generate_speech`. The base class supplies
	    a default streaming implementation built on top of it, plus helpers for
	    creating the output directory and building output file paths.
	    """

	    def __init__(self, lang_code: str = 'z'):
	        """Store the language code and log which engine is being initialized.

	        Args:
	            lang_code (str): Language code ('a' for US English, 'b' for British
	                English, 'j' for Japanese, 'z' for Mandarin Chinese).
	                Note: Not all engines support all language codes.
	        """
	        self.lang_code = lang_code
	        logger.info(f"Initializing {self.__class__.__name__} with language code: {lang_code}")

	    @abstractmethod
	    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
	        """Generate speech from text and write it to an audio file.

	        Args:
	            text (str): Input text to synthesize.
	            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.).
	                Note: Not all engines support all voices.
	            speed (float): Speech speed multiplier (0.5 to 2.0).
	                Note: Not all engines support speed adjustment.

	        Returns:
	            Optional[str]: Path to the generated audio file, or None if
	            generation fails.
	        """
	        pass

	    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
	        """Generate speech from text and yield it segment by segment.

	        The default implementation is not truly streaming: it renders the whole
	        utterance with :meth:`generate_speech`, reads the file back and yields
	        it as one chunk. Engines that can stream should override this method.

	        Args:
	            text (str): Input text to synthesize.
	            voice (str): Voice ID to use.
	            speed (float): Speech speed multiplier.

	        Yields:
	            tuple: (sample_rate, audio_data) pairs for each segment.

	        Raises:
	            RuntimeError: If :meth:`generate_speech` returns None. Because this
	                is a generator, the error surfaces on the first iteration,
	                not when the method is called.
	        """
	        output_path = self.generate_speech(text, voice, speed)
	        # generate_speech signals failure with None; fail with a clear message
	        # here instead of handing None to the audio reader.
	        if output_path is None:
	            raise RuntimeError(
	                f"{self.__class__.__name__}.generate_speech returned no output file; "
	                "cannot stream audio"
	            )
	        audio_data, sample_rate = sf.read(output_path)
	        yield sample_rate, audio_data

	    def _create_output_dir(self) -> str:
	        """Create the output directory for audio files if it does not exist.

	        Returns:
	            str: Path to the output directory (relative to the working directory).
	        """
	        output_dir = "temp/outputs"
	        os.makedirs(output_dir, exist_ok=True)
	        return output_dir

	    def _generate_output_path(self, prefix: str = "output") -> str:
	        """Build a unique output path for an audio file.

	        The output directory is created as a side effect. The file itself is
	        not created here.

	        Args:
	            prefix (str): Prefix for the output filename.

	        Returns:
	            str: Path of the form
	            ``<output_dir>/<prefix>_<unix_seconds>_<random_hex>.wav``.
	        """
	        output_dir = self._create_output_dir()
	        timestamp = int(time.time())
	        # A whole-second timestamp alone repeats for calls made within the same
	        # second, so later audio would overwrite earlier audio. The random
	        # suffix keeps such paths distinct.
	        suffix = os.urandom(4).hex()
	        return f"{output_dir}/{prefix}_{timestamp}_{suffix}.wav"


	class DummyTTSEngine(TTSEngineBase):
	    """Dummy TTS engine that generates simple sine-wave tones.

	    This engine is used as a fallback when no other engines are available.
	    It ignores the voice and speed arguments, and uses the text only to log
	    its length.
	    """

	    def __init__(self, lang_code: str = 'z'):
	        """Initialize the base engine and warn that the dummy fallback is in use.

	        Args:
	            lang_code (str): Language code, forwarded to the base class.
	        """
	        super().__init__(lang_code)
	        logger.warning("Using dummy TTS implementation as no other engines are available")

	    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
	        """Write a 3-second 440 Hz sine tone to a file and return its path.

	        Args:
	            text (str): Input text (only its length is logged).
	            voice (str): Voice ID (not used).
	            speed (float): Speed multiplier (not used).

	        Returns:
	            str: Path to the generated dummy audio file.
	        """
	        logger.info(f"Generating dummy speech for text length: {len(text)}")

	        output_path = self._generate_output_path("dummy")

	        # 3 seconds of a 440 Hz tone at 24 kHz, scaled to 0.3 amplitude.
	        sample_rate = 24000
	        duration = 3.0  # seconds
	        t = np.linspace(0, duration, int(sample_rate * duration), False)
	        tone = np.sin(2 * np.pi * 440 * t) * 0.3

	        logger.info(f"Saving dummy audio to {output_path}")
	        sf.write(output_path, tone, sample_rate)
	        logger.info(f"Dummy audio generation complete: {output_path}")

	        return output_path

	    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
	        """Yield three 1-second sine-tone chunks (440, 660 and 880 Hz).

	        Args:
	            text (str): Input text (only its length is logged).
	            voice (str): Voice ID (not used).
	            speed (float): Speed multiplier (not used).

	        Yields:
	            tuple: (sample_rate, audio_data) pairs for each dummy segment.
	        """
	        logger.info(f"Generating dummy speech stream for text length: {len(text)}")

	        sample_rate = 24000
	        duration = 1.0  # seconds per chunk

	        # The time axis is identical for every chunk, so build it once
	        # instead of once per iteration.
	        t = np.linspace(0, duration, int(sample_rate * duration), False)

	        for i in range(3):
	            freq = 440 + (i * 220)  # Different frequency for each chunk
	            yield sample_rate, np.sin(2 * np.pi * freq * t) * 0.3