# Spaces: Running  (appears to be hosting-page status text captured with the file; not part of the module)
import os | |
import time | |
import logging | |
import soundfile as sf | |
import numpy as np | |
from abc import ABC, abstractmethod | |
from typing import Tuple, Generator, Optional | |
# Module-level logger for this file
logger = logging.getLogger(__name__) | |
class TTSEngineBase(ABC):
    """Base class for all TTS engines.

    Defines the interface that TTS engines implement and provides common
    utility methods for output-file handling and a default streaming
    implementation built on top of ``generate_speech``.

    Note: ``generate_speech`` is not decorated as abstract, so a subclass that
    does not override it inherits an implementation that returns ``None``.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the TTS engine.

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British
                English, 'j' for Japanese, 'z' for Mandarin Chinese).
                Note: Not all engines support all language codes.
        """
        self.lang_code = lang_code
        logger.info(f"Initializing {self.__class__.__name__} with language code: {lang_code}")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Generate speech from text.

        Subclasses are expected to override this method; the base
        implementation produces no audio and returns ``None``.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.).
                Note: Not all engines support all voices.
            speed (float): Speech speed multiplier (0.5 to 2.0).
                Note: Not all engines support speed adjustment.

        Returns:
            Optional[str]: Path to the generated audio file, or None if
            generation fails.
        """
        return None

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech from text and yield each segment.

        Default implementation: generate the full audio file with
        ``generate_speech`` and yield it as a single chunk. If generation
        fails (``generate_speech`` returns ``None``), the failure is logged
        and the stream ends without yielding anything.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use.
            speed (float): Speech speed multiplier.

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment.
        """
        output_path = self.generate_speech(text, voice, speed)
        if output_path is None:
            # generate_speech signals failure with None; reading a None path
            # would only raise an unrelated error inside the audio reader.
            logger.error(f"{self.__class__.__name__} produced no audio; nothing to stream")
            return

        audio_data, sample_rate = sf.read(output_path)
        yield sample_rate, audio_data

    def _create_output_dir(self) -> str:
        """Create the output directory for audio files if it does not exist.

        Returns:
            str: Path to the output directory.
        """
        output_dir = "temp/outputs"
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def _generate_output_path(self, prefix: str = "output") -> str:
        """Generate a unique output path for an audio file.

        The filename combines the prefix, the current Unix time in seconds and
        a random hex suffix, so several calls within the same second still
        return distinct paths.

        Args:
            prefix (str): Prefix for the output filename.

        Returns:
            str: Path to the output file (inside the output directory, with a
            ``.wav`` extension).
        """
        output_dir = self._create_output_dir()
        timestamp = int(time.time())
        # A second-resolution timestamp alone collides for rapid successive
        # calls, which would make later audio overwrite earlier files.
        unique_suffix = os.urandom(4).hex()
        return f"{output_dir}/{prefix}_{timestamp}_{unique_suffix}.wav"
class DummyTTSEngine(TTSEngineBase):
    """Fallback engine that emits plain sine-wave tones instead of speech.

    Used when no other TTS engine is available. The text, voice and speed
    arguments are accepted for interface compatibility but do not influence
    the generated audio.
    """

    def __init__(self, lang_code: str = 'z'):
        super().__init__(lang_code)
        logger.warning("Using dummy TTS implementation as no other engines are available")

    @staticmethod
    def _sine_tone(frequency: int, duration: float, sample_rate: int) -> np.ndarray:
        """Return a sine wave of the given frequency scaled to 0.3 amplitude.

        Args:
            frequency (int): Tone frequency in Hz.
            duration (float): Length of the tone in seconds.
            sample_rate (int): Samples per second.

        Returns:
            np.ndarray: One-dimensional array of samples.
        """
        sample_count = int(sample_rate * duration)
        timeline = np.linspace(0, duration, sample_count, endpoint=False)
        return np.sin(2 * np.pi * frequency * timeline) * 0.3

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Write a three-second 440 Hz tone to a new file and return its path.

        Args:
            text (str): Input text (only its length is logged).
            voice (str): Voice ID (not used).
            speed (float): Speed multiplier (not used).

        Returns:
            str: Path to the generated dummy audio file.
        """
        logger.info(f"Generating dummy speech for text length: {len(text)}")

        # Reserve the destination first, then synthesize the placeholder tone.
        output_path = self._generate_output_path("dummy")
        rate_hz = 24000
        waveform = self._sine_tone(440, 3.0, rate_hz)

        logger.info(f"Saving dummy audio to {output_path}")
        sf.write(output_path, waveform, rate_hz)
        logger.info(f"Dummy audio generation complete: {output_path}")
        return output_path

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Yield three one-second tones at 440, 660 and 880 Hz.

        Args:
            text (str): Input text (only its length is logged).
            voice (str): Voice ID (not used).
            speed (float): Speed multiplier (not used).

        Yields:
            tuple: (sample_rate, audio_data) pairs for each dummy segment.
        """
        logger.info(f"Generating dummy speech stream for text length: {len(text)}")

        rate_hz = 24000
        # One chunk per frequency, rising in 220 Hz steps from 440 Hz.
        for frequency in (440, 660, 880):
            yield rate_hz, self._sine_tone(frequency, 1.0, rate_hz)