Spaces:
Build error
Build error
"""Dummy TTS provider implementation for testing and fallback.""" | |
import logging | |
import numpy as np | |
import soundfile as sf | |
import io | |
from typing import Iterator, TYPE_CHECKING | |
if TYPE_CHECKING: | |
from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest | |
from ..base.tts_provider_base import TTSProviderBase | |
from ...domain.exceptions import SpeechSynthesisException | |
logger = logging.getLogger(__name__) | |
class DummyTTSProvider(TTSProviderBase):
    """Dummy TTS provider that generates sine wave audio for testing.

    No speech is synthesised. The text only determines how long the tone
    lasts (about 20 characters per second, scaled by the requested speed
    and capped at 10 seconds); the voice id only selects the waveform.
    """

    # Sample rate of every generated clip, in Hz.
    _SAMPLE_RATE_HZ = 24000
    # Rough speaking rate used to turn a character count into a duration.
    _CHARS_PER_SECOND = 20
    # Upper bound on the length of a generated clip, in seconds.
    _MAX_DURATION_S = 10
    # Length of each chunk produced by the streaming path, in seconds.
    _STREAM_CHUNK_S = 1.0
    # Tone used for the 'default' voice and for any unrecognised voice id.
    _DEFAULT_FREQUENCY_HZ = 440
    # Sine-tone frequency per voice id; 'robot' is a square wave instead.
    _VOICE_FREQUENCIES_HZ = {'male': 220, 'female': 660}

    def __init__(self):
        """Initialize the Dummy TTS provider."""
        super().__init__(
            provider_name="Dummy",
            supported_languages=['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
        )

    def is_available(self) -> bool:
        """Dummy TTS is always available."""
        return True

    def get_available_voices(self) -> list[str]:
        """Get available voices for Dummy TTS."""
        return ['default', 'male', 'female', 'robot']

    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
        """Generate dummy audio for the whole request in one piece.

        Args:
            request: Synthesis request; ``text_content.text``,
                ``voice_settings.speed`` and ``voice_settings.voice_id``
                are read.

        Returns:
            A ``(wav_bytes, sample_rate)`` tuple.

        Any exception is passed to ``_handle_provider_error``.
        NOTE(review): that handler presumably raises; if it ever returns
        normally this method falls through and returns ``None``.
        """
        try:
            text = request.text_content.text
            speed = request.voice_settings.speed
            voice = request.voice_settings.voice_id

            duration = self._estimate_duration(text, speed)
            audio = self._synthesize_waveform(voice, duration)
            audio_bytes = self._numpy_to_bytes(audio, self._SAMPLE_RATE_HZ)

            logger.info(f"Generated dummy audio: duration={duration:.2f}s, voice={voice}")
            return audio_bytes, self._SAMPLE_RATE_HZ
        except Exception as e:
            self._handle_provider_error(e, "dummy audio generation")

    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
        """Generate dummy audio as a sequence of chunks of up to one second.

        Args:
            request: Synthesis request; the same fields are read as in
                ``_generate_audio``.

        Yields:
            ``(wav_bytes, sample_rate, is_final)`` tuples. Every chunk is
            encoded on its own by ``_numpy_to_bytes``. ``is_final`` is True
            only for the last chunk. A request whose estimated duration is
            zero (empty text) yields nothing at all.

        Any exception is passed to ``_handle_provider_error``.
        """
        try:
            text = request.text_content.text
            speed = request.voice_settings.speed
            voice = request.voice_settings.voice_id

            chunk_duration = self._STREAM_CHUNK_S
            total_duration = self._estimate_duration(text, speed)
            chunks_count = int(np.ceil(total_duration / chunk_duration))

            for chunk_idx in range(chunks_count):
                start_time = chunk_idx * chunk_duration
                end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
                actual_duration = end_time - start_time
                if actual_duration <= 0:
                    break

                # Each chunk restarts its time axis at zero. With one-second
                # chunks and integer frequencies every full chunk spans a
                # whole number of cycles, so the phase still lines up.
                audio = self._synthesize_waveform(voice, actual_duration)
                audio_bytes = self._numpy_to_bytes(audio, self._SAMPLE_RATE_HZ)

                is_final = (chunk_idx == chunks_count - 1)
                yield audio_bytes, self._SAMPLE_RATE_HZ, is_final
        except Exception as e:
            self._handle_provider_error(e, "dummy streaming audio generation")

    def _estimate_duration(self, text: str, speed: float) -> float:
        """Return the clip length in seconds for ``text`` at ``speed``.

        Raises:
            ValueError: If ``speed`` is not positive. A zero speed would
                divide by zero and a negative one would give a negative
                duration.
        """
        if speed <= 0:
            raise ValueError(f"speed must be positive, got {speed!r}")
        return min(len(text) / (self._CHARS_PER_SECOND * speed), self._MAX_DURATION_S)

    def _synthesize_waveform(self, voice: str, duration: float) -> np.ndarray:
        """Return ``duration`` seconds of the test tone for ``voice``.

        'male' and 'female' give 220 Hz and 660 Hz sine tones, 'robot' a
        440 Hz square wave, and anything else a 440 Hz sine tone. The
        peak amplitude is 0.5 in every case.
        """
        # Round rather than truncate: a product such as 24000 * 0.15 can
        # land a hair below the integer and would otherwise lose a sample.
        sample_count = int(round(self._SAMPLE_RATE_HZ * duration))
        # Dividing sample indices keeps the spacing at exactly one sample
        # period regardless of how ``duration`` was rounded.
        t = np.arange(sample_count) / self._SAMPLE_RATE_HZ

        if voice == 'robot':
            return 0.5 * np.sign(np.sin(2 * np.pi * self._DEFAULT_FREQUENCY_HZ * t))

        frequency = self._VOICE_FREQUENCIES_HZ.get(voice, self._DEFAULT_FREQUENCY_HZ)
        return 0.5 * np.sin(2 * np.pi * frequency * t)

    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
        """Convert numpy audio array to WAV-encoded bytes.

        Args:
            audio_array: Samples to encode.
            sample_rate: Sample rate passed on to the encoder.

        Returns:
            The full contents of the in-memory buffer written by
            ``soundfile.write`` with ``format='WAV'``.

        Raises:
            SpeechSynthesisException: If encoding fails; the original
                exception is chained as the cause.
        """
        try:
            buffer = io.BytesIO()
            sf.write(buffer, audio_array, sample_rate, format='WAV')
            # getvalue() returns the whole buffer regardless of the current
            # write position, so no rewind is needed.
            return buffer.getvalue()
        except Exception as e:
            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e