"""Dummy TTS provider implementation for testing and fallback.""" import logging import numpy as np import soundfile as sf import io from typing import Iterator, TYPE_CHECKING if TYPE_CHECKING: from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest from ..base.tts_provider_base import TTSProviderBase from ...domain.exceptions import SpeechSynthesisException logger = logging.getLogger(__name__) class DummyTTSProvider(TTSProviderBase): """Dummy TTS provider that generates sine wave audio for testing.""" def __init__(self): """Initialize the Dummy TTS provider.""" super().__init__( provider_name="Dummy", supported_languages=['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh'] ) def is_available(self) -> bool: """Dummy TTS is always available.""" return True def get_available_voices(self) -> list[str]: """Get available voices for Dummy TTS.""" return ['default', 'male', 'female', 'robot'] def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]: """Generate dummy sine wave audio.""" try: # Extract parameters from request text = request.text_content.text speed = request.voice_settings.speed # Generate a simple sine wave based on text length and speed sample_rate = 24000 # Rough approximation of speech duration adjusted by speed duration = min(len(text) / (20 * speed), 10) # Create time array t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False) # Generate sine wave (440 Hz base frequency) frequency = 440 audio = 0.5 * np.sin(2 * np.pi * frequency * t) # Add some variation based on voice setting voice = request.voice_settings.voice_id if voice == 'male': # Lower frequency for male voice audio = 0.5 * np.sin(2 * np.pi * 220 * t) elif voice == 'female': # Higher frequency for female voice audio = 0.5 * np.sin(2 * np.pi * 660 * t) elif voice == 'robot': # Square wave for robot voice audio = 0.5 * np.sign(np.sin(2 * np.pi * 440 * t)) # Convert to bytes audio_bytes = self._numpy_to_bytes(audio, sample_rate) logger.info(f"Generated dummy audio: duration={duration:.2f}s, voice={voice}") return audio_bytes, sample_rate except Exception as e: self._handle_provider_error(e, "dummy audio generation") def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]: """Generate dummy sine wave audio stream.""" try: # Extract parameters from request text = request.text_content.text speed = request.voice_settings.speed # Generate audio in chunks sample_rate = 24000 chunk_duration = 1.0 # 1 second chunks total_duration = min(len(text) / (20 * speed), 10) chunks_count = int(np.ceil(total_duration / chunk_duration)) for chunk_idx in range(chunks_count): start_time = chunk_idx * chunk_duration end_time = min((chunk_idx + 1) * chunk_duration, total_duration) actual_duration = end_time - start_time if actual_duration <= 0: break # Create time array for this chunk t = np.linspace(0, actual_duration, int(sample_rate * actual_duration), endpoint=False) # Generate sine wave frequency = 440 audio = 0.5 * np.sin(2 * np.pi * frequency * t) # Apply voice variations voice = request.voice_settings.voice_id if voice == 'male': audio = 0.5 * np.sin(2 * np.pi * 220 * t) elif voice == 'female': audio = 0.5 * np.sin(2 * np.pi * 660 * t) elif voice == 'robot': audio = 0.5 * np.sign(np.sin(2 * np.pi * 440 * t)) # Convert to bytes audio_bytes = self._numpy_to_bytes(audio, sample_rate) # Check if this is the final chunk is_final = (chunk_idx == chunks_count - 1) yield audio_bytes, sample_rate, is_final except Exception as e: self._handle_provider_error(e, "dummy streaming audio generation") def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes: """Convert numpy audio array to bytes.""" try: # Create an in-memory buffer buffer = io.BytesIO() # Write audio data to buffer as WAV sf.write(buffer, audio_array, sample_rate, format='WAV') # Get bytes from buffer buffer.seek(0) return buffer.read() except Exception as e: raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e