Spaces:
Build error
Build error
File size: 5,241 Bytes
1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 fdc056d 1f9c751 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
"""Dummy TTS provider implementation for testing and fallback."""
import io
import logging
from typing import Iterator, TYPE_CHECKING

import numpy as np
import soundfile as sf

from ..base.tts_provider_base import TTSProviderBase
from ...domain.exceptions import SpeechSynthesisException

if TYPE_CHECKING:
    # Needed for type annotations only; the annotations in this module that
    # use it are written as string literals, so no runtime import is required.
    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest

logger = logging.getLogger(__name__)
class DummyTTSProvider(TTSProviderBase):
    """Dummy TTS provider that generates sine wave audio for testing.

    No speech is synthesised. The output is a fixed-pitch tone whose length is
    derived from the text length and the requested speed, and whose pitch and
    shape depend on the requested voice.
    """

    # Output sample rate in Hz.
    _SAMPLE_RATE = 24000
    # Characters assumed to be "spoken" per second at speed 1.0.
    _CHARS_PER_SECOND = 20
    # Upper bound on the generated audio length, in seconds.
    _MAX_DURATION_SECONDS = 10
    # Length of each streamed chunk, in seconds.
    _CHUNK_SECONDS = 1.0
    # Tone used for 'default', 'robot' and any unrecognised voice.
    _BASE_FREQUENCY_HZ = 440
    # Voices that only change the pitch of the sine wave.
    _VOICE_FREQUENCIES_HZ = {'male': 220, 'female': 660}

    def __init__(self):
        """Initialize the Dummy TTS provider."""
        super().__init__(
            provider_name="Dummy",
            supported_languages=['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh'],
        )

    def is_available(self) -> bool:
        """Dummy TTS is always available."""
        return True

    def get_available_voices(self) -> list[str]:
        """Get available voices for Dummy TTS."""
        return ['default', 'male', 'female', 'robot']

    def _estimate_duration(self, text: str, speed: float) -> float:
        """Return the tone length in seconds for *text* at *speed*.

        The estimate is ``len(text) / (20 * speed)``, capped at 10 seconds.

        Raises:
            ValueError: If *speed* is not strictly positive. Without this
                check a zero speed divides by zero and a negative speed
                yields a negative duration.
        """
        if speed <= 0:
            raise ValueError(f"speed must be positive, got {speed}")
        return min(len(text) / (self._CHARS_PER_SECOND * speed), self._MAX_DURATION_SECONDS)

    def _synthesize_tone(self, voice: str, duration: float) -> np.ndarray:
        """Return *duration* seconds of the tone associated with *voice*.

        'male' and 'female' are sine waves at 220 Hz and 660 Hz, 'robot' is a
        440 Hz square wave, and every other voice is a 440 Hz sine wave. The
        amplitude is always 0.5.
        """
        # round() rather than truncation: a duration such as 2.3 - 2.0 is
        # 0.2999999999999998 in floating point and would lose one sample.
        sample_count = int(round(self._SAMPLE_RATE * duration))
        t = np.linspace(0, duration, sample_count, endpoint=False)

        frequency = self._VOICE_FREQUENCIES_HZ.get(voice, self._BASE_FREQUENCY_HZ)
        wave = np.sin(2 * np.pi * frequency * t)
        if voice == 'robot':
            # Taking the sign of the sine turns it into a square wave.
            wave = np.sign(wave)
        return 0.5 * wave

    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
        """Generate dummy sine wave audio.

        Args:
            request: Synthesis request; ``text_content.text``,
                ``voice_settings.speed`` and ``voice_settings.voice_id``
                are read from it.

        Returns:
            A ``(wav_bytes, sample_rate)`` tuple.

        Any exception is passed to ``self._handle_provider_error``.
        NOTE(review): that handler is defined outside this file and presumably
        raises; if it returns normally this method returns ``None``.
        """
        try:
            text = request.text_content.text
            speed = request.voice_settings.speed
            voice = request.voice_settings.voice_id

            duration = self._estimate_duration(text, speed)
            audio = self._synthesize_tone(voice, duration)
            audio_bytes = self._numpy_to_bytes(audio, self._SAMPLE_RATE)

            logger.info("Generated dummy audio: duration=%.2fs, voice=%s", duration, voice)
            return audio_bytes, self._SAMPLE_RATE
        except Exception as e:
            self._handle_provider_error(e, "dummy audio generation")

    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
        """Generate dummy sine wave audio stream.

        The tone is cut into chunks of at most one second. Each chunk is
        encoded independently by ``_numpy_to_bytes``, so every yielded payload
        is a complete WAV file rather than a slice of one.

        Args:
            request: Synthesis request; the same fields are read as in
                ``_generate_audio``.

        Yields:
            ``(wav_bytes, sample_rate, is_final)`` tuples; ``is_final`` is
            True only for the last chunk. Nothing is yielded when the
            estimated duration is zero (empty text).

        Any exception is passed to ``self._handle_provider_error``.
        """
        try:
            text = request.text_content.text
            speed = request.voice_settings.speed
            # The voice does not change between chunks, so read it once.
            voice = request.voice_settings.voice_id

            total_duration = self._estimate_duration(text, speed)
            chunks_count = int(np.ceil(total_duration / self._CHUNK_SECONDS))

            for chunk_idx in range(chunks_count):
                start_time = chunk_idx * self._CHUNK_SECONDS
                end_time = min((chunk_idx + 1) * self._CHUNK_SECONDS, total_duration)

                # The time axis restarts at zero for every chunk. All voice
                # frequencies are whole numbers of Hz and full chunks last
                # exactly one second, so each chunk still begins at phase zero.
                audio = self._synthesize_tone(voice, end_time - start_time)
                audio_bytes = self._numpy_to_bytes(audio, self._SAMPLE_RATE)

                is_final = chunk_idx == chunks_count - 1
                yield audio_bytes, self._SAMPLE_RATE, is_final
        except Exception as e:
            self._handle_provider_error(e, "dummy streaming audio generation")

    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
        """Convert numpy audio array to bytes.

        Args:
            audio_array: Samples to encode.
            sample_rate: Sample rate in Hz written to the WAV header.

        Returns:
            The encoded WAV file contents.

        Raises:
            SpeechSynthesisException: If encoding fails; the original
                exception is chained as the cause.
        """
        try:
            # Encode into an in-memory buffer so no temporary file is needed.
            with io.BytesIO() as buffer:
                sf.write(buffer, audio_array, sample_rate, format='WAV')
                return buffer.getvalue()
        except Exception as e:
            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e