File size: 5,241 Bytes
1f9c751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1f9c751
 
fdc056d
1f9c751
 
 
fdc056d
1f9c751
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1f9c751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdc056d
1f9c751
fdc056d
1f9c751
 
 
 
fdc056d
1f9c751
 
fdc056d
1f9c751
 
fdc056d
1f9c751
 
 
fdc056d
1f9c751
 
 
 
 
 
 
 
 
 
 
fdc056d
1f9c751
 
fdc056d
1f9c751
 
 
 
 
 
 
 
 
 
fdc056d
1f9c751
 
fdc056d
1f9c751
 
 
fdc056d
1f9c751
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""Dummy TTS provider implementation for testing and fallback."""

import logging
import numpy as np
import soundfile as sf
import io
from typing import Iterator, TYPE_CHECKING

if TYPE_CHECKING:
    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest

from ..base.tts_provider_base import TTSProviderBase
from ...domain.exceptions import SpeechSynthesisException

logger = logging.getLogger(__name__)


class DummyTTSProvider(TTSProviderBase):
    """Dummy TTS provider that generates sine wave audio for testing.

    No speech is synthesized. The output is a fixed-frequency tone whose
    length grows with the length of the input text and shrinks with the
    requested speed, capped at ``_MAX_DURATION_SECONDS``. Audio is returned
    as WAV-encoded bytes sampled at ``_SAMPLE_RATE_HZ``.
    """

    # Sample rate (Hz) of every generated waveform.
    _SAMPLE_RATE_HZ = 24000
    # Rough characters-per-second rate used to turn text length into a duration.
    _CHARS_PER_SECOND = 20
    # Upper bound on the length of the generated audio, in seconds.
    _MAX_DURATION_SECONDS = 10
    # Length of each streamed chunk, in seconds.
    _STREAM_CHUNK_SECONDS = 1.0

    def __init__(self):
        """Initialize the Dummy TTS provider."""
        super().__init__(
            provider_name="Dummy",
            supported_languages=['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
        )

    def is_available(self) -> bool:
        """Dummy TTS is always available."""
        return True

    def get_available_voices(self) -> list[str]:
        """Get available voices for Dummy TTS."""
        return ['default', 'male', 'female', 'robot']

    def _estimate_duration(self, text: str, speed: float) -> float:
        """Return the audio duration in seconds for ``text`` at ``speed``.

        Args:
            text: The text that would be spoken; only its length is used.
            speed: Playback speed multiplier; must be strictly positive.

        Returns:
            ``len(text) / (20 * speed)`` capped at 10 seconds.

        Raises:
            ValueError: If ``speed`` is zero or negative. Without this check
                a zero speed surfaces as ``ZeroDivisionError`` and a negative
                speed silently produces an empty stream.
        """
        if speed <= 0:
            raise ValueError(f"Speech speed must be positive, got {speed!r}")
        return min(len(text) / (self._CHARS_PER_SECOND * speed), self._MAX_DURATION_SECONDS)

    @staticmethod
    def _render_waveform(t: np.ndarray, voice) -> np.ndarray:
        """Return the half-amplitude tone for ``voice`` sampled at times ``t``.

        Args:
            t: Sample times in seconds.
            voice: Voice identifier. ``'male'`` selects 220 Hz, ``'female'``
                660 Hz; anything else (including ``'robot'``) uses 440 Hz.
                ``'robot'`` additionally turns the sine into a square wave.

        Returns:
            An array shaped like ``t`` with values in ``[-0.5, 0.5]``.
        """
        if voice == 'male':
            frequency = 220
        elif voice == 'female':
            frequency = 660
        else:
            frequency = 440

        wave = np.sin(2 * np.pi * frequency * t)
        if voice == 'robot':
            # The sign of a sine is a square wave at the same frequency.
            wave = np.sign(wave)
        return 0.5 * wave

    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
        """Generate dummy sine wave audio.

        Args:
            request: Synthesis request; reads ``text_content.text``,
                ``voice_settings.speed`` and ``voice_settings.voice_id``.

        Returns:
            A ``(wav_bytes, sample_rate)`` tuple.

        Any exception (including an invalid speed) is forwarded to
        ``self._handle_provider_error``, which is defined outside this class.
        NOTE(review): that handler presumably raises; if it ever returns,
        this method implicitly returns ``None`` -- confirm.
        """
        try:
            text = request.text_content.text
            speed = request.voice_settings.speed
            voice = request.voice_settings.voice_id

            sample_rate = self._SAMPLE_RATE_HZ
            duration = self._estimate_duration(text, speed)

            # Evenly spaced sample times covering [0, duration).
            t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
            audio = self._render_waveform(t, voice)

            audio_bytes = self._numpy_to_bytes(audio, sample_rate)

            logger.info("Generated dummy audio: duration=%.2fs, voice=%s", duration, voice)
            return audio_bytes, sample_rate

        except Exception as e:
            self._handle_provider_error(e, "dummy audio generation")

    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
        """Generate dummy sine wave audio stream.

        The audio is split into chunks of ``_STREAM_CHUNK_SECONDS``; the last
        chunk may be shorter. Every chunk is encoded independently by
        ``_numpy_to_bytes``, so each yielded payload is a complete WAV blob
        rather than a slice of one continuous file.

        Args:
            request: Synthesis request; reads ``text_content.text``,
                ``voice_settings.speed`` and ``voice_settings.voice_id``.

        Yields:
            ``(wav_bytes, sample_rate, is_final)`` tuples, where ``is_final``
            is ``True`` only for the last chunk. Nothing is yielded when the
            computed duration is zero (empty text).

        Any exception (including an invalid speed) is forwarded to
        ``self._handle_provider_error``, which is defined outside this class.
        """
        try:
            text = request.text_content.text
            speed = request.voice_settings.speed
            # Loop-invariant, so read once instead of once per chunk.
            voice = request.voice_settings.voice_id

            sample_rate = self._SAMPLE_RATE_HZ
            chunk_duration = self._STREAM_CHUNK_SECONDS
            total_duration = self._estimate_duration(text, speed)

            chunks_count = int(np.ceil(total_duration / chunk_duration))

            for chunk_idx in range(chunks_count):
                start_time = chunk_idx * chunk_duration
                end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
                actual_duration = end_time - start_time

                if actual_duration <= 0:
                    break

                # Each chunk restarts its time axis at 0. Because chunks begin
                # on whole seconds and every tone uses an integer frequency,
                # the waveform phase still lines up across chunk boundaries.
                t = np.linspace(0, actual_duration, int(sample_rate * actual_duration), endpoint=False)
                audio = self._render_waveform(t, voice)

                audio_bytes = self._numpy_to_bytes(audio, sample_rate)

                is_final = (chunk_idx == chunks_count - 1)

                yield audio_bytes, sample_rate, is_final

        except Exception as e:
            self._handle_provider_error(e, "dummy streaming audio generation")

    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
        """Convert numpy audio array to WAV-encoded bytes.

        Args:
            audio_array: Audio samples to encode.
            sample_rate: Sample rate in Hz written into the WAV data.

        Returns:
            The full contents of an in-memory WAV file.

        Raises:
            SpeechSynthesisException: If encoding fails for any reason; the
                original exception is chained as the cause.
        """
        try:
            # Encode into an in-memory buffer so nothing touches the disk.
            buffer = io.BytesIO()
            sf.write(buffer, audio_array, sample_rate, format='WAV')

            # Rewind before reading back everything that was written.
            buffer.seek(0)
            return buffer.read()

        except Exception as e:
            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e