File size: 4,326 Bytes
7495571
3ed3b5a
 
 
7495571
 
3ed3b5a
 
 
 
 
7495571
 
3ed3b5a
 
 
 
 
 
 
 
 
7495571
3ed3b5a
 
 
 
7495571
3ed3b5a
 
 
 
7495571
 
3ed3b5a
 
7495571
3ed3b5a
 
 
7495571
 
 
3ed3b5a
 
 
 
 
 
 
 
 
7495571
3ed3b5a
7495571
 
3ed3b5a
 
7495571
 
3ed3b5a
 
 
 
7495571
 
 
 
 
3ed3b5a
 
7495571
 
3ed3b5a
7495571
3ed3b5a
 
7495571
 
3ed3b5a
 
 
 
7495571
3ed3b5a
 
7495571
3ed3b5a
 
 
 
 
7495571
 
 
3ed3b5a
7495571
 
 
3ed3b5a
7495571
3ed3b5a
 
7495571
 
3ed3b5a
 
 
 
7495571
3ed3b5a
 
7495571
3ed3b5a
 
 
7495571
3ed3b5a
7495571
 
 
3ed3b5a
7495571
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import logging
import os
import time
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple, List
from abc import ABC, abstractmethod

# Module-level logger named after this module. Only the logger object is
# obtained here; this statement does not set handlers, levels or formatting.
logger = logging.getLogger(__name__)


class TTSBase(ABC):
    """Base class for all TTS engines

    This abstract class defines the interface that all TTS engines must
    implement: one-shot synthesis to an audio file (``generate_speech``) and
    segment-by-segment streaming (``generate_speech_stream``). It also provides
    a shared helper for choosing where generated audio files are written.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the TTS engine

        Args:
            lang_code (str): Language code for the engine
        """
        self.lang_code = lang_code

    @abstractmethod
    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech from text

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Returns:
            Optional[str]: Path to the generated audio file or None if generation fails
        """

    @abstractmethod
    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream from text

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """

    def _generate_output_path(self, prefix: str = "tts", extension: str = "wav") -> str:
        """Generate a unique output path for the audio file

        The file name has the form ``{prefix}_{millis}_{token}.{extension}``
        where ``millis`` is the current time in milliseconds and ``token`` is
        8 random hexadecimal characters. The ``output`` directory under the
        current working directory is created if it does not exist yet; the
        file itself is not created.

        Args:
            prefix (str): Prefix for the filename
            extension (str): File extension, with or without a leading dot

        Returns:
            str: Path to the output file inside ``<cwd>/output``
        """
        timestamp = int(time.time() * 1000)
        # A millisecond timestamp repeats when two requests arrive within the
        # same millisecond, which would make one audio file overwrite the
        # other. A random token keeps such paths distinct.
        token = os.urandom(4).hex()
        # Accept ".wav" as well as "wav" without producing "name..wav".
        suffix = extension.lstrip(".")
        filename = f"{prefix}_{timestamp}_{token}.{suffix}"
        output_dir = os.path.join(os.getcwd(), "output")
        os.makedirs(output_dir, exist_ok=True)
        return os.path.join(output_dir, filename)


class DummyTTS(TTSBase):
    """Placeholder TTS engine that emits a sine tone instead of speech

    Serves as the fallback engine when no other TTS engine is available.
    """

    def _render_tone(self, text: str) -> Tuple[int, np.ndarray]:
        """Build the placeholder tone for a piece of text

        Args:
            text (str): Input text; only its length is used

        Returns:
            tuple: (sample_rate, audio_data) holding a 440 Hz sine wave with
                amplitude 0.5 sampled at 24000 Hz
        """
        rate = 24000
        # Rough approximation of speech duration: 20 characters per second,
        # capped at 10 seconds.
        seconds = len(text) / 20
        if seconds > 10:
            seconds = 10
        sample_count = int(rate * seconds)
        timeline = np.linspace(0, seconds, sample_count, endpoint=False)
        angular_frequency = 2 * np.pi * 440  # 440 Hz sine wave
        return rate, np.sin(angular_frequency * timeline) * 0.5

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
        """Write a dummy sine wave to an audio file

        Args:
            text (str): Input text (only its length affects the tone duration)
            voice (str): Voice ID (not used)
            speed (float): Speech speed multiplier (not used)

        Returns:
            str: Path to the generated audio file
        """
        logger.info(f"Generating dummy speech for text length: {len(text)}")

        rate, samples = self._render_tone(text)

        # Persist the tone under a "dummy"-prefixed file name
        destination = self._generate_output_path(prefix="dummy")
        sf.write(destination, samples, rate)

        logger.info(f"Generated dummy audio: {destination}")
        return destination

    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Stream a dummy sine wave as a single segment

        Args:
            text (str): Input text (only its length affects the tone duration)
            voice (str): Voice ID (not used)
            speed (float): Speech speed multiplier (not used)

        Yields:
            tuple: (sample_rate, audio_data) pairs; exactly one pair is produced
        """
        logger.info(f"Generating dummy speech stream for text length: {len(text)}")

        # The whole tone is delivered in one chunk
        yield self._render_tone(text)