Michael Hu committed on
Commit
e22e786
·
1 Parent(s): ae641cf

remove fallback to Dummy TTS

Browse files
utils/tts_base.py CHANGED
@@ -67,58 +67,3 @@ class TTSBase(ABC):
67
  output_dir = os.path.join(os.getcwd(), "output")
68
  os.makedirs(output_dir, exist_ok=True)
69
  return os.path.join(output_dir, filename)
70
-
71
-
72
- class DummyTTS(TTSBase):
73
- """Dummy TTS engine that generates sine wave audio
74
-
75
- This class is used as a fallback when no other TTS engine is available.
76
- """
77
-
78
- def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
79
- """Generate a dummy sine wave audio file
80
-
81
- Args:
82
- text (str): Input text (not used)
83
- voice (str): Voice ID (not used)
84
- speed (float): Speech speed multiplier (not used)
85
-
86
- Returns:
87
- str: Path to the generated audio file
88
- """
89
- logger.info(f"Generating dummy speech for text length: {len(text)}")
90
-
91
- # Generate a simple sine wave
92
- sample_rate = 24000
93
- duration = min(len(text) / 20, 10) # Rough approximation of speech duration
94
- t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
95
- audio = 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz sine wave
96
-
97
- # Save to file
98
- output_path = self._generate_output_path(prefix="dummy")
99
- sf.write(output_path, audio, sample_rate)
100
-
101
- logger.info(f"Generated dummy audio: {output_path}")
102
- return output_path
103
-
104
- def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
105
- """Generate a dummy sine wave audio stream
106
-
107
- Args:
108
- text (str): Input text (not used)
109
- voice (str): Voice ID (not used)
110
- speed (float): Speech speed multiplier (not used)
111
-
112
- Yields:
113
- tuple: (sample_rate, audio_data) pairs
114
- """
115
- logger.info(f"Generating dummy speech stream for text length: {len(text)}")
116
-
117
- # Generate a simple sine wave
118
- sample_rate = 24000
119
- duration = min(len(text) / 20, 10) # Rough approximation of speech duration
120
- t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
121
- audio = 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz sine wave
122
-
123
- # Yield the audio data
124
- yield sample_rate, audio
 
67
  output_dir = os.path.join(os.getcwd(), "output")
68
  os.makedirs(output_dir, exist_ok=True)
69
  return os.path.join(output_dir, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/tts_cosyvoice2.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import soundfile as sf
4
  from typing import Optional, Generator, Tuple
5
 
6
- from utils.tts import TTSBase, DummyTTS
7
 
8
  # Configure logging
9
  logger = logging.getLogger(__name__)
@@ -97,13 +97,13 @@ class CosyVoice2TTS(TTSBase):
97
 
98
  # Check if CosyVoice2 is available
99
  if not COSYVOICE2_AVAILABLE:
100
- logger.warning("CosyVoice2 TTS engine is not available, falling back to dummy TTS")
101
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
102
 
103
  # Ensure model is loaded
104
  if not self._ensure_model():
105
- logger.warning("Failed to load CosyVoice2 model, falling back to dummy TTS")
106
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
107
 
108
  try:
109
  import torch
@@ -130,14 +130,12 @@ class CosyVoice2TTS(TTSBase):
130
  logger.info(f"CosyVoice2 audio generation complete: {output_path}")
131
  return output_path
132
  else:
133
- logger.warning("CosyVoice2 model returned None for audio output")
134
- logger.warning("Falling back to dummy TTS")
135
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
136
 
137
  except Exception as e:
138
  logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
139
- logger.warning("CosyVoice2 TTS engine failed, falling back to dummy TTS")
140
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
141
 
142
  def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
143
  """Generate speech stream using CosyVoice2 TTS engine
@@ -154,14 +152,12 @@ class CosyVoice2TTS(TTSBase):
154
 
155
  # Check if CosyVoice2 is available
156
  if not COSYVOICE2_AVAILABLE:
157
- logger.warning("CosyVoice2 TTS engine is not available, falling back to dummy TTS")
158
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
159
  return
160
 
161
  # Ensure model is loaded
162
  if not self._ensure_model():
163
- logger.warning("Failed to load CosyVoice2 model, falling back to dummy TTS")
164
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
165
  return
166
 
167
  try:
@@ -184,11 +180,9 @@ class CosyVoice2TTS(TTSBase):
184
  logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
185
  yield DEFAULT_SAMPLE_RATE, output_audio_np
186
  else:
187
- logger.warning("CosyVoice2 model returned None for audio output")
188
- logger.warning("Falling back to dummy TTS")
189
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
190
 
191
  except Exception as e:
192
  logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
193
- logger.warning("CosyVoice2 TTS engine failed, falling back to dummy TTS")
194
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
 
3
  import soundfile as sf
4
  from typing import Optional, Generator, Tuple
5
 
6
+ from utils.tts import TTSBase
7
 
8
  # Configure logging
9
  logger = logging.getLogger(__name__)
 
97
 
98
  # Check if CosyVoice2 is available
99
  if not COSYVOICE2_AVAILABLE:
100
+ logger.error("CosyVoice2 TTS engine is not available")
101
+ return None
102
 
103
  # Ensure model is loaded
104
  if not self._ensure_model():
105
+ logger.error("Failed to load CosyVoice2 model")
106
+ return None
107
 
108
  try:
109
  import torch
 
130
  logger.info(f"CosyVoice2 audio generation complete: {output_path}")
131
  return output_path
132
  else:
133
+ logger.error("CosyVoice2 model returned None for audio output")
134
+ return None
 
135
 
136
  except Exception as e:
137
  logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
138
+ return None
 
139
 
140
  def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
141
  """Generate speech stream using CosyVoice2 TTS engine
 
152
 
153
  # Check if CosyVoice2 is available
154
  if not COSYVOICE2_AVAILABLE:
155
+ logger.error("CosyVoice2 TTS engine is not available")
 
156
  return
157
 
158
  # Ensure model is loaded
159
  if not self._ensure_model():
160
+ logger.error("Failed to load CosyVoice2 model")
 
161
  return
162
 
163
  try:
 
180
  logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
181
  yield DEFAULT_SAMPLE_RATE, output_audio_np
182
  else:
183
+ logger.error("CosyVoice2 model returned None for audio output")
184
+ return
 
185
 
186
  except Exception as e:
187
  logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
188
+ return
 
utils/tts_dia.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import soundfile as sf
4
  from typing import Optional, Generator, Tuple
5
 
6
- from utils.tts import TTSBase, DummyTTS
7
 
8
  # Configure logging
9
  logger = logging.getLogger(__name__)
@@ -98,13 +98,13 @@ class DiaTTS(TTSBase):
98
 
99
  # Check if Dia is available
100
  if not DIA_AVAILABLE:
101
- logger.warning("Dia TTS engine is not available, falling back to dummy TTS")
102
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
103
 
104
  # Ensure model is loaded
105
  if not self._ensure_model():
106
- logger.warning("Failed to load Dia model, falling back to dummy TTS")
107
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
108
 
109
  try:
110
  import torch
@@ -131,20 +131,18 @@ class DiaTTS(TTSBase):
131
  logger.info(f"Dia audio generation complete: {output_path}")
132
  return output_path
133
  else:
134
- logger.warning("Dia model returned None for audio output")
135
- logger.warning("Falling back to dummy TTS")
136
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
137
 
138
  except ModuleNotFoundError as e:
139
  if "dac" in str(e):
140
- logger.warning("Dia TTS engine failed due to missing 'dac' module, falling back to dummy TTS")
141
  else:
142
  logger.error(f"Module not found error in Dia TTS: {str(e)}")
143
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
144
  except Exception as e:
145
  logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
146
- logger.warning("Dia TTS engine failed, falling back to dummy TTS")
147
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
148
 
149
  def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
150
  """Generate speech stream using Dia TTS engine
@@ -161,14 +159,12 @@ class DiaTTS(TTSBase):
161
 
162
  # Check if Dia is available
163
  if not DIA_AVAILABLE:
164
- logger.warning("Dia TTS engine is not available, falling back to dummy TTS")
165
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
166
  return
167
 
168
  # Ensure model is loaded
169
  if not self._ensure_model():
170
- logger.warning("Failed to load Dia model, falling back to dummy TTS")
171
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
172
  return
173
 
174
  try:
@@ -191,17 +187,15 @@ class DiaTTS(TTSBase):
191
  logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
192
  yield DEFAULT_SAMPLE_RATE, output_audio_np
193
  else:
194
- logger.warning("Dia model returned None for audio output")
195
- logger.warning("Falling back to dummy TTS")
196
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
197
 
198
  except ModuleNotFoundError as e:
199
  if "dac" in str(e):
200
- logger.warning("Dia TTS engine failed due to missing 'dac' module, falling back to dummy TTS")
201
  else:
202
  logger.error(f"Module not found error in Dia TTS: {str(e)}")
203
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
204
  except Exception as e:
205
  logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
206
- logger.warning("Dia TTS engine failed, falling back to dummy TTS")
207
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
 
3
  import soundfile as sf
4
  from typing import Optional, Generator, Tuple
5
 
6
+ from utils.tts import TTSBase
7
 
8
  # Configure logging
9
  logger = logging.getLogger(__name__)
 
98
 
99
  # Check if Dia is available
100
  if not DIA_AVAILABLE:
101
+ logger.error("Dia TTS engine is not available")
102
+ return None
103
 
104
  # Ensure model is loaded
105
  if not self._ensure_model():
106
+ logger.error("Failed to load Dia model")
107
+ return None
108
 
109
  try:
110
  import torch
 
131
  logger.info(f"Dia audio generation complete: {output_path}")
132
  return output_path
133
  else:
134
+ logger.error("Dia model returned None for audio output")
135
+ return None
 
136
 
137
  except ModuleNotFoundError as e:
138
  if "dac" in str(e):
139
+ logger.error("Dia TTS engine failed due to missing 'dac' module")
140
  else:
141
  logger.error(f"Module not found error in Dia TTS: {str(e)}")
142
+ return None
143
  except Exception as e:
144
  logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
145
+ return None
 
146
 
147
  def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
148
  """Generate speech stream using Dia TTS engine
 
159
 
160
  # Check if Dia is available
161
  if not DIA_AVAILABLE:
162
+ logger.error("Dia TTS engine is not available")
 
163
  return
164
 
165
  # Ensure model is loaded
166
  if not self._ensure_model():
167
+ logger.error("Failed to load Dia model")
 
168
  return
169
 
170
  try:
 
187
  logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
188
  yield DEFAULT_SAMPLE_RATE, output_audio_np
189
  else:
190
+ logger.error("Dia model returned None for audio output")
191
+ return
 
192
 
193
  except ModuleNotFoundError as e:
194
  if "dac" in str(e):
195
+ logger.error("Dia TTS engine failed due to missing 'dac' module")
196
  else:
197
  logger.error(f"Module not found error in Dia TTS: {str(e)}")
198
+ return
199
  except Exception as e:
200
  logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
201
+ return
 
utils/tts_dummy.py CHANGED
@@ -1,3 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  class DummyTTS(TTSBase):
2
  """Dummy TTS engine that generates sine wave audio
3
 
 
1
+ import logging
2
+ import os
3
+ import time
4
+ import numpy as np
5
+ import soundfile as sf
6
+ from typing import Optional, Generator, Tuple, List
7
+ from .tts_base import TTSBase
8
+
9
+ # Configure logging
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
  class DummyTTS(TTSBase):
14
  """Dummy TTS engine that generates sine wave audio
15
 
utils/tts_kokoro.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import soundfile as sf
4
  from typing import Optional, Generator, Tuple
5
 
6
- from utils.tts import TTSBase, DummyTTS
7
 
8
  # Configure logging
9
  logger = logging.getLogger(__name__)
@@ -86,13 +86,13 @@ class KokoroTTS(TTSBase):
86
 
87
  # Check if Kokoro is available
88
  if not KOKORO_AVAILABLE:
89
- logger.warning("Kokoro TTS engine is not available, falling back to dummy TTS")
90
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
91
 
92
  # Ensure pipeline is loaded
93
  if not self._ensure_pipeline():
94
- logger.warning("Failed to load Kokoro pipeline, falling back to dummy TTS")
95
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
96
 
97
  try:
98
  # Generate unique output path
@@ -109,8 +109,7 @@ class KokoroTTS(TTSBase):
109
  return output_path
110
  except Exception as e:
111
  logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
112
- logger.warning("Kokoro TTS engine failed, falling back to dummy TTS")
113
- return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
114
 
115
  def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
116
  """Generate speech stream using Kokoro TTS engine
@@ -127,14 +126,12 @@ class KokoroTTS(TTSBase):
127
 
128
  # Check if Kokoro is available
129
  if not KOKORO_AVAILABLE:
130
- logger.warning("Kokoro TTS engine is not available, falling back to dummy TTS")
131
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
132
  return
133
 
134
  # Ensure pipeline is loaded
135
  if not self._ensure_pipeline():
136
- logger.warning("Failed to load Kokoro pipeline, falling back to dummy TTS")
137
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
138
  return
139
 
140
  try:
@@ -144,5 +141,4 @@ class KokoroTTS(TTSBase):
144
  yield 24000, audio
145
  except Exception as e:
146
  logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
147
- logger.warning("Kokoro TTS engine failed, falling back to dummy TTS")
148
- yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
 
3
  import soundfile as sf
4
  from typing import Optional, Generator, Tuple
5
 
6
+ from utils.tts import TTSBase
7
 
8
  # Configure logging
9
  logger = logging.getLogger(__name__)
 
86
 
87
  # Check if Kokoro is available
88
  if not KOKORO_AVAILABLE:
89
+ logger.error("Kokoro TTS engine is not available")
90
+ return None
91
 
92
  # Ensure pipeline is loaded
93
  if not self._ensure_pipeline():
94
+ logger.error("Failed to load Kokoro pipeline")
95
+ return None
96
 
97
  try:
98
  # Generate unique output path
 
109
  return output_path
110
  except Exception as e:
111
  logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
112
+ return None
 
113
 
114
  def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
115
  """Generate speech stream using Kokoro TTS engine
 
126
 
127
  # Check if Kokoro is available
128
  if not KOKORO_AVAILABLE:
129
+ logger.error("Kokoro TTS engine is not available")
 
130
  return
131
 
132
  # Ensure pipeline is loaded
133
  if not self._ensure_pipeline():
134
+ logger.error("Failed to load Kokoro pipeline")
 
135
  return
136
 
137
  try:
 
141
  yield 24000, audio
142
  except Exception as e:
143
  logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
144
+ return