Spaces:
Build error
Build error
Michael Hu
committed on
Commit
·
e22e786
1
Parent(s):
ae641cf
remove fallback to Dummy TTS
Browse files
- utils/tts_base.py +0 -55
- utils/tts_cosyvoice2.py +13 -19
- utils/tts_dia.py +17 -23
- utils/tts_dummy.py +12 -0
- utils/tts_kokoro.py +9 -13
utils/tts_base.py
CHANGED
@@ -67,58 +67,3 @@ class TTSBase(ABC):
|
|
67 |
output_dir = os.path.join(os.getcwd(), "output")
|
68 |
os.makedirs(output_dir, exist_ok=True)
|
69 |
return os.path.join(output_dir, filename)
|
70 |
-
|
71 |
-
|
72 |
-
class DummyTTS(TTSBase):
|
73 |
-
"""Dummy TTS engine that generates sine wave audio
|
74 |
-
|
75 |
-
This class is used as a fallback when no other TTS engine is available.
|
76 |
-
"""
|
77 |
-
|
78 |
-
def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
|
79 |
-
"""Generate a dummy sine wave audio file
|
80 |
-
|
81 |
-
Args:
|
82 |
-
text (str): Input text (not used)
|
83 |
-
voice (str): Voice ID (not used)
|
84 |
-
speed (float): Speech speed multiplier (not used)
|
85 |
-
|
86 |
-
Returns:
|
87 |
-
str: Path to the generated audio file
|
88 |
-
"""
|
89 |
-
logger.info(f"Generating dummy speech for text length: {len(text)}")
|
90 |
-
|
91 |
-
# Generate a simple sine wave
|
92 |
-
sample_rate = 24000
|
93 |
-
duration = min(len(text) / 20, 10) # Rough approximation of speech duration
|
94 |
-
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
|
95 |
-
audio = 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz sine wave
|
96 |
-
|
97 |
-
# Save to file
|
98 |
-
output_path = self._generate_output_path(prefix="dummy")
|
99 |
-
sf.write(output_path, audio, sample_rate)
|
100 |
-
|
101 |
-
logger.info(f"Generated dummy audio: {output_path}")
|
102 |
-
return output_path
|
103 |
-
|
104 |
-
def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
105 |
-
"""Generate a dummy sine wave audio stream
|
106 |
-
|
107 |
-
Args:
|
108 |
-
text (str): Input text (not used)
|
109 |
-
voice (str): Voice ID (not used)
|
110 |
-
speed (float): Speech speed multiplier (not used)
|
111 |
-
|
112 |
-
Yields:
|
113 |
-
tuple: (sample_rate, audio_data) pairs
|
114 |
-
"""
|
115 |
-
logger.info(f"Generating dummy speech stream for text length: {len(text)}")
|
116 |
-
|
117 |
-
# Generate a simple sine wave
|
118 |
-
sample_rate = 24000
|
119 |
-
duration = min(len(text) / 20, 10) # Rough approximation of speech duration
|
120 |
-
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
|
121 |
-
audio = 0.5 * np.sin(2 * np.pi * 440 * t) # 440 Hz sine wave
|
122 |
-
|
123 |
-
# Yield the audio data
|
124 |
-
yield sample_rate, audio
|
|
|
67 |
output_dir = os.path.join(os.getcwd(), "output")
|
68 |
os.makedirs(output_dir, exist_ok=True)
|
69 |
return os.path.join(output_dir, filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/tts_cosyvoice2.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
|
|
3 |
import soundfile as sf
|
4 |
from typing import Optional, Generator, Tuple
|
5 |
|
6 |
-
from utils.tts import TTSBase
|
7 |
|
8 |
# Configure logging
|
9 |
logger = logging.getLogger(__name__)
|
@@ -97,13 +97,13 @@ class CosyVoice2TTS(TTSBase):
|
|
97 |
|
98 |
# Check if CosyVoice2 is available
|
99 |
if not COSYVOICE2_AVAILABLE:
|
100 |
-
logger.
|
101 |
-
return
|
102 |
|
103 |
# Ensure model is loaded
|
104 |
if not self._ensure_model():
|
105 |
-
logger.
|
106 |
-
return
|
107 |
|
108 |
try:
|
109 |
import torch
|
@@ -130,14 +130,12 @@ class CosyVoice2TTS(TTSBase):
|
|
130 |
logger.info(f"CosyVoice2 audio generation complete: {output_path}")
|
131 |
return output_path
|
132 |
else:
|
133 |
-
logger.
|
134 |
-
|
135 |
-
return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
|
136 |
|
137 |
except Exception as e:
|
138 |
logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
|
139 |
-
|
140 |
-
return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
|
141 |
|
142 |
def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
143 |
"""Generate speech stream using CosyVoice2 TTS engine
|
@@ -154,14 +152,12 @@ class CosyVoice2TTS(TTSBase):
|
|
154 |
|
155 |
# Check if CosyVoice2 is available
|
156 |
if not COSYVOICE2_AVAILABLE:
|
157 |
-
logger.
|
158 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
159 |
return
|
160 |
|
161 |
# Ensure model is loaded
|
162 |
if not self._ensure_model():
|
163 |
-
logger.
|
164 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
165 |
return
|
166 |
|
167 |
try:
|
@@ -184,11 +180,9 @@ class CosyVoice2TTS(TTSBase):
|
|
184 |
logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
|
185 |
yield DEFAULT_SAMPLE_RATE, output_audio_np
|
186 |
else:
|
187 |
-
logger.
|
188 |
-
|
189 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
190 |
|
191 |
except Exception as e:
|
192 |
logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
|
193 |
-
|
194 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
|
|
3 |
import soundfile as sf
|
4 |
from typing import Optional, Generator, Tuple
|
5 |
|
6 |
+
from utils.tts import TTSBase
|
7 |
|
8 |
# Configure logging
|
9 |
logger = logging.getLogger(__name__)
|
|
|
97 |
|
98 |
# Check if CosyVoice2 is available
|
99 |
if not COSYVOICE2_AVAILABLE:
|
100 |
+
logger.error("CosyVoice2 TTS engine is not available")
|
101 |
+
return None
|
102 |
|
103 |
# Ensure model is loaded
|
104 |
if not self._ensure_model():
|
105 |
+
logger.error("Failed to load CosyVoice2 model")
|
106 |
+
return None
|
107 |
|
108 |
try:
|
109 |
import torch
|
|
|
130 |
logger.info(f"CosyVoice2 audio generation complete: {output_path}")
|
131 |
return output_path
|
132 |
else:
|
133 |
+
logger.error("CosyVoice2 model returned None for audio output")
|
134 |
+
return None
|
|
|
135 |
|
136 |
except Exception as e:
|
137 |
logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
|
138 |
+
return None
|
|
|
139 |
|
140 |
def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
141 |
"""Generate speech stream using CosyVoice2 TTS engine
|
|
|
152 |
|
153 |
# Check if CosyVoice2 is available
|
154 |
if not COSYVOICE2_AVAILABLE:
|
155 |
+
logger.error("CosyVoice2 TTS engine is not available")
|
|
|
156 |
return
|
157 |
|
158 |
# Ensure model is loaded
|
159 |
if not self._ensure_model():
|
160 |
+
logger.error("Failed to load CosyVoice2 model")
|
|
|
161 |
return
|
162 |
|
163 |
try:
|
|
|
180 |
logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
|
181 |
yield DEFAULT_SAMPLE_RATE, output_audio_np
|
182 |
else:
|
183 |
+
logger.error("CosyVoice2 model returned None for audio output")
|
184 |
+
return
|
|
|
185 |
|
186 |
except Exception as e:
|
187 |
logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
|
188 |
+
return
|
|
utils/tts_dia.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
|
|
3 |
import soundfile as sf
|
4 |
from typing import Optional, Generator, Tuple
|
5 |
|
6 |
-
from utils.tts import TTSBase
|
7 |
|
8 |
# Configure logging
|
9 |
logger = logging.getLogger(__name__)
|
@@ -98,13 +98,13 @@ class DiaTTS(TTSBase):
|
|
98 |
|
99 |
# Check if Dia is available
|
100 |
if not DIA_AVAILABLE:
|
101 |
-
logger.
|
102 |
-
return
|
103 |
|
104 |
# Ensure model is loaded
|
105 |
if not self._ensure_model():
|
106 |
-
logger.
|
107 |
-
return
|
108 |
|
109 |
try:
|
110 |
import torch
|
@@ -131,20 +131,18 @@ class DiaTTS(TTSBase):
|
|
131 |
logger.info(f"Dia audio generation complete: {output_path}")
|
132 |
return output_path
|
133 |
else:
|
134 |
-
logger.
|
135 |
-
|
136 |
-
return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
|
137 |
|
138 |
except ModuleNotFoundError as e:
|
139 |
if "dac" in str(e):
|
140 |
-
logger.
|
141 |
else:
|
142 |
logger.error(f"Module not found error in Dia TTS: {str(e)}")
|
143 |
-
return
|
144 |
except Exception as e:
|
145 |
logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
|
146 |
-
|
147 |
-
return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
|
148 |
|
149 |
def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
150 |
"""Generate speech stream using Dia TTS engine
|
@@ -161,14 +159,12 @@ class DiaTTS(TTSBase):
|
|
161 |
|
162 |
# Check if Dia is available
|
163 |
if not DIA_AVAILABLE:
|
164 |
-
logger.
|
165 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
166 |
return
|
167 |
|
168 |
# Ensure model is loaded
|
169 |
if not self._ensure_model():
|
170 |
-
logger.
|
171 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
172 |
return
|
173 |
|
174 |
try:
|
@@ -191,17 +187,15 @@ class DiaTTS(TTSBase):
|
|
191 |
logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
|
192 |
yield DEFAULT_SAMPLE_RATE, output_audio_np
|
193 |
else:
|
194 |
-
logger.
|
195 |
-
|
196 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
197 |
|
198 |
except ModuleNotFoundError as e:
|
199 |
if "dac" in str(e):
|
200 |
-
logger.
|
201 |
else:
|
202 |
logger.error(f"Module not found error in Dia TTS: {str(e)}")
|
203 |
-
|
204 |
except Exception as e:
|
205 |
logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
|
206 |
-
|
207 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
|
|
3 |
import soundfile as sf
|
4 |
from typing import Optional, Generator, Tuple
|
5 |
|
6 |
+
from utils.tts import TTSBase
|
7 |
|
8 |
# Configure logging
|
9 |
logger = logging.getLogger(__name__)
|
|
|
98 |
|
99 |
# Check if Dia is available
|
100 |
if not DIA_AVAILABLE:
|
101 |
+
logger.error("Dia TTS engine is not available")
|
102 |
+
return None
|
103 |
|
104 |
# Ensure model is loaded
|
105 |
if not self._ensure_model():
|
106 |
+
logger.error("Failed to load Dia model")
|
107 |
+
return None
|
108 |
|
109 |
try:
|
110 |
import torch
|
|
|
131 |
logger.info(f"Dia audio generation complete: {output_path}")
|
132 |
return output_path
|
133 |
else:
|
134 |
+
logger.error("Dia model returned None for audio output")
|
135 |
+
return None
|
|
|
136 |
|
137 |
except ModuleNotFoundError as e:
|
138 |
if "dac" in str(e):
|
139 |
+
logger.error("Dia TTS engine failed due to missing 'dac' module")
|
140 |
else:
|
141 |
logger.error(f"Module not found error in Dia TTS: {str(e)}")
|
142 |
+
return None
|
143 |
except Exception as e:
|
144 |
logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
|
145 |
+
return None
|
|
|
146 |
|
147 |
def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
148 |
"""Generate speech stream using Dia TTS engine
|
|
|
159 |
|
160 |
# Check if Dia is available
|
161 |
if not DIA_AVAILABLE:
|
162 |
+
logger.error("Dia TTS engine is not available")
|
|
|
163 |
return
|
164 |
|
165 |
# Ensure model is loaded
|
166 |
if not self._ensure_model():
|
167 |
+
logger.error("Failed to load Dia model")
|
|
|
168 |
return
|
169 |
|
170 |
try:
|
|
|
187 |
logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
|
188 |
yield DEFAULT_SAMPLE_RATE, output_audio_np
|
189 |
else:
|
190 |
+
logger.error("Dia model returned None for audio output")
|
191 |
+
return
|
|
|
192 |
|
193 |
except ModuleNotFoundError as e:
|
194 |
if "dac" in str(e):
|
195 |
+
logger.error("Dia TTS engine failed due to missing 'dac' module")
|
196 |
else:
|
197 |
logger.error(f"Module not found error in Dia TTS: {str(e)}")
|
198 |
+
return
|
199 |
except Exception as e:
|
200 |
logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
|
201 |
+
return
|
|
utils/tts_dummy.py
CHANGED
@@ -1,3 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
class DummyTTS(TTSBase):
|
2 |
"""Dummy TTS engine that generates sine wave audio
|
3 |
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
import numpy as np
|
5 |
+
import soundfile as sf
|
6 |
+
from typing import Optional, Generator, Tuple, List
|
7 |
+
from .tts_base import TTSBase
|
8 |
+
|
9 |
+
# Configure logging
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
|
13 |
class DummyTTS(TTSBase):
|
14 |
"""Dummy TTS engine that generates sine wave audio
|
15 |
|
utils/tts_kokoro.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
|
|
3 |
import soundfile as sf
|
4 |
from typing import Optional, Generator, Tuple
|
5 |
|
6 |
-
from utils.tts import TTSBase
|
7 |
|
8 |
# Configure logging
|
9 |
logger = logging.getLogger(__name__)
|
@@ -86,13 +86,13 @@ class KokoroTTS(TTSBase):
|
|
86 |
|
87 |
# Check if Kokoro is available
|
88 |
if not KOKORO_AVAILABLE:
|
89 |
-
logger.
|
90 |
-
return
|
91 |
|
92 |
# Ensure pipeline is loaded
|
93 |
if not self._ensure_pipeline():
|
94 |
-
logger.
|
95 |
-
return
|
96 |
|
97 |
try:
|
98 |
# Generate unique output path
|
@@ -109,8 +109,7 @@ class KokoroTTS(TTSBase):
|
|
109 |
return output_path
|
110 |
except Exception as e:
|
111 |
logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
|
112 |
-
|
113 |
-
return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
|
114 |
|
115 |
def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
116 |
"""Generate speech stream using Kokoro TTS engine
|
@@ -127,14 +126,12 @@ class KokoroTTS(TTSBase):
|
|
127 |
|
128 |
# Check if Kokoro is available
|
129 |
if not KOKORO_AVAILABLE:
|
130 |
-
logger.
|
131 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
132 |
return
|
133 |
|
134 |
# Ensure pipeline is loaded
|
135 |
if not self._ensure_pipeline():
|
136 |
-
logger.
|
137 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
138 |
return
|
139 |
|
140 |
try:
|
@@ -144,5 +141,4 @@ class KokoroTTS(TTSBase):
|
|
144 |
yield 24000, audio
|
145 |
except Exception as e:
|
146 |
logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
|
147 |
-
|
148 |
-
yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
|
|
|
3 |
import soundfile as sf
|
4 |
from typing import Optional, Generator, Tuple
|
5 |
|
6 |
+
from utils.tts import TTSBase
|
7 |
|
8 |
# Configure logging
|
9 |
logger = logging.getLogger(__name__)
|
|
|
86 |
|
87 |
# Check if Kokoro is available
|
88 |
if not KOKORO_AVAILABLE:
|
89 |
+
logger.error("Kokoro TTS engine is not available")
|
90 |
+
return None
|
91 |
|
92 |
# Ensure pipeline is loaded
|
93 |
if not self._ensure_pipeline():
|
94 |
+
logger.error("Failed to load Kokoro pipeline")
|
95 |
+
return None
|
96 |
|
97 |
try:
|
98 |
# Generate unique output path
|
|
|
109 |
return output_path
|
110 |
except Exception as e:
|
111 |
logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
|
112 |
+
return None
|
|
|
113 |
|
114 |
def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
|
115 |
"""Generate speech stream using Kokoro TTS engine
|
|
|
126 |
|
127 |
# Check if Kokoro is available
|
128 |
if not KOKORO_AVAILABLE:
|
129 |
+
logger.error("Kokoro TTS engine is not available")
|
|
|
130 |
return
|
131 |
|
132 |
# Ensure pipeline is loaded
|
133 |
if not self._ensure_pipeline():
|
134 |
+
logger.error("Failed to load Kokoro pipeline")
|
|
|
135 |
return
|
136 |
|
137 |
try:
|
|
|
141 |
yield 24000, audio
|
142 |
except Exception as e:
|
143 |
logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
|
144 |
+
return
|
|