Michael Hu committed
Commit e22e786 · 1 Parent(s): ae641cf

remove fallback to Dummy TTS

Files changed:
- utils/tts_base.py       +0  -55
- utils/tts_cosyvoice2.py +13 -19
- utils/tts_dia.py        +17 -23
- utils/tts_dummy.py      +12 -0
- utils/tts_kokoro.py     +9  -13
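
With the DummyTTS fallback removed, an engine that is unavailable or fails now logs an error and returns None (or yields nothing) instead of silently producing sine-wave audio. A minimal caller-side sketch of the new contract follows; the module and class names come from this repo, but the constructor arguments and error handling shown here are assumptions, not part of the commit:

# Hypothetical caller sketch, not part of this commit: generate_speech() may
# now return None, so callers handle failure themselves instead of receiving
# dummy sine-wave audio.
from utils.tts_kokoro import KokoroTTS  # module/class names taken from this repo

tts = KokoroTTS()  # constructor arguments are assumed
audio_path = tts.generate_speech("Hello there", voice="af_heart", speed=1.0)
if audio_path is None:
    raise RuntimeError("Kokoro TTS unavailable or failed; no DummyTTS fallback anymore")
print(f"Audio written to {audio_path}")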
utils/tts_base.py
CHANGED
@@ -67,58 +67,3 @@ class TTSBase(ABC):
         output_dir = os.path.join(os.getcwd(), "output")
         os.makedirs(output_dir, exist_ok=True)
         return os.path.join(output_dir, filename)
-
-
-class DummyTTS(TTSBase):
-    """Dummy TTS engine that generates sine wave audio
-
-    This class is used as a fallback when no other TTS engine is available.
-    """
-
-    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> str:
-        """Generate a dummy sine wave audio file
-
-        Args:
-            text (str): Input text (not used)
-            voice (str): Voice ID (not used)
-            speed (float): Speech speed multiplier (not used)
-
-        Returns:
-            str: Path to the generated audio file
-        """
-        logger.info(f"Generating dummy speech for text length: {len(text)}")
-
-        # Generate a simple sine wave
-        sample_rate = 24000
-        duration = min(len(text) / 20, 10)  # Rough approximation of speech duration
-        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
-        audio = 0.5 * np.sin(2 * np.pi * 440 * t)  # 440 Hz sine wave
-
-        # Save to file
-        output_path = self._generate_output_path(prefix="dummy")
-        sf.write(output_path, audio, sample_rate)
-
-        logger.info(f"Generated dummy audio: {output_path}")
-        return output_path
-
-    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
-        """Generate a dummy sine wave audio stream
-
-        Args:
-            text (str): Input text (not used)
-            voice (str): Voice ID (not used)
-            speed (float): Speech speed multiplier (not used)
-
-        Yields:
-            tuple: (sample_rate, audio_data) pairs
-        """
-        logger.info(f"Generating dummy speech stream for text length: {len(text)}")
-
-        # Generate a simple sine wave
-        sample_rate = 24000
-        duration = min(len(text) / 20, 10)  # Rough approximation of speech duration
-        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
-        audio = 0.5 * np.sin(2 * np.pi * 440 * t)  # 440 Hz sine wave
-
-        # Yield the audio data
-        yield sample_rate, audio

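The DummyTTS class removed above is not deleted from the project: per the file list, it lives on in utils/tts_dummy.py (diffed further below). A sketch, under that assumption, of how code that still wants the sine-wave engine would opt in explicitly; the lang_code argument is inferred from the DummyTTS(self.lang_code) calls removed in the engine files:

# Sketch only: explicit opt-in to the dummy engine now that it is no longer an
# automatic fallback. Import path follows utils/tts_dummy.py below.
from utils.tts_dummy import DummyTTS

dummy = DummyTTS("a")                     # placeholder language code (assumed)
wav_path = dummy.generate_speech("test")  # writes a 440 Hz sine wave and returns its path
print(wav_path)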
utils/tts_cosyvoice2.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
 import soundfile as sf
 from typing import Optional, Generator, Tuple
 
-from utils.tts import TTSBase
+from utils.tts import TTSBase
 
 # Configure logging
 logger = logging.getLogger(__name__)
@@ -97,13 +97,13 @@ class CosyVoice2TTS(TTSBase):
 
         # Check if CosyVoice2 is available
         if not COSYVOICE2_AVAILABLE:
-            logger.
-            return
+            logger.error("CosyVoice2 TTS engine is not available")
+            return None
 
         # Ensure model is loaded
         if not self._ensure_model():
-            logger.
-            return
+            logger.error("Failed to load CosyVoice2 model")
+            return None
 
         try:
             import torch
@@ -130,14 +130,12 @@ class CosyVoice2TTS(TTSBase):
                 logger.info(f"CosyVoice2 audio generation complete: {output_path}")
                 return output_path
             else:
-                logger.
-
-                return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+                logger.error("CosyVoice2 model returned None for audio output")
+                return None
 
         except Exception as e:
             logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
-
-            return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+            return None
 
     def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
         """Generate speech stream using CosyVoice2 TTS engine
@@ -154,14 +152,12 @@ class CosyVoice2TTS(TTSBase):
 
         # Check if CosyVoice2 is available
         if not COSYVOICE2_AVAILABLE:
-            logger.
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            logger.error("CosyVoice2 TTS engine is not available")
             return
 
         # Ensure model is loaded
         if not self._ensure_model():
-            logger.
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            logger.error("Failed to load CosyVoice2 model")
             return
 
         try:
@@ -184,11 +180,9 @@ class CosyVoice2TTS(TTSBase):
                 logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                 yield DEFAULT_SAMPLE_RATE, output_audio_np
             else:
-                logger.
-
-                yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+                logger.error("CosyVoice2 model returned None for audio output")
+                return
 
         except Exception as e:
             logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
-
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            return

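For the streaming path, the removed "yield from DummyTTS(...)" calls mean an unavailable or failing engine now produces an empty generator rather than dummy audio chunks. A hedged consumer sketch, with constructor arguments assumed:

# Hypothetical consumer sketch: when CosyVoice2 is unavailable or generation
# fails, generate_speech_stream() logs an error and returns without yielding,
# so the chunk list is simply empty.
from utils.tts_cosyvoice2 import CosyVoice2TTS

tts = CosyVoice2TTS()  # constructor arguments are assumed
chunks = list(tts.generate_speech_stream("Hello", voice="default", speed=1.0))
if not chunks:
    print("No audio chunks produced (engine unavailable or generation failed)")
for sample_rate, audio in chunks:
    print(f"chunk of {audio.shape} samples at {sample_rate} Hz")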
utils/tts_dia.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
 import soundfile as sf
 from typing import Optional, Generator, Tuple
 
-from utils.tts import TTSBase
+from utils.tts import TTSBase
 
 # Configure logging
 logger = logging.getLogger(__name__)
@@ -98,13 +98,13 @@ class DiaTTS(TTSBase):
 
         # Check if Dia is available
         if not DIA_AVAILABLE:
-            logger.
-            return
+            logger.error("Dia TTS engine is not available")
+            return None
 
         # Ensure model is loaded
         if not self._ensure_model():
-            logger.
-            return
+            logger.error("Failed to load Dia model")
+            return None
 
         try:
             import torch
@@ -131,20 +131,18 @@ class DiaTTS(TTSBase):
                 logger.info(f"Dia audio generation complete: {output_path}")
                 return output_path
             else:
-                logger.
-
-                return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+                logger.error("Dia model returned None for audio output")
+                return None
 
         except ModuleNotFoundError as e:
             if "dac" in str(e):
-                logger.
+                logger.error("Dia TTS engine failed due to missing 'dac' module")
             else:
                 logger.error(f"Module not found error in Dia TTS: {str(e)}")
-            return
+            return None
         except Exception as e:
             logger.error(f"Error generating speech with Dia: {str(e)}", exc_info=True)
-
-            return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+            return None
 
     def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
         """Generate speech stream using Dia TTS engine
@@ -161,14 +159,12 @@ class DiaTTS(TTSBase):
 
         # Check if Dia is available
         if not DIA_AVAILABLE:
-            logger.
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            logger.error("Dia TTS engine is not available")
             return
 
         # Ensure model is loaded
         if not self._ensure_model():
-            logger.
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            logger.error("Failed to load Dia model")
             return
 
         try:
@@ -191,17 +187,15 @@ class DiaTTS(TTSBase):
                 logger.info(f"Successfully generated audio with Dia (length: {len(output_audio_np)})")
                 yield DEFAULT_SAMPLE_RATE, output_audio_np
             else:
-                logger.
-
-                yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+                logger.error("Dia model returned None for audio output")
+                return
 
         except ModuleNotFoundError as e:
             if "dac" in str(e):
-                logger.
+                logger.error("Dia TTS engine failed due to missing 'dac' module")
             else:
                 logger.error(f"Module not found error in Dia TTS: {str(e)}")
-
+            return
         except Exception as e:
             logger.error(f"Error generating speech stream with Dia: {str(e)}", exc_info=True)
-
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            return

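The Dia handlers keep their special case for a missing 'dac' module but now return instead of falling back. A small sketch of the kind of dependency probe that check implies; which package provides 'dac' is not stated in this diff, so this is only an illustration:

# Sketch: probe for the optional 'dac' module that Dia's error handling checks
# for; if it is absent, DiaTTS.generate_speech() logs an error and returns None
# rather than producing dummy audio.
import importlib.util

if importlib.util.find_spec("dac") is None:
    print("'dac' module not importable; Dia TTS will fail with a logged error")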
utils/tts_dummy.py
CHANGED
@@ -1,3 +1,15 @@
+import logging
+import os
+import time
+import numpy as np
+import soundfile as sf
+from typing import Optional, Generator, Tuple, List
+from .tts_base import TTSBase
+
+# Configure logging
+logger = logging.getLogger(__name__)
+
+
 class DummyTTS(TTSBase):
     """Dummy TTS engine that generates sine wave audio
 

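For reference, the audio DummyTTS produces is fully determined by the constants shown in the removed tts_base.py block above (and now in this module): a 440 Hz sine wave at 24 kHz, len(text)/20 seconds long, capped at 10 seconds. A worked example of that math:

# Worked example using the values from DummyTTS as shown in the diff above.
import numpy as np

text = "x" * 100
sample_rate = 24000
duration = min(len(text) / 20, 10)         # 100 characters -> 5.0 seconds
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440 * t)  # amplitude-0.5, 440 Hz tone
print(audio.shape)                         # (120000,) samples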
utils/tts_kokoro.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
 import soundfile as sf
 from typing import Optional, Generator, Tuple
 
-from utils.tts import TTSBase
+from utils.tts import TTSBase
 
 # Configure logging
 logger = logging.getLogger(__name__)
@@ -86,13 +86,13 @@ class KokoroTTS(TTSBase):
 
         # Check if Kokoro is available
         if not KOKORO_AVAILABLE:
-            logger.
-            return
+            logger.error("Kokoro TTS engine is not available")
+            return None
 
         # Ensure pipeline is loaded
         if not self._ensure_pipeline():
-            logger.
-            return
+            logger.error("Failed to load Kokoro pipeline")
+            return None
 
         try:
             # Generate unique output path
@@ -109,8 +109,7 @@ class KokoroTTS(TTSBase):
             return output_path
         except Exception as e:
             logger.error(f"Error generating speech with Kokoro: {str(e)}", exc_info=True)
-
-            return DummyTTS(self.lang_code).generate_speech(text, voice, speed)
+            return None
 
     def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
         """Generate speech stream using Kokoro TTS engine
@@ -127,14 +126,12 @@ class KokoroTTS(TTSBase):
 
         # Check if Kokoro is available
         if not KOKORO_AVAILABLE:
-            logger.
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            logger.error("Kokoro TTS engine is not available")
             return
 
         # Ensure pipeline is loaded
         if not self._ensure_pipeline():
-            logger.
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            logger.error("Failed to load Kokoro pipeline")
             return
 
         try:
@@ -144,5 +141,4 @@ class KokoroTTS(TTSBase):
                 yield 24000, audio
         except Exception as e:
             logger.error(f"Error generating speech stream with Kokoro: {str(e)}", exc_info=True)
-
-            yield from DummyTTS(self.lang_code).generate_speech_stream(text, voice, speed)
+            return