Michael Hu committed
Commit 6825e46 · 1 Parent(s): 8b93773

remove all tts providers

app.py CHANGED
@@ -248,7 +248,7 @@ def create_interface():
             ),
             gr.Dropdown(
                 choices=config['voices'],
-                value="kokoro",
+                value="chatterbox",
                 label="Voice"
             ),
             gr.Slider(
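
Note: the new default only works because config['voices'] (see the audio_processing_service.py hunk below) now contains 'chatterbox'. A more defensive variant, shown here as a hypothetical sketch rather than the app's actual code, derives the default from the choices list so the Dropdown never receives a value outside its options:

    import gradio as gr

    def build_voice_dropdown(config: dict) -> gr.Dropdown:
        """Hypothetical helper: prefer 'chatterbox' when present, else the first voice."""
        voices = config['voices']
        default_voice = "chatterbox" if "chatterbox" in voices else voices[0]
        return gr.Dropdown(choices=voices, value=default_voice, label="Voice")
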
pyproject.toml CHANGED
@@ -9,7 +9,7 @@ license = {text = "MIT"}
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "gradio>=5.25.2",
+    "gradio>=4.44.0,<5.0.0",
     "nltk>=3.8",
     "librosa>=0.10",
     "ffmpeg-python>=0.2",
@@ -20,13 +20,12 @@ dependencies = [
     "munch>=2.5",
     "accelerate>=1.2.0",
     "soundfile>=0.13.0",
-    "kokoro>=0.7.9",
     "ordered-set>=4.1.0",
     "phonemizer-fork>=3.3.2",
     "nemo_toolkit[asr]",
     "faster-whisper>=1.1.1",
     "chatterbox-tts",
-    "YouTokenToMe = { git = "https://github.com/LahiLuk/YouTokenToMe", branch = "main" }"
+    "YouTokenToMe @ git+https://github.com/LahiLuk/YouTokenToMe@main"
 ]
 
 [project.optional-dependencies]
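
The dropped YouTokenToMe line mixed Poetry's table syntax into a PEP 621 dependencies array, which must contain plain PEP 508 requirement strings; the replacement uses the direct-reference form "name @ git+URL@ref". A quick way to check that such strings parse is the packaging library (a sketch, assuming packaging is installed):

    from packaging.requirements import InvalidRequirement, Requirement

    for spec in (
        "gradio>=4.44.0,<5.0.0",
        "YouTokenToMe @ git+https://github.com/LahiLuk/YouTokenToMe@main",
    ):
        try:
            req = Requirement(spec)  # raises InvalidRequirement on bad syntax
            print(req.name, req.specifier or req.url)
        except InvalidRequirement as exc:
            print(f"invalid requirement {spec!r}: {exc}")
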
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-gradio>=5.25.2
+gradio>=4.44.0,<5.0.0
 nltk>=3.8
 librosa>=0.10
 ffmpeg-python>=0.2
@@ -9,9 +9,8 @@ scipy>=1.11
 munch>=2.5
 accelerate>=1.2.0
 soundfile>=0.13.0
-kokoro>=0.7.9
 ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
+nemo_toolkit[asr]
 faster-whisper>=1.1.1
-chatterbox-tts
-nemo_toolkit[asr]
+chatterbox-tts
 
src/application/services/audio_processing_service.py CHANGED
@@ -635,7 +635,7 @@ class AudioProcessingApplicationService:
         """
         return {
             'asr_models': ['parakeet', 'whisper-small', 'whisper-medium', 'whisper-large'],
-            'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
+            'voices': ['chatterbox'],
             'languages': [
                 'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh',
                 'ar', 'hi', 'tr', 'pl', 'nl', 'sv', 'da', 'no', 'fi'
src/application/services/configuration_service.py CHANGED
@@ -294,7 +294,7 @@ class ConfigurationApplicationService:
         Raises:
             ConfigurationException: If validation fails
         """
-        valid_providers = ['kokoro', 'dia', 'cosyvoice2', 'dummy']
+        valid_providers = ['chatterbox', 'dummy']
         valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
 
         for key, value in updates.items():
@@ -515,7 +515,7 @@ class ConfigurationApplicationService:
 
         # Check TTS providers
         tts_factory = self._container.resolve(type(self._container._get_tts_factory()))
-        for provider in ['kokoro', 'dia', 'cosyvoice2', 'dummy']:
+        for provider in ['chatterbox', 'dummy']:
             try:
                 tts_factory.create_provider(provider)
                 availability['tts'][provider] = True
src/infrastructure/config/app_config.py CHANGED
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class TTSConfig:
     """Configuration for TTS providers."""
-    preferred_providers: List[str] = field(default_factory=lambda: ['kokoro', 'dia', 'cosyvoice2', 'dummy'])
+    preferred_providers: List[str] = field(default_factory=lambda: ['chatterbox', 'dummy'])
     default_voice: str = 'default'
     default_speed: float = 1.0
     default_language: str = 'en'
src/infrastructure/tts/cosyvoice2_provider.py DELETED
@@ -1,207 +0,0 @@
-"""CosyVoice2 TTS provider implementation."""
-
-import logging
-import numpy as np
-import soundfile as sf
-import io
-from typing import Iterator, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
-
-from ..base.tts_provider_base import TTSProviderBase
-from ...domain.exceptions import SpeechSynthesisException
-
-logger = logging.getLogger(__name__)
-
-# Flag to track CosyVoice2 availability
-COSYVOICE2_AVAILABLE = False
-DEFAULT_SAMPLE_RATE = 24000
-
-# Try to import CosyVoice2 dependencies
-try:
-    import torch
-    import torchaudio
-    # Import CosyVoice2 from the correct package
-    # Based on https://github.com/FunAudioLLM/CosyVoice
-    from cosyvoice.cli.cosyvoice import CosyVoice
-    COSYVOICE2_AVAILABLE = True
-    logger.info("CosyVoice2 TTS engine is available")
-except ImportError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
-    COSYVOICE2_AVAILABLE = False
-except ModuleNotFoundError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
-    COSYVOICE2_AVAILABLE = False
-
-
-class CosyVoice2TTSProvider(TTSProviderBase):
-    """CosyVoice2 TTS provider implementation."""
-
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the CosyVoice2 TTS provider."""
-        super().__init__(
-            provider_name="CosyVoice2",
-            supported_languages=['en', 'z']  # CosyVoice2 supports English and multilingual
-        )
-        self.lang_code = lang_code
-        self.model = None
-
-    def _ensure_model(self):
-        """Ensure the model is loaded."""
-        if self.model is None and COSYVOICE2_AVAILABLE:
-            try:
-                logger.info("Loading CosyVoice2 model...")
-                import torch
-                import torchaudio
-                from cosyvoice.cli.cosyvoice import CosyVoice
-
-                # Initialize CosyVoice with the correct model path
-                # You may need to adjust the model path based on your installation
-                self.model = CosyVoice('pretrained_models/CosyVoice-300M')
-                logger.info("CosyVoice2 model successfully loaded")
-            except ImportError as e:
-                logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}", exception=e)
-                self.model = None
-            except FileNotFoundError as e:
-                logger.error(f"Failed to load CosyVoice2 model files: {str(e)}", exception=e)
-                self.model = None
-            except Exception as e:
-                logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}", exception=e)
-                self.model = None
-
-        model_available = self.model is not None
-        logger.info(f"CosyVoice2 model availability check: {model_available}")
-        return model_available
-
-    def is_available(self) -> bool:
-        """Check if CosyVoice2 TTS is available."""
-        return COSYVOICE2_AVAILABLE and self._ensure_model()
-
-    def get_available_voices(self) -> list[str]:
-        """Get available voices for CosyVoice2."""
-        # CosyVoice2 typically uses a default voice
-        return ['default']
-
-    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
-        """Generate audio using CosyVoice2 TTS."""
-        logger.info("Starting CosyVoice2 audio generation")
-
-        if not self.is_available():
-            logger.error("CosyVoice2 TTS engine is not available")
-            raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-            logger.info(f"CosyVoice2 generating audio for text length: {len(text)}")
-            logger.info(f"Voice settings: voice_id={request.voice_settings.voice_id}, speed={request.voice_settings.speed}")
-
-            # Generate audio using CosyVoice2
-            logger.info("Starting CosyVoice2 model inference")
-
-            # CosyVoice API - using inference method
-            # The model expects text and returns audio tensor
-            try:
-                # Use the inference method from CosyVoice
-                output_audio_tensor = self.model.inference_sft(text, '中文女')
-
-                # Convert tensor to numpy array
-                if isinstance(output_audio_tensor, torch.Tensor):
-                    output_audio_np = output_audio_tensor.cpu().numpy()
-                else:
-                    output_audio_np = output_audio_tensor
-
-                logger.info("CosyVoice2 model inference completed")
-            except Exception as api_error:
-                logger.error(f"CosyVoice2 API error: {str(api_error)}")
-                # Try alternative API if the first one fails
-                try:
-                    logger.info("Trying alternative CosyVoice2 API")
-                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
-                    if isinstance(output_audio_tensor, torch.Tensor):
-                        output_audio_np = output_audio_tensor.cpu().numpy()
-                    else:
-                        output_audio_np = output_audio_tensor
-                    logger.info("CosyVoice2 alternative API succeeded")
-                except Exception as alt_error:
-                    logger.error(f"CosyVoice2 alternative API also failed: {str(alt_error)}")
-                    raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
-
-            if output_audio_np is None:
-                logger.error("CosyVoice2 model returned None for audio output")
-                raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
-
-            logger.info(f"CosyVoice2 generated audio array shape: {output_audio_np.shape if hasattr(output_audio_np, 'shape') else 'unknown'}")
-
-            # Convert numpy array to bytes
-            logger.info("Converting CosyVoice2 audio to bytes")
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            logger.info(f"CosyVoice2 audio conversion completed, bytes length: {len(audio_bytes)}")
-
-            return audio_bytes, DEFAULT_SAMPLE_RATE
-
-        except Exception as e:
-            logger.error(f"CosyVoice2 audio generation failed: {str(e)}", exception=e)
-            self._handle_provider_error(e, "audio generation")
-
-    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
-        """Generate audio stream using CosyVoice2 TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-
-            # Generate audio using CosyVoice2
-            try:
-                # Use the inference method from CosyVoice
-                output_audio_tensor = self.model.inference_sft(text, '中文女')
-
-                # Convert tensor to numpy array
-                if isinstance(output_audio_tensor, torch.Tensor):
-                    output_audio_np = output_audio_tensor.cpu().numpy()
-                else:
-                    output_audio_np = output_audio_tensor
-            except Exception as api_error:
-                # Try alternative API if the first one fails
-                try:
-                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
-                    if isinstance(output_audio_tensor, torch.Tensor):
-                        output_audio_np = output_audio_tensor.cpu().numpy()
-                    else:
-                        output_audio_np = output_audio_tensor
-                except Exception as alt_error:
-                    raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
-
-            if output_audio_np is None:
-                raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
-
-            # Convert numpy array to bytes
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            # CosyVoice2 generates complete audio in one go
-            yield audio_bytes, DEFAULT_SAMPLE_RATE, True
-
-        except Exception as e:
-            self._handle_provider_error(e, "streaming audio generation")
-
-    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
-        """Convert numpy audio array to bytes."""
-        try:
-            # Create an in-memory buffer
-            buffer = io.BytesIO()
-
-            # Write audio data to buffer as WAV
-            sf.write(buffer, audio_array, sample_rate, format='WAV')
-
-            # Get bytes from buffer
-            buffer.seek(0)
-            return buffer.read()
-
-        except Exception as e:
-            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
src/infrastructure/tts/dia_provider.py DELETED
@@ -1,229 +0,0 @@
-"""Dia TTS provider implementation."""
-
-import logging
-import numpy as np
-import soundfile as sf
-import io
-from typing import Iterator, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
-
-from ..base.tts_provider_base import TTSProviderBase
-from ...domain.exceptions import SpeechSynthesisException
-
-
-logger = logging.getLogger(__name__)
-
-# Flag to track Dia availability
-DIA_AVAILABLE = False
-DEFAULT_SAMPLE_RATE = 24000
-
-# Try to import Dia dependencies
-def _check_dia_dependencies():
-    """Check if Dia dependencies are available."""
-    global DIA_AVAILABLE
-
-    logger.info("🔍 Checking Dia TTS dependencies...")
-
-    try:
-        logger.info("Attempting to import torch...")
-        import torch
-        logger.info("✓ Successfully imported torch")
-
-        logger.info("Attempting to import dia.model...")
-        from dia.model import Dia
-        logger.info("✓ Successfully imported dia.model")
-
-        DIA_AVAILABLE = True
-        logger.info("✅ Dia TTS engine is available")
-        return True
-    except ImportError as e:
-        logger.warning(f"⚠️ Dia TTS engine dependencies not available: {e}")
-        logger.info(f"ImportError details: {type(e).__name__}: {e}")
-        DIA_AVAILABLE = False
-        return False
-    except ModuleNotFoundError as e:
-        if "dac" in str(e):
-            logger.warning("❌ Dia TTS engine is not available due to missing 'dac' module")
-            logger.info("Please install descript-audio-codec: pip install descript-audio-codec")
-        elif "dia" in str(e):
-            logger.warning("❌ Dia TTS engine is not available due to missing 'dia' module")
-            logger.info("Please install dia: pip install git+https://github.com/nari-labs/dia.git")
-        else:
-            logger.warning(f"❌ Dia TTS engine is not available: {str(e)}")
-        logger.info(f"ModuleNotFoundError details: {type(e).__name__}: {e}")
-        DIA_AVAILABLE = False
-        return False
-
-# Initial check
-logger.info("🚀 Initializing Dia TTS provider...")
-_check_dia_dependencies()
-
-
-class DiaTTSProvider(TTSProviderBase):
-    """Dia TTS provider implementation."""
-
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the Dia TTS provider."""
-        super().__init__(
-            provider_name="Dia",
-            supported_languages=['en', 'z']  # Dia supports English and multilingual
-        )
-        self.lang_code = lang_code
-        self.model = None
-
-    def _ensure_model(self):
-        """Ensure the model is loaded."""
-        global DIA_AVAILABLE
-
-        if self.model is None:
-            logger.info("🔄 Ensuring Dia model is loaded...")
-
-            # If Dia is not available, check dependencies again
-            if not DIA_AVAILABLE:
-                logger.info("⚠️ Dia not available, checking dependencies again...")
-                if _check_dia_dependencies():
-                    DIA_AVAILABLE = True
-                    logger.info("✅ Dependencies are now available")
-                else:
-                    logger.error("❌ Dependencies still not available")
-                    return False
-
-            if DIA_AVAILABLE:
-                try:
-                    logger.info("📥 Loading Dia model from pretrained...")
-                    import torch
-                    from dia.model import Dia
-                    self.model = Dia.from_pretrained()
-                    logger.info("🎉 Dia model successfully loaded")
-                except ImportError as e:
-                    logger.error(f"❌ Failed to import Dia dependencies: {str(e)}")
-                    self.model = None
-                except FileNotFoundError as e:
-                    logger.error(f"❌ Failed to load Dia model files: {str(e)}")
-                    logger.info("ℹ️ This might be the first time loading the model. It will be downloaded automatically.")
-                    self.model = None
-                except Exception as e:
-                    logger.error(f"❌ Failed to initialize Dia model: {str(e)}")
-                    logger.info(f"Model initialization error: {type(e).__name__}: {e}")
-                    self.model = None
-
-        is_available = self.model is not None
-        logger.info(f"Model availability check result: {is_available}")
-        return is_available
-
-    def is_available(self) -> bool:
-        """Check if Dia TTS is available."""
-        logger.info(f"🔍 Checking Dia availability: DIA_AVAILABLE={DIA_AVAILABLE}")
-
-        if not DIA_AVAILABLE:
-            logger.info("❌ Dia dependencies not available")
-            return False
-
-        model_available = self._ensure_model()
-        logger.info(f"🔍 Model availability: {model_available}")
-
-        result = DIA_AVAILABLE and model_available
-        logger.info(f"🎯 Dia TTS availability result: {result}")
-        return result
-
-    def get_available_voices(self) -> list[str]:
-        """Get available voices for Dia."""
-        # Dia typically uses a default voice
-        return ['default']
-
-    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
-        """Generate audio using Dia TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Dia TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-
-            # Generate audio using Dia
-            with torch.inference_mode():
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-
-            if output_audio_np is None:
-                raise SpeechSynthesisException("Dia model returned None for audio output")
-
-            # Convert numpy array to bytes
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            return audio_bytes, DEFAULT_SAMPLE_RATE
-
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                raise SpeechSynthesisException("Dia TTS engine failed due to missing 'dac' module") from e
-            else:
-                self._handle_provider_error(e, "audio generation")
-        except Exception as e:
-            self._handle_provider_error(e, "audio generation")
-
-    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
-        """Generate audio stream using Dia TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Dia TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-
-            # Generate audio using Dia
-            with torch.inference_mode():
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-
-            if output_audio_np is None:
-                raise SpeechSynthesisException("Dia model returned None for audio output")
-
-            # Convert numpy array to bytes
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            # Dia generates complete audio in one go
-            yield audio_bytes, DEFAULT_SAMPLE_RATE, True
-
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                raise SpeechSynthesisException("Dia TTS engine failed due to missing 'dac' module") from e
-            else:
-                self._handle_provider_error(e, "streaming audio generation")
-        except Exception as e:
-            self._handle_provider_error(e, "streaming audio generation")
-
-    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
-        """Convert numpy audio array to bytes."""
-        try:
-            # Create an in-memory buffer
-            buffer = io.BytesIO()
-
-            # Write audio data to buffer as WAV
-            sf.write(buffer, audio_array, sample_rate, format='WAV')
-
-            # Get bytes from buffer
-            buffer.seek(0)
-            return buffer.read()
-
-        except Exception as e:
-            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
src/infrastructure/tts/kokoro_provider.py DELETED
@@ -1,131 +0,0 @@
-"""Kokoro TTS provider implementation."""
-
-import logging
-import numpy as np
-import soundfile as sf
-import io
-from typing import Iterator, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
-
-from ..base.tts_provider_base import TTSProviderBase
-from ...domain.exceptions import SpeechSynthesisException
-
-logger = logging.getLogger(__name__)
-
-# Flag to track Kokoro availability
-KOKORO_AVAILABLE = False
-
-# Try to import Kokoro
-try:
-    from kokoro import KPipeline
-    KOKORO_AVAILABLE = True
-    logger.info("Kokoro TTS engine is available")
-except ImportError:
-    logger.warning("Kokoro TTS engine is not available")
-except Exception as e:
-    logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
-    KOKORO_AVAILABLE = False
-
-
-class KokoroTTSProvider(TTSProviderBase):
-    """Kokoro TTS provider implementation."""
-
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the Kokoro TTS provider."""
-        super().__init__(
-            provider_name="Kokoro",
-            supported_languages=['en', 'z']  # Kokoro supports English and multilingual
-        )
-        self.lang_code = lang_code
-        self.pipeline = None
-
-    def _ensure_pipeline(self):
-        """Ensure the pipeline is loaded."""
-        if self.pipeline is None and KOKORO_AVAILABLE:
-            try:
-                self.pipeline = KPipeline(lang_code=self.lang_code)
-                logger.info("Kokoro pipeline successfully loaded")
-            except Exception as e:
-                logger.error(f"Failed to initialize Kokoro pipeline: {str(e)}")
-                self.pipeline = None
-        return self.pipeline is not None
-
-    def is_available(self) -> bool:
-        """Check if Kokoro TTS is available."""
-        return KOKORO_AVAILABLE and self._ensure_pipeline()
-
-    def get_available_voices(self) -> list[str]:
-        """Get available voices for Kokoro."""
-        # Common Kokoro voices based on the original implementation
-        return [
-            'af_heart', 'af_bella', 'af_sarah', 'af_nicole',
-            'am_adam', 'am_michael', 'bf_emma', 'bf_isabella'
-        ]
-
-    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
-        """Generate audio using Kokoro TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Kokoro TTS engine is not available")
-
-        try:
-            # Extract parameters from request
-            text = request.text_content.text
-            voice = request.voice_settings.voice_id
-            speed = request.voice_settings.speed
-
-            # Generate speech using Kokoro
-            generator = self.pipeline(text, voice=voice, speed=speed)
-
-            for _, _, audio in generator:
-                # Convert numpy array to bytes
-                audio_bytes = self._numpy_to_bytes(audio, sample_rate=24000)
-                return audio_bytes, 24000
-
-            raise SpeechSynthesisException("Kokoro failed to generate audio")
-
-        except Exception as e:
-            self._handle_provider_error(e, "audio generation")
-
-    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
-        """Generate audio stream using Kokoro TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Kokoro TTS engine is not available")
-
-        try:
-            # Extract parameters from request
-            text = request.text_content.text
-            voice = request.voice_settings.voice_id
-            speed = request.voice_settings.speed
-
-            # Generate speech stream using Kokoro
-            generator = self.pipeline(text, voice=voice, speed=speed)
-
-            chunk_count = 0
-            for _, _, audio in generator:
-                chunk_count += 1
-                # Convert numpy array to bytes
-                audio_bytes = self._numpy_to_bytes(audio, sample_rate=24000)
-                # Assume this is the final chunk for now (Kokoro typically generates one chunk)
-                is_final = True
-                yield audio_bytes, 24000, is_final
-
-        except Exception as e:
-            self._handle_provider_error(e, "streaming audio generation")
-
-    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
-        """Convert numpy audio array to bytes."""
-        try:
-            # Create an in-memory buffer
-            buffer = io.BytesIO()
-
-            # Write audio data to buffer as WAV
-            sf.write(buffer, audio_array, sample_rate, format='WAV')
-
-            # Get bytes from buffer
-            buffer.seek(0)
-            return buffer.read()
-
-        except Exception as e:
-            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
src/infrastructure/tts/provider_factory.py CHANGED
@@ -25,40 +25,7 @@ class TTSProviderFactory:
         from .dummy_provider import DummyTTSProvider
         self._providers['dummy'] = DummyTTSProvider
 
-        # Try to register Kokoro provider
-        try:
-            from .kokoro_provider import KokoroTTSProvider
-            self._providers['kokoro'] = KokoroTTSProvider
-            logger.info("Registered Kokoro TTS provider")
-        except ImportError as e:
-            logger.info(f"Kokoro TTS provider not available: {e}")
-
-        # Try to register Dia provider
-        try:
-            from .dia_provider import DiaTTSProvider
-            self._providers['dia'] = DiaTTSProvider
-            logger.info("Registered Dia TTS provider")
-        except ImportError as e:
-            logger.warning(f"Dia TTS provider not available: {e}")
-            # Still register it so it can attempt installation later
-            try:
-                from .dia_provider import DiaTTSProvider
-                self._providers['dia'] = DiaTTSProvider
-                logger.info("Registered Dia TTS provider (dependencies may be installed on demand)")
-            except Exception:
-                logger.warning("Failed to register Dia TTS provider")
-        except Exception as e:
-            logger.warning(f"Failed to register Dia TTS provider: {e}")
-
-        # Try to register CosyVoice2 provider
-        try:
-            from .cosyvoice2_provider import CosyVoice2TTSProvider
-            self._providers['cosyvoice2'] = CosyVoice2TTSProvider
-            logger.info("Registered CosyVoice2 TTS provider")
-        except ImportError as e:
-            logger.info(f"CosyVoice2 TTS provider not available: {e}")
-
-        # Try to register Chatterbox provider
+        # Register only Chatterbox provider
         try:
             from .chatterbox_provider import ChatterboxTTSProvider
             self._providers['chatterbox'] = ChatterboxTTSProvider
@@ -77,14 +44,7 @@ class TTSProviderFactory:
         # Create instance if not cached
         if name not in self._provider_instances:
             logger.info(f"Creating instance for {name} provider")
-            if name == 'kokoro':
-                self._provider_instances[name] = provider_class()
-            elif name == 'dia':
-                logger.info(f"🔧 Creating Dia TTS provider instance...")
-                self._provider_instances[name] = provider_class()
-            elif name == 'cosyvoice2':
-                self._provider_instances[name] = provider_class()
-            elif name == 'chatterbox':
+            if name == 'chatterbox':
                 self._provider_instances[name] = provider_class()
             else:
                 self._provider_instances[name] = provider_class()
@@ -134,8 +94,8 @@ class TTSProviderFactory:
         provider_class = self._providers[provider_name]
 
         # Create instance with appropriate parameters
-        if provider_name in ['kokoro', 'dia', 'cosyvoice2', 'chatterbox']:
-            lang_code = kwargs.get('lang_code', 'en' if provider_name == 'chatterbox' else 'z')
+        if provider_name == 'chatterbox':
+            lang_code = kwargs.get('lang_code', 'en')
             provider = provider_class(lang_code=lang_code)
         else:
             provider = provider_class(**kwargs)
@@ -166,7 +126,7 @@ class TTSProviderFactory:
             SpeechSynthesisException: If no providers are available
         """
         if preferred_providers is None:
-            preferred_providers = ['kokoro', 'dia', 'cosyvoice2', 'chatterbox', 'dummy']
+            preferred_providers = ['chatterbox', 'dummy']
 
         logger.info(f"🔄 Getting TTS provider with fallback, preferred order: {preferred_providers}")
         available_providers = self.get_available_providers()
@@ -214,7 +174,7 @@ class TTSProviderFactory:
         # Create instance if not cached
        if provider_name not in self._provider_instances:
            provider_class = self._providers[provider_name]
-            if provider_name in ['kokoro', 'dia', 'cosyvoice2', 'chatterbox']:
+            if provider_name == 'chatterbox':
                 self._provider_instances[provider_name] = provider_class()
             else:
                 self._provider_instances[provider_name] = provider_class()
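
With only 'chatterbox' and 'dummy' registered, the fallback path reduces to trying Chatterbox and dropping to the dummy provider. A minimal usage sketch follows; create_provider and is_available are visible in this commit, while the import path and the no-argument constructor are assumptions, not verified against the full module:

    # Assumed import path based on the file location in this repo.
    from src.infrastructure.tts.provider_factory import TTSProviderFactory

    factory = TTSProviderFactory()  # assumed no-arg constructor

    # Mirror the new preferred order: try Chatterbox first, then fall back to dummy.
    provider = None
    for name in ['chatterbox', 'dummy']:
        try:
            candidate = factory.create_provider(name)
            if candidate.is_available():
                provider = candidate
                break
        except Exception:
            continue  # provider could not be created; try the next one

    print(f"Selected TTS provider: {type(provider).__name__ if provider else None}")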