Michael Hu committed
Commit 6825e46 · 1 Parent(s): 8b93773

remove all tts providers

app.py CHANGED
@@ -248,7 +248,7 @@ def create_interface():
             ),
             gr.Dropdown(
                 choices=config['voices'],
-                value="kokoro",
+                value="chatterbox",
                 label="Voice"
             ),
             gr.Slider(
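
Note: the new default only works because config['voices'] (see the audio_processing_service.py hunk below) now contains 'chatterbox'. A more defensive variant, shown here as a hypothetical sketch rather than the app's actual code, derives the default from the choices list so the Dropdown never receives a value outside its options:

    import gradio as gr

    def build_voice_dropdown(config: dict) -> gr.Dropdown:
        """Hypothetical helper: prefer 'chatterbox' when present, else the first voice."""
        voices = config['voices']
        default_voice = "chatterbox" if "chatterbox" in voices else voices[0]
        return gr.Dropdown(choices=voices, value=default_voice, label="Voice")
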
pyproject.toml CHANGED
@@ -9,7 +9,7 @@ license = {text = "MIT"}
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "gradio>=5.25.2",
+    "gradio>=4.44.0,<5.0.0",
     "nltk>=3.8",
     "librosa>=0.10",
     "ffmpeg-python>=0.2",
@@ -20,13 +20,12 @@ dependencies = [
     "munch>=2.5",
     "accelerate>=1.2.0",
     "soundfile>=0.13.0",
-    "kokoro>=0.7.9",
     "ordered-set>=4.1.0",
     "phonemizer-fork>=3.3.2",
     "nemo_toolkit[asr]",
     "faster-whisper>=1.1.1",
     "chatterbox-tts",
-    "YouTokenToMe = { git = "https://github.com/LahiLuk/YouTokenToMe", branch = "main" }"
+    "YouTokenToMe @ git+https://github.com/LahiLuk/YouTokenToMe@main"
 ]
 
 [project.optional-dependencies]
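
The dropped YouTokenToMe line mixed Poetry's table syntax into a PEP 621 dependencies array, which must contain plain PEP 508 requirement strings; the replacement uses the direct-reference form "name @ git+URL@ref". A quick way to check that such strings parse is the packaging library (a sketch, assuming packaging is installed):

    from packaging.requirements import InvalidRequirement, Requirement

    for spec in (
        "gradio>=4.44.0,<5.0.0",
        "YouTokenToMe @ git+https://github.com/LahiLuk/YouTokenToMe@main",
    ):
        try:
            req = Requirement(spec)  # raises InvalidRequirement on bad syntax
            print(req.name, req.specifier or req.url)
        except InvalidRequirement as exc:
            print(f"invalid requirement {spec!r}: {exc}")
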
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-gradio>=5.25.2
+gradio>=4.44.0,<5.0.0
 nltk>=3.8
 librosa>=0.10
 ffmpeg-python>=0.2
@@ -9,9 +9,8 @@ scipy>=1.11
 munch>=2.5
 accelerate>=1.2.0
 soundfile>=0.13.0
-kokoro>=0.7.9
 ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
+nemo_toolkit[asr]
 faster-whisper>=1.1.1
-chatterbox-tts
-nemo_toolkit[asr]
+chatterbox-tts
 
src/application/services/audio_processing_service.py CHANGED
@@ -635,7 +635,7 @@ class AudioProcessingApplicationService:
         """
         return {
             'asr_models': ['parakeet', 'whisper-small', 'whisper-medium', 'whisper-large'],
-            'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
+            'voices': ['chatterbox'],
             'languages': [
                 'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh',
                 'ar', 'hi', 'tr', 'pl', 'nl', 'sv', 'da', 'no', 'fi'
src/application/services/configuration_service.py CHANGED
@@ -294,7 +294,7 @@ class ConfigurationApplicationService:
         Raises:
             ConfigurationException: If validation fails
         """
-        valid_providers = ['kokoro', 'dia', 'cosyvoice2', 'dummy']
+        valid_providers = ['chatterbox', 'dummy']
         valid_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
 
         for key, value in updates.items():
@@ -515,7 +515,7 @@ class ConfigurationApplicationService:
 
         # Check TTS providers
         tts_factory = self._container.resolve(type(self._container._get_tts_factory()))
-        for provider in ['kokoro', 'dia', 'cosyvoice2', 'dummy']:
+        for provider in ['chatterbox', 'dummy']:
             try:
                 tts_factory.create_provider(provider)
                 availability['tts'][provider] = True
src/infrastructure/config/app_config.py CHANGED
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class TTSConfig:
     """Configuration for TTS providers."""
-    preferred_providers: List[str] = field(default_factory=lambda: ['kokoro', 'dia', 'cosyvoice2', 'dummy'])
+    preferred_providers: List[str] = field(default_factory=lambda: ['chatterbox', 'dummy'])
     default_voice: str = 'default'
     default_speed: float = 1.0
     default_language: str = 'en'
src/infrastructure/tts/cosyvoice2_provider.py DELETED
@@ -1,207 +0,0 @@
-"""CosyVoice2 TTS provider implementation."""
-
-import logging
-import numpy as np
-import soundfile as sf
-import io
-from typing import Iterator, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
-
-from ..base.tts_provider_base import TTSProviderBase
-from ...domain.exceptions import SpeechSynthesisException
-
-logger = logging.getLogger(__name__)
-
-# Flag to track CosyVoice2 availability
-COSYVOICE2_AVAILABLE = False
-DEFAULT_SAMPLE_RATE = 24000
-
-# Try to import CosyVoice2 dependencies
-try:
-    import torch
-    import torchaudio
-    # Import CosyVoice2 from the correct package
-    # Based on https://github.com/FunAudioLLM/CosyVoice
-    from cosyvoice.cli.cosyvoice import CosyVoice
-    COSYVOICE2_AVAILABLE = True
-    logger.info("CosyVoice2 TTS engine is available")
-except ImportError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
-    COSYVOICE2_AVAILABLE = False
-except ModuleNotFoundError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
-    COSYVOICE2_AVAILABLE = False
-
-
-class CosyVoice2TTSProvider(TTSProviderBase):
-    """CosyVoice2 TTS provider implementation."""
-
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the CosyVoice2 TTS provider."""
-        super().__init__(
-            provider_name="CosyVoice2",
-            supported_languages=['en', 'z']  # CosyVoice2 supports English and multilingual
-        )
-        self.lang_code = lang_code
-        self.model = None
-
-    def _ensure_model(self):
-        """Ensure the model is loaded."""
-        if self.model is None and COSYVOICE2_AVAILABLE:
-            try:
-                logger.info("Loading CosyVoice2 model...")
-                import torch
-                import torchaudio
-                from cosyvoice.cli.cosyvoice import CosyVoice
-
-                # Initialize CosyVoice with the correct model path
-                # You may need to adjust the model path based on your installation
-                self.model = CosyVoice('pretrained_models/CosyVoice-300M')
-                logger.info("CosyVoice2 model successfully loaded")
-            except ImportError as e:
-                logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}", exception=e)
-                self.model = None
-            except FileNotFoundError as e:
-                logger.error(f"Failed to load CosyVoice2 model files: {str(e)}", exception=e)
-                self.model = None
-            except Exception as e:
-                logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}", exception=e)
-                self.model = None
-
-        model_available = self.model is not None
-        logger.info(f"CosyVoice2 model availability check: {model_available}")
-        return model_available
-
-    def is_available(self) -> bool:
-        """Check if CosyVoice2 TTS is available."""
-        return COSYVOICE2_AVAILABLE and self._ensure_model()
-
-    def get_available_voices(self) -> list[str]:
-        """Get available voices for CosyVoice2."""
-        # CosyVoice2 typically uses a default voice
-        return ['default']
-
-    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
-        """Generate audio using CosyVoice2 TTS."""
-        logger.info("Starting CosyVoice2 audio generation")
-
-        if not self.is_available():
-            logger.error("CosyVoice2 TTS engine is not available")
-            raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-            logger.info(f"CosyVoice2 generating audio for text length: {len(text)}")
-            logger.info(f"Voice settings: voice_id={request.voice_settings.voice_id}, speed={request.voice_settings.speed}")
-
-            # Generate audio using CosyVoice2
-            logger.info("Starting CosyVoice2 model inference")
-
-            # CosyVoice API - using inference method
-            # The model expects text and returns audio tensor
-            try:
-                # Use the inference method from CosyVoice
-                output_audio_tensor = self.model.inference_sft(text, '中文女')
-
-                # Convert tensor to numpy array
-                if isinstance(output_audio_tensor, torch.Tensor):
-                    output_audio_np = output_audio_tensor.cpu().numpy()
-                else:
-                    output_audio_np = output_audio_tensor
-
-                logger.info("CosyVoice2 model inference completed")
-            except Exception as api_error:
-                logger.error(f"CosyVoice2 API error: {str(api_error)}")
-                # Try alternative API if the first one fails
-                try:
-                    logger.info("Trying alternative CosyVoice2 API")
-                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
-                    if isinstance(output_audio_tensor, torch.Tensor):
-                        output_audio_np = output_audio_tensor.cpu().numpy()
-                    else:
-                        output_audio_np = output_audio_tensor
-                    logger.info("CosyVoice2 alternative API succeeded")
-                except Exception as alt_error:
-                    logger.error(f"CosyVoice2 alternative API also failed: {str(alt_error)}")
-                    raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
-
-            if output_audio_np is None:
-                logger.error("CosyVoice2 model returned None for audio output")
-                raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
-
-            logger.info(f"CosyVoice2 generated audio array shape: {output_audio_np.shape if hasattr(output_audio_np, 'shape') else 'unknown'}")
-
-            # Convert numpy array to bytes
-            logger.info("Converting CosyVoice2 audio to bytes")
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            logger.info(f"CosyVoice2 audio conversion completed, bytes length: {len(audio_bytes)}")
-
-            return audio_bytes, DEFAULT_SAMPLE_RATE
-
-        except Exception as e:
-            logger.error(f"CosyVoice2 audio generation failed: {str(e)}", exception=e)
-            self._handle_provider_error(e, "audio generation")
-
-    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
-        """Generate audio stream using CosyVoice2 TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-
-            # Generate audio using CosyVoice2
-            try:
-                # Use the inference method from CosyVoice
-                output_audio_tensor = self.model.inference_sft(text, '中文女')
-
-                # Convert tensor to numpy array
-                if isinstance(output_audio_tensor, torch.Tensor):
-                    output_audio_np = output_audio_tensor.cpu().numpy()
-                else:
-                    output_audio_np = output_audio_tensor
-            except Exception as api_error:
-                # Try alternative API if the first one fails
-                try:
-                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
-                    if isinstance(output_audio_tensor, torch.Tensor):
-                        output_audio_np = output_audio_tensor.cpu().numpy()
-                    else:
-                        output_audio_np = output_audio_tensor
-                except Exception as alt_error:
-                    raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
-
-            if output_audio_np is None:
-                raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
-
-            # Convert numpy array to bytes
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            # CosyVoice2 generates complete audio in one go
-            yield audio_bytes, DEFAULT_SAMPLE_RATE, True
-
-        except Exception as e:
-            self._handle_provider_error(e, "streaming audio generation")
-
-    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
-        """Convert numpy audio array to bytes."""
-        try:
-            # Create an in-memory buffer
-            buffer = io.BytesIO()
-
-            # Write audio data to buffer as WAV
-            sf.write(buffer, audio_array, sample_rate, format='WAV')
-
-            # Get bytes from buffer
-            buffer.seek(0)
-            return buffer.read()
-
-        except Exception as e:
-            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
src/infrastructure/tts/dia_provider.py DELETED
@@ -1,229 +0,0 @@
-"""Dia TTS provider implementation."""
-
-import logging
-import numpy as np
-import soundfile as sf
-import io
-from typing import Iterator, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
-
-from ..base.tts_provider_base import TTSProviderBase
-from ...domain.exceptions import SpeechSynthesisException
-
-
-logger = logging.getLogger(__name__)
-
-# Flag to track Dia availability
-DIA_AVAILABLE = False
-DEFAULT_SAMPLE_RATE = 24000
-
-# Try to import Dia dependencies
-def _check_dia_dependencies():
-    """Check if Dia dependencies are available."""
-    global DIA_AVAILABLE
-
-    logger.info("🔍 Checking Dia TTS dependencies...")
-
-    try:
-        logger.info("Attempting to import torch...")
-        import torch
-        logger.info("✓ Successfully imported torch")
-
-        logger.info("Attempting to import dia.model...")
-        from dia.model import Dia
-        logger.info("✓ Successfully imported dia.model")
-
-        DIA_AVAILABLE = True
-        logger.info("✅ Dia TTS engine is available")
-        return True
-    except ImportError as e:
-        logger.warning(f"⚠️ Dia TTS engine dependencies not available: {e}")
-        logger.info(f"ImportError details: {type(e).__name__}: {e}")
-        DIA_AVAILABLE = False
-        return False
-    except ModuleNotFoundError as e:
-        if "dac" in str(e):
-            logger.warning("❌ Dia TTS engine is not available due to missing 'dac' module")
-            logger.info("Please install descript-audio-codec: pip install descript-audio-codec")
-        elif "dia" in str(e):
-            logger.warning("❌ Dia TTS engine is not available due to missing 'dia' module")
-            logger.info("Please install dia: pip install git+https://github.com/nari-labs/dia.git")
-        else:
-            logger.warning(f"❌ Dia TTS engine is not available: {str(e)}")
-        logger.info(f"ModuleNotFoundError details: {type(e).__name__}: {e}")
-        DIA_AVAILABLE = False
-        return False
-
-# Initial check
-logger.info("🚀 Initializing Dia TTS provider...")
-_check_dia_dependencies()
-
-
-class DiaTTSProvider(TTSProviderBase):
-    """Dia TTS provider implementation."""
-
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the Dia TTS provider."""
-        super().__init__(
-            provider_name="Dia",
-            supported_languages=['en', 'z']  # Dia supports English and multilingual
-        )
-        self.lang_code = lang_code
-        self.model = None
-
-    def _ensure_model(self):
-        """Ensure the model is loaded."""
-        global DIA_AVAILABLE
-
-        if self.model is None:
-            logger.info("🔄 Ensuring Dia model is loaded...")
-
-            # If Dia is not available, check dependencies again
-            if not DIA_AVAILABLE:
-                logger.info("⚠️ Dia not available, checking dependencies again...")
-                if _check_dia_dependencies():
-                    DIA_AVAILABLE = True
-                    logger.info("✅ Dependencies are now available")
-                else:
-                    logger.error("❌ Dependencies still not available")
-                    return False
-
-            if DIA_AVAILABLE:
-                try:
-                    logger.info("📥 Loading Dia model from pretrained...")
-                    import torch
-                    from dia.model import Dia
-                    self.model = Dia.from_pretrained()
-                    logger.info("🎉 Dia model successfully loaded")
-                except ImportError as e:
-                    logger.error(f"❌ Failed to import Dia dependencies: {str(e)}")
-                    self.model = None
-                except FileNotFoundError as e:
-                    logger.error(f"❌ Failed to load Dia model files: {str(e)}")
-                    logger.info("ℹ️ This might be the first time loading the model. It will be downloaded automatically.")
-                    self.model = None
-                except Exception as e:
-                    logger.error(f"❌ Failed to initialize Dia model: {str(e)}")
-                    logger.info(f"Model initialization error: {type(e).__name__}: {e}")
-                    self.model = None
-
-        is_available = self.model is not None
-        logger.info(f"Model availability check result: {is_available}")
-        return is_available
-
-    def is_available(self) -> bool:
-        """Check if Dia TTS is available."""
-        logger.info(f"🔍 Checking Dia availability: DIA_AVAILABLE={DIA_AVAILABLE}")
-
-        if not DIA_AVAILABLE:
-            logger.info("❌ Dia dependencies not available")
-            return False
-
-        model_available = self._ensure_model()
-        logger.info(f"🔍 Model availability: {model_available}")
-
-        result = DIA_AVAILABLE and model_available
-        logger.info(f"🎯 Dia TTS availability result: {result}")
-        return result
-
-    def get_available_voices(self) -> list[str]:
-        """Get available voices for Dia."""
-        # Dia typically uses a default voice
-        return ['default']
-
-    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
-        """Generate audio using Dia TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Dia TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-
-            # Generate audio using Dia
-            with torch.inference_mode():
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-
-            if output_audio_np is None:
-                raise SpeechSynthesisException("Dia model returned None for audio output")
-
-            # Convert numpy array to bytes
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            return audio_bytes, DEFAULT_SAMPLE_RATE
-
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                raise SpeechSynthesisException("Dia TTS engine failed due to missing 'dac' module") from e
-            else:
-                self._handle_provider_error(e, "audio generation")
-        except Exception as e:
-            self._handle_provider_error(e, "audio generation")
-
-    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
-        """Generate audio stream using Dia TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Dia TTS engine is not available")
-
-        try:
-            import torch
-
-            # Extract parameters from request
-            text = request.text_content.text
-
-            # Generate audio using Dia
-            with torch.inference_mode():
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    cfg_filter_top_k=35,
-                    use_torch_compile=False,
-                    verbose=False
-                )
-
-            if output_audio_np is None:
-                raise SpeechSynthesisException("Dia model returned None for audio output")
-
-            # Convert numpy array to bytes
-            audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
-            # Dia generates complete audio in one go
-            yield audio_bytes, DEFAULT_SAMPLE_RATE, True
-
-        except ModuleNotFoundError as e:
-            if "dac" in str(e):
-                raise SpeechSynthesisException("Dia TTS engine failed due to missing 'dac' module") from e
-            else:
-                self._handle_provider_error(e, "streaming audio generation")
-        except Exception as e:
-            self._handle_provider_error(e, "streaming audio generation")
-
-    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
-        """Convert numpy audio array to bytes."""
-        try:
-            # Create an in-memory buffer
-            buffer = io.BytesIO()
-
-            # Write audio data to buffer as WAV
-            sf.write(buffer, audio_array, sample_rate, format='WAV')
-
-            # Get bytes from buffer
-            buffer.seek(0)
-            return buffer.read()
-
-        except Exception as e:
-            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
src/infrastructure/tts/kokoro_provider.py DELETED
@@ -1,131 +0,0 @@
-"""Kokoro TTS provider implementation."""
-
-import logging
-import numpy as np
-import soundfile as sf
-import io
-from typing import Iterator, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
-
-from ..base.tts_provider_base import TTSProviderBase
-from ...domain.exceptions import SpeechSynthesisException
-
-logger = logging.getLogger(__name__)
-
-# Flag to track Kokoro availability
-KOKORO_AVAILABLE = False
-
-# Try to import Kokoro
-try:
-    from kokoro import KPipeline
-    KOKORO_AVAILABLE = True
-    logger.info("Kokoro TTS engine is available")
-except ImportError:
-    logger.warning("Kokoro TTS engine is not available")
-except Exception as e:
-    logger.error(f"Kokoro import failed with unexpected error: {str(e)}")
-    KOKORO_AVAILABLE = False
-
-
-class KokoroTTSProvider(TTSProviderBase):
-    """Kokoro TTS provider implementation."""
-
-    def __init__(self, lang_code: str = 'z'):
-        """Initialize the Kokoro TTS provider."""
-        super().__init__(
-            provider_name="Kokoro",
-            supported_languages=['en', 'z']  # Kokoro supports English and multilingual
-        )
-        self.lang_code = lang_code
-        self.pipeline = None
-
-    def _ensure_pipeline(self):
-        """Ensure the pipeline is loaded."""
-        if self.pipeline is None and KOKORO_AVAILABLE:
-            try:
-                self.pipeline = KPipeline(lang_code=self.lang_code)
-                logger.info("Kokoro pipeline successfully loaded")
-            except Exception as e:
-                logger.error(f"Failed to initialize Kokoro pipeline: {str(e)}")
-                self.pipeline = None
-        return self.pipeline is not None
-
-    def is_available(self) -> bool:
-        """Check if Kokoro TTS is available."""
-        return KOKORO_AVAILABLE and self._ensure_pipeline()
-
-    def get_available_voices(self) -> list[str]:
-        """Get available voices for Kokoro."""
-        # Common Kokoro voices based on the original implementation
-        return [
-            'af_heart', 'af_bella', 'af_sarah', 'af_nicole',
-            'am_adam', 'am_michael', 'bf_emma', 'bf_isabella'
-        ]
-
-    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
-        """Generate audio using Kokoro TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Kokoro TTS engine is not available")
-
-        try:
-            # Extract parameters from request
-            text = request.text_content.text
-            voice = request.voice_settings.voice_id
-            speed = request.voice_settings.speed
-
-            # Generate speech using Kokoro
-            generator = self.pipeline(text, voice=voice, speed=speed)
-
-            for _, _, audio in generator:
-                # Convert numpy array to bytes
-                audio_bytes = self._numpy_to_bytes(audio, sample_rate=24000)
-                return audio_bytes, 24000
-
-            raise SpeechSynthesisException("Kokoro failed to generate audio")
-
-        except Exception as e:
-            self._handle_provider_error(e, "audio generation")
-
-    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
-        """Generate audio stream using Kokoro TTS."""
-        if not self.is_available():
-            raise SpeechSynthesisException("Kokoro TTS engine is not available")
-
-        try:
-            # Extract parameters from request
-            text = request.text_content.text
-            voice = request.voice_settings.voice_id
-            speed = request.voice_settings.speed
-
-            # Generate speech stream using Kokoro
-            generator = self.pipeline(text, voice=voice, speed=speed)
-
-            chunk_count = 0
-            for _, _, audio in generator:
-                chunk_count += 1
-                # Convert numpy array to bytes
-                audio_bytes = self._numpy_to_bytes(audio, sample_rate=24000)
-                # Assume this is the final chunk for now (Kokoro typically generates one chunk)
-                is_final = True
-                yield audio_bytes, 24000, is_final
-
-        except Exception as e:
-            self._handle_provider_error(e, "streaming audio generation")
-
-    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
-        """Convert numpy audio array to bytes."""
-        try:
-            # Create an in-memory buffer
-            buffer = io.BytesIO()
-
-            # Write audio data to buffer as WAV
-            sf.write(buffer, audio_array, sample_rate, format='WAV')
-
-            # Get bytes from buffer
-            buffer.seek(0)
-            return buffer.read()
-
-        except Exception as e:
-            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
src/infrastructure/tts/provider_factory.py CHANGED
@@ -25,40 +25,7 @@ class TTSProviderFactory:
         from .dummy_provider import DummyTTSProvider
         self._providers['dummy'] = DummyTTSProvider
 
-        # Try to register Kokoro provider
-        try:
-            from .kokoro_provider import KokoroTTSProvider
-            self._providers['kokoro'] = KokoroTTSProvider
-            logger.info("Registered Kokoro TTS provider")
-        except ImportError as e:
-            logger.info(f"Kokoro TTS provider not available: {e}")
-
-        # Try to register Dia provider
-        try:
-            from .dia_provider import DiaTTSProvider
-            self._providers['dia'] = DiaTTSProvider
-            logger.info("Registered Dia TTS provider")
-        except ImportError as e:
-            logger.warning(f"Dia TTS provider not available: {e}")
-            # Still register it so it can attempt installation later
-            try:
-                from .dia_provider import DiaTTSProvider
-                self._providers['dia'] = DiaTTSProvider
-                logger.info("Registered Dia TTS provider (dependencies may be installed on demand)")
-            except Exception:
-                logger.warning("Failed to register Dia TTS provider")
-        except Exception as e:
-            logger.warning(f"Failed to register Dia TTS provider: {e}")
-
-        # Try to register CosyVoice2 provider
-        try:
-            from .cosyvoice2_provider import CosyVoice2TTSProvider
-            self._providers['cosyvoice2'] = CosyVoice2TTSProvider
-            logger.info("Registered CosyVoice2 TTS provider")
-        except ImportError as e:
-            logger.info(f"CosyVoice2 TTS provider not available: {e}")
-
-        # Try to register Chatterbox provider
+        # Register only Chatterbox provider
         try:
             from .chatterbox_provider import ChatterboxTTSProvider
             self._providers['chatterbox'] = ChatterboxTTSProvider
@@ -77,14 +44,7 @@ class TTSProviderFactory:
         # Create instance if not cached
         if name not in self._provider_instances:
             logger.info(f"Creating instance for {name} provider")
-            if name == 'kokoro':
-                self._provider_instances[name] = provider_class()
-            elif name == 'dia':
-                logger.info(f"🔧 Creating Dia TTS provider instance...")
-                self._provider_instances[name] = provider_class()
-            elif name == 'cosyvoice2':
-                self._provider_instances[name] = provider_class()
-            elif name == 'chatterbox':
+            if name == 'chatterbox':
                 self._provider_instances[name] = provider_class()
             else:
                 self._provider_instances[name] = provider_class()
@@ -134,8 +94,8 @@ class TTSProviderFactory:
         provider_class = self._providers[provider_name]
 
         # Create instance with appropriate parameters
-        if provider_name in ['kokoro', 'dia', 'cosyvoice2', 'chatterbox']:
-            lang_code = kwargs.get('lang_code', 'en' if provider_name == 'chatterbox' else 'z')
+        if provider_name == 'chatterbox':
+            lang_code = kwargs.get('lang_code', 'en')
             provider = provider_class(lang_code=lang_code)
         else:
             provider = provider_class(**kwargs)
@@ -166,7 +126,7 @@ class TTSProviderFactory:
             SpeechSynthesisException: If no providers are available
         """
         if preferred_providers is None:
-            preferred_providers = ['kokoro', 'dia', 'cosyvoice2', 'chatterbox', 'dummy']
+            preferred_providers = ['chatterbox', 'dummy']
 
         logger.info(f"🔄 Getting TTS provider with fallback, preferred order: {preferred_providers}")
         available_providers = self.get_available_providers()
@@ -214,7 +174,7 @@ class TTSProviderFactory:
         # Create instance if not cached
        if provider_name not in self._provider_instances:
            provider_class = self._providers[provider_name]
-            if provider_name in ['kokoro', 'dia', 'cosyvoice2', 'chatterbox']:
+            if provider_name == 'chatterbox':
                 self._provider_instances[provider_name] = provider_class()
             else:
                 self._provider_instances[provider_name] = provider_class()
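
With only 'chatterbox' and 'dummy' registered, the fallback path reduces to trying Chatterbox and dropping to the dummy provider. A minimal usage sketch follows; create_provider and is_available are visible in this commit, while the import path and the no-argument constructor are assumptions, not verified against the full module:

    # Assumed import path based on the file location in this repo.
    from src.infrastructure.tts.provider_factory import TTSProviderFactory

    factory = TTSProviderFactory()  # assumed no-arg constructor

    # Mirror the new preferred order: try Chatterbox first, then fall back to dummy.
    provider = None
    for name in ['chatterbox', 'dummy']:
        try:
            candidate = factory.create_provider(name)
            if candidate.is_available():
                provider = candidate
                break
        except Exception:
            continue  # provider could not be created; try the next one

    print(f"Selected TTS provider: {type(provider).__name__ if provider else None}")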