Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

App Files Files Community

Michael Hu committed on Jul 29

Commit

0f99c8d

1 Parent(s): c7f7521

add chatterbox

Browse files

Files changed (7) hide show

pyproject.toml +1 -1
requirements.txt +1 -1
src/domain/models/text_content.py +17 -17
src/domain/models/voice_settings.py +42 -21
src/infrastructure/tts/__init__.py +8 -2
src/infrastructure/tts/chatterbox_provider.py +201 -0
src/infrastructure/tts/provider_factory.py +14 -4

pyproject.toml CHANGED Viewed

@@ -25,7 +25,7 @@ dependencies = [
     "phonemizer-fork>=3.3.2",
     "nemo_toolkit[asr]",
     "faster-whisper>=1.1.1",
-    "descript-audio-codec>=1.0.0"
 ]
 [project.optional-dependencies]

     "phonemizer-fork>=3.3.2",
     "nemo_toolkit[asr]",
     "faster-whisper>=1.1.1",
+    "chatterbox-tts"
 ]
 [project.optional-dependencies]

requirements.txt CHANGED Viewed

@@ -14,4 +14,4 @@ ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
 nemo_toolkit[asr]
 faster-whisper>=1.1.1
-descript-audio-codec>=1.0.0

 phonemizer-fork>=3.3.2
 nemo_toolkit[asr]
 faster-whisper>=1.1.1
+chatterbox-tts

src/domain/models/text_content.py CHANGED Viewed

@@ -8,71 +8,71 @@ import re
 @dataclass(frozen=True)
 class TextContent:
     """Value object representing text content with language and encoding information."""
     text: str
     language: str
     encoding: str = 'utf-8'
     def __post_init__(self):
         """Validate text content after initialization."""
         self._validate()
     def _validate(self):
         """Validate text content properties."""
         if not isinstance(self.text, str):
             raise TypeError("Text must be a string")
         if not self.text.strip():
             raise ValueError("Text content cannot be empty or whitespace only")
         if len(self.text) > 50000:  # Reasonable limit for TTS processing
             raise ValueError("Text content too long (maximum 50,000 characters)")
         if not isinstance(self.language, str):
             raise TypeError("Language must be a string")
         if not self.language.strip():
             raise ValueError("Language cannot be empty")
         # Validate language code format (ISO 639-1 or ISO 639-3)
         if not re.match(r'^[a-z]{2,3}(-[A-Z]{2})?$', self.language):
             raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")
         if not isinstance(self.encoding, str):
             raise TypeError("Encoding must be a string")
         if self.encoding not in ['utf-8', 'utf-16', 'ascii', 'latin-1']:
             raise ValueError(f"Unsupported encoding: {self.encoding}. Supported: utf-8, utf-16, ascii, latin-1")
         # Validate that text can be encoded with specified encoding
         try:
             self.text.encode(self.encoding)
         except UnicodeEncodeError:
             raise ValueError(f"Text cannot be encoded with {self.encoding} encoding")
     @property
     def word_count(self) -> int:
         """Get the approximate word count of the text."""
         return len(self.text.split())
     @property
     def character_count(self) -> int:
         """Get the character count of the text."""
         return len(self.text)
     @property
     def is_empty(self) -> bool:
         """Check if the text content is effectively empty."""
         return not self.text.strip()
     def truncate(self, max_length: int) -> 'TextContent':
         """Create a new TextContent with truncated text."""
         if max_length <= 0:
             raise ValueError("Max length must be positive")
         if len(self.text) <= max_length:
             return self
         truncated_text = self.text[:max_length].rstrip()
         return TextContent(
             text=truncated_text,

 @dataclass(frozen=True)
 class TextContent:
     """Value object representing text content with language and encoding information."""
     text: str
     language: str
     encoding: str = 'utf-8'
     def __post_init__(self):
         """Validate text content after initialization."""
         self._validate()
     def _validate(self):
         """Validate text content properties."""
         if not isinstance(self.text, str):
             raise TypeError("Text must be a string")
         if not self.text.strip():
             raise ValueError("Text content cannot be empty or whitespace only")
         if len(self.text) > 50000:  # Reasonable limit for TTS processing
             raise ValueError("Text content too long (maximum 50,000 characters)")
         if not isinstance(self.language, str):
             raise TypeError("Language must be a string")
         if not self.language.strip():
             raise ValueError("Language cannot be empty")
         # Validate language code format (ISO 639-1 or ISO 639-3)
         if not re.match(r'^[a-z]{2,3}(-[A-Z]{2})?$', self.language):
             raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")
         if not isinstance(self.encoding, str):
             raise TypeError("Encoding must be a string")
         if self.encoding not in ['utf-8', 'utf-16', 'ascii', 'latin-1']:
             raise ValueError(f"Unsupported encoding: {self.encoding}. Supported: utf-8, utf-16, ascii, latin-1")
         # Validate that text can be encoded with specified encoding
         try:
             self.text.encode(self.encoding)
         except UnicodeEncodeError:
             raise ValueError(f"Text cannot be encoded with {self.encoding} encoding")
     @property
     def word_count(self) -> int:
         """Get the approximate word count of the text."""
         return len(self.text.split())
     @property
     def character_count(self) -> int:
         """Get the character count of the text."""
         return len(self.text)
     @property
     def is_empty(self) -> bool:
         """Check if the text content is effectively empty."""
         return not self.text.strip()
     def truncate(self, max_length: int) -> 'TextContent':
         """Create a new TextContent with truncated text."""
         if max_length <= 0:
             raise ValueError("Max length must be positive")
         if len(self.text) <= max_length:
             return self
         truncated_text = self.text[:max_length].rstrip()
         return TextContent(
             text=truncated_text,

src/domain/models/voice_settings.py CHANGED Viewed

@@ -8,74 +8,82 @@ import re
 @dataclass(frozen=True)
 class VoiceSettings:
     """Value object representing voice settings for text-to-speech synthesis."""
     voice_id: str
     speed: float
     language: str
     pitch: Optional[float] = None
     volume: Optional[float] = None
     def __post_init__(self):
         """Validate voice settings after initialization."""
         self._validate()
     def _validate(self):
         """Validate voice settings properties."""
         if not isinstance(self.voice_id, str):
             raise TypeError("Voice ID must be a string")
         if not self.voice_id.strip():
             raise ValueError("Voice ID cannot be empty")
         # Voice ID should be alphanumeric with possible underscores/hyphens
         if not re.match(r'^[a-zA-Z0-9_-]+$', self.voice_id):
             raise ValueError(f"Invalid voice ID format: {self.voice_id}. Must contain only letters, numbers, underscores, and hyphens")
         if not isinstance(self.speed, (int, float)):
             raise TypeError("Speed must be a number")
         if not 0.1 <= self.speed <= 3.0:
             raise ValueError(f"Speed must be between 0.1 and 3.0, got {self.speed}")
         if not isinstance(self.language, str):
             raise TypeError("Language must be a string")
         if not self.language.strip():
             raise ValueError("Language cannot be empty")
         # Validate language code format (ISO 639-1 or ISO 639-3)
         if not re.match(r'^[a-z]{2,3}(-[A-Z]{2})?$', self.language):
             raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")
         if self.pitch is not None:
             if not isinstance(self.pitch, (int, float)):
                 raise TypeError("Pitch must be a number")
             if not -2.0 <= self.pitch <= 2.0:
                 raise ValueError(f"Pitch must be between -2.0 and 2.0, got {self.pitch}")
         if self.volume is not None:
             if not isinstance(self.volume, (int, float)):
                 raise TypeError("Volume must be a number")
             if not 0.0 <= self.volume <= 2.0:
                 raise ValueError(f"Volume must be between 0.0 and 2.0, got {self.volume}")
     @property
     def is_default_speed(self) -> bool:
         """Check if speed is at default value (1.0)."""
         return abs(self.speed - 1.0) < 0.01
     @property
     def is_default_pitch(self) -> bool:
         """Check if pitch is at default value (0.0 or None)."""
         return self.pitch is None or abs(self.pitch) < 0.01
     @property
     def is_default_volume(self) -> bool:
         """Check if volume is at default value (1.0 or None)."""
         return self.volume is None or abs(self.volume - 1.0) < 0.01
     def with_speed(self, speed: float) -> 'VoiceSettings':
         """Create a new VoiceSettings with different speed."""
         return VoiceSettings(
@@ -83,9 +91,10 @@ class VoiceSettings:
             speed=speed,
             language=self.language,
             pitch=self.pitch,
-            volume=self.volume
         )
     def with_pitch(self, pitch: Optional[float]) -> 'VoiceSettings':
         """Create a new VoiceSettings with different pitch."""
         return VoiceSettings(
@@ -93,5 +102,17 @@ class VoiceSettings:
             speed=self.speed,
             language=self.language,
             pitch=pitch,
-            volume=self.volume
         )

 @dataclass(frozen=True)
 class VoiceSettings:
     """Value object representing voice settings for text-to-speech synthesis."""
     voice_id: str
     speed: float
     language: str
     pitch: Optional[float] = None
     volume: Optional[float] = None
+    audio_prompt_path: Optional[str] = None  # For voice cloning (e.g., Chatterbox)
     def __post_init__(self):
         """Validate voice settings after initialization."""
         self._validate()
     def _validate(self):
         """Validate voice settings properties."""
         if not isinstance(self.voice_id, str):
             raise TypeError("Voice ID must be a string")
         if not self.voice_id.strip():
             raise ValueError("Voice ID cannot be empty")
         # Voice ID should be alphanumeric with possible underscores/hyphens
         if not re.match(r'^[a-zA-Z0-9_-]+$', self.voice_id):
             raise ValueError(f"Invalid voice ID format: {self.voice_id}. Must contain only letters, numbers, underscores, and hyphens")
         if not isinstance(self.speed, (int, float)):
             raise TypeError("Speed must be a number")
         if not 0.1 <= self.speed <= 3.0:
             raise ValueError(f"Speed must be between 0.1 and 3.0, got {self.speed}")
         if not isinstance(self.language, str):
             raise TypeError("Language must be a string")
         if not self.language.strip():
             raise ValueError("Language cannot be empty")
         # Validate language code format (ISO 639-1 or ISO 639-3)
         if not re.match(r'^[a-z]{2,3}(-[A-Z]{2})?$', self.language):
             raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")
         if self.pitch is not None:
             if not isinstance(self.pitch, (int, float)):
                 raise TypeError("Pitch must be a number")
             if not -2.0 <= self.pitch <= 2.0:
                 raise ValueError(f"Pitch must be between -2.0 and 2.0, got {self.pitch}")
         if self.volume is not None:
             if not isinstance(self.volume, (int, float)):
                 raise TypeError("Volume must be a number")
             if not 0.0 <= self.volume <= 2.0:
                 raise ValueError(f"Volume must be between 0.0 and 2.0, got {self.volume}")
+        if self.audio_prompt_path is not None:
+            if not isinstance(self.audio_prompt_path, str):
+                raise TypeError("Audio prompt path must be a string")
+            if not self.audio_prompt_path.strip():
+                raise ValueError("Audio prompt path cannot be empty")
     @property
     def is_default_speed(self) -> bool:
         """Check if speed is at default value (1.0)."""
         return abs(self.speed - 1.0) < 0.01
     @property
     def is_default_pitch(self) -> bool:
         """Check if pitch is at default value (0.0 or None)."""
         return self.pitch is None or abs(self.pitch) < 0.01
     @property
     def is_default_volume(self) -> bool:
         """Check if volume is at default value (1.0 or None)."""
         return self.volume is None or abs(self.volume - 1.0) < 0.01
     def with_speed(self, speed: float) -> 'VoiceSettings':
         """Create a new VoiceSettings with different speed."""
         return VoiceSettings(
             speed=speed,
             language=self.language,
             pitch=self.pitch,
+            volume=self.volume,
+            audio_prompt_path=self.audio_prompt_path
         )
     def with_pitch(self, pitch: Optional[float]) -> 'VoiceSettings':
         """Create a new VoiceSettings with different pitch."""
         return VoiceSettings(
             speed=self.speed,
             language=self.language,
             pitch=pitch,
+            volume=self.volume,
+            audio_prompt_path=self.audio_prompt_path
+        )
+    def with_audio_prompt(self, audio_prompt_path: Optional[str]) -> 'VoiceSettings':
+        """Create a new VoiceSettings with different audio prompt path."""
+        return VoiceSettings(
+            voice_id=self.voice_id,
+            speed=self.speed,
+            language=self.language,
+            pitch=self.pitch,
+            volume=self.volume,
+            audio_prompt_path=audio_prompt_path
         )

src/infrastructure/tts/__init__.py CHANGED Viewed

@@ -19,10 +19,16 @@ try:
 except ImportError:
     CosyVoice2TTSProvider = None
 __all__ = [
     'TTSProviderFactory',
     'DummyTTSProvider',
     'KokoroTTSProvider',
-    'DiaTTSProvider',
-    'CosyVoice2TTSProvider'
 ]

 except ImportError:
     CosyVoice2TTSProvider = None
+try:
+    from .chatterbox_provider import ChatterboxTTSProvider
+except ImportError:
+    ChatterboxTTSProvider = None
 __all__ = [
     'TTSProviderFactory',
     'DummyTTSProvider',
     'KokoroTTSProvider',
+    'DiaTTSProvider',
+    'CosyVoice2TTSProvider',
+    'ChatterboxTTSProvider'
 ]

src/infrastructure/tts/chatterbox_provider.py ADDED Viewed

	@@ -0,0 +1,201 @@

+"""Chatterbox TTS provider implementation."""
+import logging
+import numpy as np
+import soundfile as sf
+import io
+from typing import Iterator, Optional, TYPE_CHECKING
+if TYPE_CHECKING:
+    from ...domain.models.speech_synthesis_request import SpeechSynthesisRequest
+from ..base.tts_provider_base import TTSProviderBase
+from ...domain.exceptions import SpeechSynthesisException
+logger = logging.getLogger(__name__)
+# Flag to track Chatterbox availability
+CHATTERBOX_AVAILABLE = False
+# Try to import Chatterbox
+try:
+    import torch
+    import torchaudio as ta
+    from chatterbox.tts import ChatterboxTTS
+    CHATTERBOX_AVAILABLE = True
+    logger.info("Chatterbox TTS engine is available")
+except ImportError as e:
+    logger.warning(f"Chatterbox TTS engine is not available: {e}")
+except Exception as e:
+    logger.error(f"Chatterbox import failed with unexpected error: {str(e)}")
+    CHATTERBOX_AVAILABLE = False
+class ChatterboxTTSProvider(TTSProviderBase):
+    """Chatterbox TTS provider implementation.
+
+    Wraps ``chatterbox.tts.ChatterboxTTS`` behind the ``TTSProviderBase``
+    interface. The model is loaded lazily the first time availability is
+    checked. Two voice ids are exposed: ``'default'`` (the base model voice)
+    and ``'custom'`` (voice cloning driven by
+    ``request.voice_settings.audio_prompt_path``).
+    """
+    def __init__(self, lang_code: str = 'en'):
+        """Initialize the Chatterbox TTS provider.
+
+        Args:
+            lang_code: Language code stored on the instance as ``lang_code``.
+        """
+        super().__init__(
+            provider_name="Chatterbox",
+            supported_languages=['en']  # Chatterbox primarily supports English
+        )
+        self.lang_code = lang_code
+        self.model = None
+        # ``torch`` is only bound when the guarded module-level import succeeded.
+        # Check the flag first so that constructing the provider on a machine
+        # without Chatterbox/torch does not raise NameError.
+        use_cuda = CHATTERBOX_AVAILABLE and torch.cuda.is_available()
+        self.device = "cuda" if use_cuda else "cpu"
+    def _ensure_model(self) -> bool:
+        """Load the model on first use.
+
+        Returns:
+            True when ``self.model`` is loaded, False otherwise. A failed load
+            is logged and leaves ``self.model`` as None, so the next call
+            retries.
+        """
+        if self.model is None and CHATTERBOX_AVAILABLE:
+            try:
+                logger.info(f"Loading Chatterbox model on device: {self.device}")
+                self.model = ChatterboxTTS.from_pretrained(device=self.device)
+                logger.info("Chatterbox model successfully loaded")
+            except Exception as e:
+                logger.error(f"Failed to initialize Chatterbox model: {str(e)}")
+                self.model = None
+        return self.model is not None
+    def is_available(self) -> bool:
+        """Check if Chatterbox TTS is available.
+
+        Note that this triggers the (lazy) model load as a side effect.
+        """
+        return CHATTERBOX_AVAILABLE and self._ensure_model()
+    def get_available_voices(self) -> list[str]:
+        """Get available voices for Chatterbox."""
+        # Chatterbox supports voice cloning with audio prompts
+        # Default voice is the base model voice
+        return ['default', 'custom']
+    @staticmethod
+    def _resolve_audio_prompt(request: 'SpeechSynthesisRequest') -> Optional[str]:
+        """Return the voice-cloning prompt path for *request*, or None for the default voice."""
+        voice_settings = request.voice_settings
+        if voice_settings.voice_id != 'custom':
+            return None
+        # getattr (rather than hasattr) so a settings object whose
+        # audio_prompt_path is None falls back to the default voice.
+        return getattr(voice_settings, 'audio_prompt_path', None)
+    @staticmethod
+    def _to_numpy(wav) -> np.ndarray:
+        """Convert a model output (tensor or array-like) to a numpy array of samples."""
+        if hasattr(wav, 'detach'):
+            # detach() first: .numpy() raises on tensors that require grad.
+            wav = wav.detach().cpu().numpy()
+        # Drop singleton dimensions so that len() counts samples and soundfile
+        # sees mono data instead of "1 frame x N channels".
+        # NOTE(review): assumes the model returns a (1, num_samples) waveform,
+        # as implied by the upstream torchaudio.save usage - confirm.
+        return np.atleast_1d(np.squeeze(np.asarray(wav)))
+    def _synthesize_waveform(self, text: str, audio_prompt_path: Optional[str] = None) -> tuple[np.ndarray, int]:
+        """Run the model once and return ``(samples, sample_rate)``.
+
+        The caller is responsible for checking ``is_available()`` first.
+        """
+        if audio_prompt_path:
+            # Use custom voice with audio prompt
+            wav = self.model.generate(text, audio_prompt_path=audio_prompt_path)
+        else:
+            # Use default voice
+            wav = self.model.generate(text)
+        # Sample rate is exposed by the model as ``sr``
+        return self._to_numpy(wav), self.model.sr
+    def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
+        """Generate audio using Chatterbox TTS.
+
+        Args:
+            request: Synthesis request; ``text_content.text`` and
+                ``voice_settings`` are read.
+        Returns:
+            tuple: (WAV-encoded audio bytes, sample_rate)
+        Raises:
+            SpeechSynthesisException: If the engine is not available.
+        """
+        if not self.is_available():
+            raise SpeechSynthesisException("Chatterbox TTS engine is not available")
+        try:
+            samples, sample_rate = self._synthesize_waveform(
+                request.text_content.text,
+                self._resolve_audio_prompt(request),
+            )
+            # Convert numpy array to bytes
+            audio_bytes = self._numpy_to_bytes(samples, sample_rate)
+            return audio_bytes, sample_rate
+        except Exception as e:
+            # NOTE(review): _handle_provider_error is defined on the base class
+            # (not visible here) and presumably raises; if it can return
+            # normally this method returns None - confirm.
+            self._handle_provider_error(e, "audio generation")
+    def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
+        """Generate audio stream using Chatterbox TTS.
+
+        Chatterbox doesn't natively support streaming, so the full audio is
+        generated first and then split into 1 second chunks, each encoded as
+        its own WAV payload.
+
+        Yields:
+            tuple: (audio_bytes, sample_rate, is_final)
+        Raises:
+            SpeechSynthesisException: If the engine is not available.
+        """
+        if not self.is_available():
+            raise SpeechSynthesisException("Chatterbox TTS engine is not available")
+        try:
+            samples, sample_rate = self._synthesize_waveform(
+                request.text_content.text,
+                self._resolve_audio_prompt(request),
+            )
+            # Split audio into chunks for streaming
+            chunk_size = int(sample_rate * 1.0)  # 1 second chunks
+            total_samples = len(samples)
+            for start_idx in range(0, total_samples, chunk_size):
+                end_idx = min(start_idx + chunk_size, total_samples)
+                # Convert chunk to bytes
+                audio_bytes = self._numpy_to_bytes(samples[start_idx:end_idx], sample_rate)
+                # The last chunk is the one that reaches the end of the audio
+                is_final = (end_idx >= total_samples)
+                yield audio_bytes, sample_rate, is_final
+        except Exception as e:
+            self._handle_provider_error(e, "streaming audio generation")
+    def _numpy_to_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
+        """Convert numpy audio array to WAV bytes.
+
+        The data is cast to float32 and scaled down only when its peak
+        amplitude exceeds 1.0.
+
+        Raises:
+            SpeechSynthesisException: If the conversion or encoding fails.
+        """
+        try:
+            # Ensure audio is in the right format
+            if audio_array.dtype != np.float32:
+                audio_array = audio_array.astype(np.float32)
+            # Normalize if needed (peak computed once)
+            peak = np.max(np.abs(audio_array))
+            if peak > 1.0:
+                audio_array = audio_array / peak
+            # Write audio data to an in-memory buffer as WAV
+            buffer = io.BytesIO()
+            sf.write(buffer, audio_array, sample_rate, format='WAV')
+            return buffer.getvalue()
+        except Exception as e:
+            raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
+    def generate_with_voice_prompt(self, text: str, audio_prompt_path: str) -> tuple[bytes, int]:
+        """
+        Generate audio with a custom voice prompt.
+        Args:
+            text: Text to synthesize
+            audio_prompt_path: Path to audio file for voice cloning
+        Returns:
+            tuple: (audio_bytes, sample_rate)
+        Raises:
+            SpeechSynthesisException: If the engine is not available.
+        """
+        if not self.is_available():
+            raise SpeechSynthesisException("Chatterbox TTS engine is not available")
+        try:
+            samples, sample_rate = self._synthesize_waveform(text, audio_prompt_path)
+            audio_bytes = self._numpy_to_bytes(samples, sample_rate)
+            return audio_bytes, sample_rate
+        except Exception as e:
+            self._handle_provider_error(e, "voice prompt audio generation")

src/infrastructure/tts/provider_factory.py CHANGED Viewed

@@ -58,6 +58,14 @@ class TTSProviderFactory:
         except ImportError as e:
             logger.info(f"CosyVoice2 TTS provider not available: {e}")
     def get_available_providers(self) -> List[str]:
         """Get list of available TTS providers."""
         logger.info("🔍 Checking availability of TTS providers...")
@@ -76,6 +84,8 @@ class TTSProviderFactory:
                         self._provider_instances[name] = provider_class()
                     elif name == 'cosyvoice2':
                         self._provider_instances[name] = provider_class()
                     else:
                         self._provider_instances[name] = provider_class()
@@ -124,8 +134,8 @@ class TTSProviderFactory:
             provider_class = self._providers[provider_name]
             # Create instance with appropriate parameters
-            if provider_name in ['kokoro', 'dia', 'cosyvoice2']:
-                lang_code = kwargs.get('lang_code', 'z')
                 provider = provider_class(lang_code=lang_code)
             else:
                 provider = provider_class(**kwargs)
@@ -156,7 +166,7 @@ class TTSProviderFactory:
             SpeechSynthesisException: If no providers are available
         """
         if preferred_providers is None:
-            preferred_providers = ['kokoro', 'dia', 'cosyvoice2', 'dummy']
         logger.info(f"🔄 Getting TTS provider with fallback, preferred order: {preferred_providers}")
         available_providers = self.get_available_providers()
@@ -204,7 +214,7 @@ class TTSProviderFactory:
             # Create instance if not cached
             if provider_name not in self._provider_instances:
                 provider_class = self._providers[provider_name]
-                if provider_name in ['kokoro', 'dia', 'cosyvoice2']:
                     self._provider_instances[provider_name] = provider_class()
                 else:
                     self._provider_instances[provider_name] = provider_class()

         except ImportError as e:
             logger.info(f"CosyVoice2 TTS provider not available: {e}")
+        # Try to register Chatterbox provider
+        try:
+            from .chatterbox_provider import ChatterboxTTSProvider
+            self._providers['chatterbox'] = ChatterboxTTSProvider
+            logger.info("Registered Chatterbox TTS provider")
+        except ImportError as e:
+            logger.info(f"Chatterbox TTS provider not available: {e}")
     def get_available_providers(self) -> List[str]:
         """Get list of available TTS providers."""
         logger.info("🔍 Checking availability of TTS providers...")
                         self._provider_instances[name] = provider_class()
                     elif name == 'cosyvoice2':
                         self._provider_instances[name] = provider_class()
+                    elif name == 'chatterbox':
+                        self._provider_instances[name] = provider_class()
                     else:
                         self._provider_instances[name] = provider_class()
             provider_class = self._providers[provider_name]
             # Create instance with appropriate parameters
+            if provider_name in ['kokoro', 'dia', 'cosyvoice2', 'chatterbox']:
+                lang_code = kwargs.get('lang_code', 'en' if provider_name == 'chatterbox' else 'z')
                 provider = provider_class(lang_code=lang_code)
             else:
                 provider = provider_class(**kwargs)
             SpeechSynthesisException: If no providers are available
         """
         if preferred_providers is None:
+            preferred_providers = ['kokoro', 'dia', 'cosyvoice2', 'chatterbox', 'dummy']
         logger.info(f"🔄 Getting TTS provider with fallback, preferred order: {preferred_providers}")
         available_providers = self.get_available_providers()
             # Create instance if not cached
             if provider_name not in self._provider_instances:
                 provider_class = self._providers[provider_name]
+                if provider_name in ['kokoro', 'dia', 'cosyvoice2', 'chatterbox']:
                     self._provider_instances[provider_name] = provider_class()
                 else:
                     self._provider_instances[provider_name] = provider_class()