Michael Hu committed on
Commit
0c2d9e7
·
1 Parent(s): 74466cd

attempt to fix cosyvoice2 tts

Browse files
src/application/error_handling/error_mapper.py CHANGED
@@ -202,6 +202,19 @@ class ErrorMapper:
202
  "Retry the operation",
203
  "Check system load and try again later"
204
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  )
206
  }
207
 
 
202
  "Retry the operation",
203
  "Check system load and try again later"
204
  ]
205
+ ),
206
+
207
+ # Type errors
208
+ TypeError: ErrorMapping(
209
+ user_message="Invalid data type provided. This is likely a configuration or implementation issue.",
210
+ error_code="TYPE_ERROR",
211
+ severity=ErrorSeverity.HIGH,
212
+ category=ErrorCategory.SYSTEM,
213
+ recovery_suggestions=[
214
+ "Retry the operation",
215
+ "Try a different voice or model",
216
+ "Contact support if the issue persists"
217
+ ]
218
  )
219
  }
220
 
src/application/services/audio_processing_service.py CHANGED
@@ -53,7 +53,7 @@ class AudioProcessingApplicationService:
53
  """
54
  try:
55
  logger.info("Initializing AudioProcessingApplicationService...")
56
-
57
  self._container = container
58
  self._config = config or container.resolve(AppConfig)
59
  self._temp_files: Dict[str, str] = {} # Track temporary files for cleanup
@@ -66,7 +66,7 @@ class AudioProcessingApplicationService:
66
  # self._setup_logging()
67
 
68
  logger.info("AudioProcessingApplicationService initialized successfully")
69
-
70
  except Exception as e:
71
  print(f"Error: Failed to initialize AudioProcessingApplicationService: {e}")
72
  raise
@@ -520,32 +520,42 @@ class AudioProcessingApplicationService:
520
  """
521
  try:
522
  logger.info(
523
- f"Starting TTS with voice: {voice}, speed: {speed} "
524
  f"[correlation_id={correlation_id}]"
525
  )
 
526
 
527
  # Get TTS provider from container
 
528
  tts_provider = self._container.get_tts_provider(voice)
 
529
 
530
  # Create voice settings
 
531
  voice_settings = VoiceSettings(
532
  voice_id=voice,
533
  speed=speed,
534
  language=language
535
  )
 
536
 
537
  # Create synthesis request
 
538
  synthesis_request = SpeechSynthesisRequest(
539
- text=text.text,
540
  voice_settings=voice_settings
541
  )
 
542
 
543
  # Perform synthesis
 
544
  audio_content = tts_provider.synthesize(synthesis_request)
 
545
 
546
  # Save output to file
547
  output_filename = f"output_{correlation_id}.{audio_content.format}"
548
  output_path = os.path.join(temp_dir, output_filename)
 
549
 
550
  with open(output_path, 'wb') as f:
551
  f.write(audio_content.data)
@@ -561,7 +571,7 @@ class AudioProcessingApplicationService:
561
  return output_path
562
 
563
  except Exception as e:
564
- logger.error(f"TTS failed: {e} [correlation_id={correlation_id}]")
565
  raise SpeechSynthesisException(f"Speech synthesis failed: {str(e)}")
566
 
567
  def _get_error_code_from_exception(self, exception: Exception) -> str:
@@ -792,11 +802,23 @@ class AudioProcessingApplicationService:
792
  component="AudioProcessingApplicationService"
793
  )
794
 
 
 
 
 
795
  def tts_operation():
796
- return self._perform_speech_synthesis(text, voice, speed, language, temp_dir, correlation_id)
 
 
 
 
 
 
 
797
 
798
  try:
799
  # Try with circuit breaker protection
 
800
  return self._recovery_manager.execute_with_circuit_breaker(
801
  tts_operation,
802
  f"tts_{voice}",
@@ -805,6 +827,8 @@ class AudioProcessingApplicationService:
805
  )
806
 
807
  except Exception as e:
 
 
808
  # Try fallback TTS providers
809
  tts_config = self._config.get_tts_config()
810
  fallback_voices = [v for v in tts_config['preferred_providers'] if v != voice]
@@ -829,4 +853,5 @@ class AudioProcessingApplicationService:
829
  correlation_id
830
  )
831
  else:
 
832
  raise
 
53
  """
54
  try:
55
  logger.info("Initializing AudioProcessingApplicationService...")
56
+
57
  self._container = container
58
  self._config = config or container.resolve(AppConfig)
59
  self._temp_files: Dict[str, str] = {} # Track temporary files for cleanup
 
66
  # self._setup_logging()
67
 
68
  logger.info("AudioProcessingApplicationService initialized successfully")
69
+
70
  except Exception as e:
71
  print(f"Error: Failed to initialize AudioProcessingApplicationService: {e}")
72
  raise
 
520
  """
521
  try:
522
  logger.info(
523
+ f"Starting TTS with voice: {voice}, speed: {speed}, language: {language} "
524
  f"[correlation_id={correlation_id}]"
525
  )
526
+ logger.info(f"Text to synthesize length: {len(text.text)} characters")
527
 
528
  # Get TTS provider from container
529
+ logger.info(f"Getting TTS provider for voice: {voice}")
530
  tts_provider = self._container.get_tts_provider(voice)
531
+ logger.info(f"TTS provider obtained: {tts_provider.__class__.__name__}")
532
 
533
  # Create voice settings
534
+ logger.info("Creating voice settings")
535
  voice_settings = VoiceSettings(
536
  voice_id=voice,
537
  speed=speed,
538
  language=language
539
  )
540
+ logger.info(f"Voice settings created: {voice_settings}")
541
 
542
  # Create synthesis request
543
+ logger.info("Creating synthesis request")
544
  synthesis_request = SpeechSynthesisRequest(
545
+ text_content=text, # text is already a TextContent object
546
  voice_settings=voice_settings
547
  )
548
+ logger.info("Synthesis request created successfully")
549
 
550
  # Perform synthesis
551
+ logger.info("Starting TTS synthesis")
552
  audio_content = tts_provider.synthesize(synthesis_request)
553
+ logger.info(f"TTS synthesis completed, audio format: {audio_content.format}, data length: {len(audio_content.data)}")
554
 
555
  # Save output to file
556
  output_filename = f"output_{correlation_id}.{audio_content.format}"
557
  output_path = os.path.join(temp_dir, output_filename)
558
+ logger.info(f"Saving audio to: {output_path}")
559
 
560
  with open(output_path, 'wb') as f:
561
  f.write(audio_content.data)
 
571
  return output_path
572
 
573
  except Exception as e:
574
+ logger.error(f"TTS failed: {e} [correlation_id={correlation_id}]", exc_info=True)
575
  raise SpeechSynthesisException(f"Speech synthesis failed: {str(e)}")
576
 
577
  def _get_error_code_from_exception(self, exception: Exception) -> str:
 
802
  component="AudioProcessingApplicationService"
803
  )
804
 
805
+ logger.info(f"Starting TTS synthesis with recovery [correlation_id={correlation_id}]")
806
+ logger.info(f"Parameters: voice={voice}, speed={speed}, language={language}")
807
+ logger.info(f"Text type: {type(text)}, Text content type: {type(text.text) if hasattr(text, 'text') else 'N/A'}")
808
+
809
  def tts_operation():
810
+ logger.info(f"Executing TTS operation [correlation_id={correlation_id}]")
811
+ try:
812
+ result = self._perform_speech_synthesis(text, voice, speed, language, temp_dir, correlation_id)
813
+ logger.info(f"TTS operation completed successfully [correlation_id={correlation_id}]")
814
+ return result
815
+ except Exception as e:
816
+ logger.error(f"TTS operation failed: {str(e)} [correlation_id={correlation_id}]", exc_info=True)
817
+ raise
818
 
819
  try:
820
  # Try with circuit breaker protection
821
+ logger.info(f"Attempting TTS with circuit breaker [correlation_id={correlation_id}]")
822
  return self._recovery_manager.execute_with_circuit_breaker(
823
  tts_operation,
824
  f"tts_{voice}",
 
827
  )
828
 
829
  except Exception as e:
830
+ logger.error(f"Primary TTS failed, trying fallbacks: {str(e)} [correlation_id={correlation_id}]", exc_info=True)
831
+
832
  # Try fallback TTS providers
833
  tts_config = self._config.get_tts_config()
834
  fallback_voices = [v for v in tts_config['preferred_providers'] if v != voice]
 
853
  correlation_id
854
  )
855
  else:
856
+ logger.error(f"No fallback voices available [correlation_id={correlation_id}]")
857
  raise
src/infrastructure/tts/cosyvoice2_provider.py CHANGED
@@ -21,15 +21,17 @@ DEFAULT_SAMPLE_RATE = 24000
21
  # Try to import CosyVoice2 dependencies
22
  try:
23
  import torch
24
- # Import CosyVoice2 - assuming it's installed and has a similar API to Dia
25
- # since they're both from nari-labs according to the GitHub link
26
- from cosyvoice2.model import CosyVoice2
 
27
  COSYVOICE2_AVAILABLE = True
28
  logger.info("CosyVoice2 TTS engine is available")
29
- except ImportError:
30
- logger.warning("CosyVoice2 TTS engine is not available")
 
31
  except ModuleNotFoundError as e:
32
- logger.warning(f"CosyVoice2 TTS engine is not available: {str(e)}")
33
  COSYVOICE2_AVAILABLE = False
34
 
35
 
@@ -49,20 +51,28 @@ class CosyVoice2TTSProvider(TTSProviderBase):
49
  """Ensure the model is loaded."""
50
  if self.model is None and COSYVOICE2_AVAILABLE:
51
  try:
 
52
  import torch
53
- from cosyvoice2.model import CosyVoice2
54
- self.model = CosyVoice2.from_pretrained()
 
 
 
 
55
  logger.info("CosyVoice2 model successfully loaded")
56
  except ImportError as e:
57
- logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
58
  self.model = None
59
  except FileNotFoundError as e:
60
- logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
61
  self.model = None
62
  except Exception as e:
63
- logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}")
64
  self.model = None
65
- return self.model is not None
 
 
 
66
 
67
  def is_available(self) -> bool:
68
  """Check if CosyVoice2 TTS is available."""
@@ -75,36 +85,66 @@ class CosyVoice2TTSProvider(TTSProviderBase):
75
 
76
  def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
77
  """Generate audio using CosyVoice2 TTS."""
 
 
78
  if not self.is_available():
 
79
  raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
80
 
81
  try:
82
  import torch
83
-
84
  # Extract parameters from request
85
  text = request.text_content.text
 
 
86
 
87
  # Generate audio using CosyVoice2
88
- with torch.inference_mode():
89
- # Assuming CosyVoice2 has a similar API to Dia
90
- output_audio_np = self.model.generate(
91
- text,
92
- max_tokens=None,
93
- cfg_scale=3.0,
94
- temperature=1.3,
95
- top_p=0.95,
96
- use_torch_compile=False,
97
- verbose=False
98
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  if output_audio_np is None:
 
101
  raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
102
 
 
 
103
  # Convert numpy array to bytes
 
104
  audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
 
 
105
  return audio_bytes, DEFAULT_SAMPLE_RATE
106
 
107
  except Exception as e:
 
108
  self._handle_provider_error(e, "audio generation")
109
 
110
  def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
@@ -114,22 +154,30 @@ class CosyVoice2TTSProvider(TTSProviderBase):
114
 
115
  try:
116
  import torch
117
-
118
  # Extract parameters from request
119
  text = request.text_content.text
120
 
121
  # Generate audio using CosyVoice2
122
- with torch.inference_mode():
123
- # Assuming CosyVoice2 has a similar API to Dia
124
- output_audio_np = self.model.generate(
125
- text,
126
- max_tokens=None,
127
- cfg_scale=3.0,
128
- temperature=1.3,
129
- top_p=0.95,
130
- use_torch_compile=False,
131
- verbose=False
132
- )
 
 
 
 
 
 
 
 
133
 
134
  if output_audio_np is None:
135
  raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
@@ -147,13 +195,13 @@ class CosyVoice2TTSProvider(TTSProviderBase):
147
  try:
148
  # Create an in-memory buffer
149
  buffer = io.BytesIO()
150
-
151
  # Write audio data to buffer as WAV
152
  sf.write(buffer, audio_array, sample_rate, format='WAV')
153
-
154
  # Get bytes from buffer
155
  buffer.seek(0)
156
  return buffer.read()
157
-
158
  except Exception as e:
159
  raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
 
21
  # Try to import CosyVoice2 dependencies
22
  try:
23
  import torch
24
+ import torchaudio
25
+ # Import CosyVoice2 from the correct package
26
+ # Based on https://github.com/FunAudioLLM/CosyVoice
27
+ from cosyvoice.cli.cosyvoice import CosyVoice
28
  COSYVOICE2_AVAILABLE = True
29
  logger.info("CosyVoice2 TTS engine is available")
30
+ except ImportError as e:
31
+ logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
32
+ COSYVOICE2_AVAILABLE = False
33
  except ModuleNotFoundError as e:
34
+ logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
35
  COSYVOICE2_AVAILABLE = False
36
 
37
 
 
51
  """Ensure the model is loaded."""
52
  if self.model is None and COSYVOICE2_AVAILABLE:
53
  try:
54
+ logger.info("Loading CosyVoice2 model...")
55
  import torch
56
+ import torchaudio
57
+ from cosyvoice.cli.cosyvoice import CosyVoice
58
+
59
+ # Initialize CosyVoice with the correct model path
60
+ # You may need to adjust the model path based on your installation
61
+ self.model = CosyVoice('pretrained_models/CosyVoice-300M')
62
  logger.info("CosyVoice2 model successfully loaded")
63
  except ImportError as e:
64
+ logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}", exc_info=True)
65
  self.model = None
66
  except FileNotFoundError as e:
67
+ logger.error(f"Failed to load CosyVoice2 model files: {str(e)}", exc_info=True)
68
  self.model = None
69
  except Exception as e:
70
+ logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}", exc_info=True)
71
  self.model = None
72
+
73
+ model_available = self.model is not None
74
+ logger.info(f"CosyVoice2 model availability check: {model_available}")
75
+ return model_available
76
 
77
  def is_available(self) -> bool:
78
  """Check if CosyVoice2 TTS is available."""
 
85
 
86
  def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
87
  """Generate audio using CosyVoice2 TTS."""
88
+ logger.info("Starting CosyVoice2 audio generation")
89
+
90
  if not self.is_available():
91
+ logger.error("CosyVoice2 TTS engine is not available")
92
  raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
93
 
94
  try:
95
  import torch
96
+
97
  # Extract parameters from request
98
  text = request.text_content.text
99
+ logger.info(f"CosyVoice2 generating audio for text length: {len(text)}")
100
+ logger.info(f"Voice settings: voice_id={request.voice_settings.voice_id}, speed={request.voice_settings.speed}")
101
 
102
  # Generate audio using CosyVoice2
103
+ logger.info("Starting CosyVoice2 model inference")
104
+
105
+ # CosyVoice API - using inference method
106
+ # The model expects text and returns audio tensor
107
+ try:
108
+ # Use the inference method from CosyVoice
109
+ output_audio_tensor = self.model.inference_sft(text, '中文女')
110
+
111
+ # Convert tensor to numpy array
112
+ if isinstance(output_audio_tensor, torch.Tensor):
113
+ output_audio_np = output_audio_tensor.cpu().numpy()
114
+ else:
115
+ output_audio_np = output_audio_tensor
116
+
117
+ logger.info("CosyVoice2 model inference completed")
118
+ except Exception as api_error:
119
+ logger.error(f"CosyVoice2 API error: {str(api_error)}")
120
+ # Try alternative API if the first one fails
121
+ try:
122
+ logger.info("Trying alternative CosyVoice2 API")
123
+ output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
124
+ if isinstance(output_audio_tensor, torch.Tensor):
125
+ output_audio_np = output_audio_tensor.cpu().numpy()
126
+ else:
127
+ output_audio_np = output_audio_tensor
128
+ logger.info("CosyVoice2 alternative API succeeded")
129
+ except Exception as alt_error:
130
+ logger.error(f"CosyVoice2 alternative API also failed: {str(alt_error)}")
131
+ raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
132
 
133
  if output_audio_np is None:
134
+ logger.error("CosyVoice2 model returned None for audio output")
135
  raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
136
 
137
+ logger.info(f"CosyVoice2 generated audio array shape: {output_audio_np.shape if hasattr(output_audio_np, 'shape') else 'unknown'}")
138
+
139
  # Convert numpy array to bytes
140
+ logger.info("Converting CosyVoice2 audio to bytes")
141
  audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
142
+ logger.info(f"CosyVoice2 audio conversion completed, bytes length: {len(audio_bytes)}")
143
+
144
  return audio_bytes, DEFAULT_SAMPLE_RATE
145
 
146
  except Exception as e:
147
+ logger.error(f"CosyVoice2 audio generation failed: {str(e)}", exc_info=True)
148
  self._handle_provider_error(e, "audio generation")
149
 
150
  def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
 
154
 
155
  try:
156
  import torch
157
+
158
  # Extract parameters from request
159
  text = request.text_content.text
160
 
161
  # Generate audio using CosyVoice2
162
+ try:
163
+ # Use the inference method from CosyVoice
164
+ output_audio_tensor = self.model.inference_sft(text, '中文女')
165
+
166
+ # Convert tensor to numpy array
167
+ if isinstance(output_audio_tensor, torch.Tensor):
168
+ output_audio_np = output_audio_tensor.cpu().numpy()
169
+ else:
170
+ output_audio_np = output_audio_tensor
171
+ except Exception as api_error:
172
+ # Try alternative API if the first one fails
173
+ try:
174
+ output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
175
+ if isinstance(output_audio_tensor, torch.Tensor):
176
+ output_audio_np = output_audio_tensor.cpu().numpy()
177
+ else:
178
+ output_audio_np = output_audio_tensor
179
+ except Exception as alt_error:
180
+ raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
181
 
182
  if output_audio_np is None:
183
  raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
 
195
  try:
196
  # Create an in-memory buffer
197
  buffer = io.BytesIO()
198
+
199
  # Write audio data to buffer as WAV
200
  sf.write(buffer, audio_array, sample_rate, format='WAV')
201
+
202
  # Get bytes from buffer
203
  buffer.seek(0)
204
  return buffer.read()
205
+
206
  except Exception as e:
207
  raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e
utils/tts_cosyvoice2.py CHANGED
@@ -15,34 +15,37 @@ DEFAULT_SAMPLE_RATE = 24000
15
  # Try to import CosyVoice2 dependencies
16
  try:
17
  import torch
18
- # Import CosyVoice2 - assuming it's installed and has a similar API to Dia
19
- # since they're both from nari-labs according to the GitHub link
20
- from cosyvoice2.model import CosyVoice2
 
21
  COSYVOICE2_AVAILABLE = True
22
  logger.info("CosyVoice2 TTS engine is available")
23
- except ImportError:
24
- logger.warning("CosyVoice2 TTS engine is not available")
 
25
  except ModuleNotFoundError as e:
26
- logger.warning(f"CosyVoice2 TTS engine is not available: {str(e)}")
27
  COSYVOICE2_AVAILABLE = False
28
 
29
 
30
  def _get_model():
31
  """Lazy-load the CosyVoice2 model
32
-
33
  Returns:
34
  CosyVoice2 or None: The CosyVoice2 model or None if not available
35
  """
36
  if not COSYVOICE2_AVAILABLE:
37
  logger.warning("CosyVoice2 TTS engine is not available")
38
  return None
39
-
40
  try:
41
  import torch
42
- from cosyvoice2.model import CosyVoice2
43
-
44
- # Initialize the model
45
- model = CosyVoice2.from_pretrained()
 
46
  logger.info("CosyVoice2 model successfully loaded")
47
  return model
48
  except ImportError as e:
@@ -58,72 +61,81 @@ def _get_model():
58
 
59
  class CosyVoice2TTS(TTSBase):
60
  """CosyVoice2 TTS engine implementation
61
-
62
  This engine uses the CosyVoice2 model for TTS generation.
63
  """
64
-
65
  def __init__(self, lang_code: str = 'z'):
66
  """Initialize the CosyVoice2 TTS engine
67
-
68
  Args:
69
  lang_code (str): Language code for the engine
70
  """
71
  super().__init__(lang_code)
72
  self.model = None
73
-
74
  def _ensure_model(self):
75
  """Ensure the model is loaded
76
-
77
  Returns:
78
  bool: True if model is available, False otherwise
79
  """
80
  if self.model is None:
81
  self.model = _get_model()
82
-
83
  return self.model is not None
84
-
85
  def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
86
  """Generate speech using CosyVoice2 TTS engine
87
-
88
  Args:
89
  text (str): Input text to synthesize
90
  voice (str): Voice ID (may not be used in CosyVoice2)
91
  speed (float): Speech speed multiplier (may not be used in CosyVoice2)
92
-
93
  Returns:
94
  Optional[str]: Path to the generated audio file or None if generation fails
95
  """
96
  logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
97
-
98
  # Check if CosyVoice2 is available
99
  if not COSYVOICE2_AVAILABLE:
100
  logger.error("CosyVoice2 TTS engine is not available")
101
  return None
102
-
103
  # Ensure model is loaded
104
  if not self._ensure_model():
105
  logger.error("Failed to load CosyVoice2 model")
106
  return None
107
-
108
  try:
109
  import torch
110
-
111
  # Generate unique output path
112
  output_path = self._generate_output_path(prefix="cosyvoice2")
113
-
114
- # Generate audio
115
- with torch.inference_mode():
116
- # Assuming CosyVoice2 has a similar API to Dia
117
- output_audio_np = self.model.generate(
118
- text,
119
- max_tokens=None,
120
- cfg_scale=3.0,
121
- temperature=1.3,
122
- top_p=0.95,
123
- use_torch_compile=False,
124
- verbose=False
125
- )
126
-
 
 
 
 
 
 
 
 
 
127
  if output_audio_np is not None:
128
  logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
129
  sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
@@ -132,57 +144,66 @@ class CosyVoice2TTS(TTSBase):
132
  else:
133
  logger.error("CosyVoice2 model returned None for audio output")
134
  return None
135
-
136
  except Exception as e:
137
  logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
138
  return None
139
-
140
  def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
141
  """Generate speech stream using CosyVoice2 TTS engine
142
-
143
  Args:
144
  text (str): Input text to synthesize
145
  voice (str): Voice ID (may not be used in CosyVoice2)
146
  speed (float): Speech speed multiplier (may not be used in CosyVoice2)
147
-
148
  Yields:
149
  tuple: (sample_rate, audio_data) pairs for each segment
150
  """
151
  logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
152
-
153
  # Check if CosyVoice2 is available
154
  if not COSYVOICE2_AVAILABLE:
155
  logger.error("CosyVoice2 TTS engine is not available")
156
  return
157
-
158
  # Ensure model is loaded
159
  if not self._ensure_model():
160
  logger.error("Failed to load CosyVoice2 model")
161
  return
162
-
163
  try:
164
  import torch
165
-
166
- # Generate audio
167
- with torch.inference_mode():
168
- # Assuming CosyVoice2 has a similar API to Dia
169
- output_audio_np = self.model.generate(
170
- text,
171
- max_tokens=None,
172
- cfg_scale=3.0,
173
- temperature=1.3,
174
- top_p=0.95,
175
- use_torch_compile=False,
176
- verbose=False
177
- )
178
-
 
 
 
 
 
 
 
 
 
179
  if output_audio_np is not None:
180
  logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
181
  yield DEFAULT_SAMPLE_RATE, output_audio_np
182
  else:
183
  logger.error("CosyVoice2 model returned None for audio output")
184
  return
185
-
186
  except Exception as e:
187
  logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
188
  return
 
15
  # Try to import CosyVoice2 dependencies
16
  try:
17
  import torch
18
+ import torchaudio
19
+ # Import CosyVoice2 from the correct package
20
+ # Based on https://github.com/FunAudioLLM/CosyVoice
21
+ from cosyvoice.cli.cosyvoice import CosyVoice
22
  COSYVOICE2_AVAILABLE = True
23
  logger.info("CosyVoice2 TTS engine is available")
24
+ except ImportError as e:
25
+ logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
26
+ COSYVOICE2_AVAILABLE = False
27
  except ModuleNotFoundError as e:
28
+ logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
29
  COSYVOICE2_AVAILABLE = False
30
 
31
 
32
  def _get_model():
33
  """Lazy-load the CosyVoice2 model
34
+
35
  Returns:
36
  CosyVoice2 or None: The CosyVoice2 model or None if not available
37
  """
38
  if not COSYVOICE2_AVAILABLE:
39
  logger.warning("CosyVoice2 TTS engine is not available")
40
  return None
41
+
42
  try:
43
  import torch
44
+ import torchaudio
45
+ from cosyvoice.cli.cosyvoice import CosyVoice
46
+
47
+ # Initialize the model with correct path
48
+ model = CosyVoice('pretrained_models/CosyVoice-300M')
49
  logger.info("CosyVoice2 model successfully loaded")
50
  return model
51
  except ImportError as e:
 
61
 
62
  class CosyVoice2TTS(TTSBase):
63
  """CosyVoice2 TTS engine implementation
64
+
65
  This engine uses the CosyVoice2 model for TTS generation.
66
  """
67
+
68
  def __init__(self, lang_code: str = 'z'):
69
  """Initialize the CosyVoice2 TTS engine
70
+
71
  Args:
72
  lang_code (str): Language code for the engine
73
  """
74
  super().__init__(lang_code)
75
  self.model = None
76
+
77
  def _ensure_model(self):
78
  """Ensure the model is loaded
79
+
80
  Returns:
81
  bool: True if model is available, False otherwise
82
  """
83
  if self.model is None:
84
  self.model = _get_model()
85
+
86
  return self.model is not None
87
+
88
  def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
89
  """Generate speech using CosyVoice2 TTS engine
90
+
91
  Args:
92
  text (str): Input text to synthesize
93
  voice (str): Voice ID (may not be used in CosyVoice2)
94
  speed (float): Speech speed multiplier (may not be used in CosyVoice2)
95
+
96
  Returns:
97
  Optional[str]: Path to the generated audio file or None if generation fails
98
  """
99
  logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
100
+
101
  # Check if CosyVoice2 is available
102
  if not COSYVOICE2_AVAILABLE:
103
  logger.error("CosyVoice2 TTS engine is not available")
104
  return None
105
+
106
  # Ensure model is loaded
107
  if not self._ensure_model():
108
  logger.error("Failed to load CosyVoice2 model")
109
  return None
110
+
111
  try:
112
  import torch
113
+
114
  # Generate unique output path
115
  output_path = self._generate_output_path(prefix="cosyvoice2")
116
+
117
+ # Generate audio using CosyVoice2
118
+ try:
119
+ # Use the inference method from CosyVoice
120
+ output_audio_tensor = self.model.inference_sft(text, '中文女')
121
+
122
+ # Convert tensor to numpy array
123
+ if isinstance(output_audio_tensor, torch.Tensor):
124
+ output_audio_np = output_audio_tensor.cpu().numpy()
125
+ else:
126
+ output_audio_np = output_audio_tensor
127
+ except Exception as api_error:
128
+ # Try alternative API if the first one fails
129
+ try:
130
+ output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
131
+ if isinstance(output_audio_tensor, torch.Tensor):
132
+ output_audio_np = output_audio_tensor.cpu().numpy()
133
+ else:
134
+ output_audio_np = output_audio_tensor
135
+ except Exception as alt_error:
136
+ logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
137
+ return None
138
+
139
  if output_audio_np is not None:
140
  logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
141
  sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
 
144
  else:
145
  logger.error("CosyVoice2 model returned None for audio output")
146
  return None
147
+
148
  except Exception as e:
149
  logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
150
  return None
151
+
152
  def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
153
  """Generate speech stream using CosyVoice2 TTS engine
154
+
155
  Args:
156
  text (str): Input text to synthesize
157
  voice (str): Voice ID (may not be used in CosyVoice2)
158
  speed (float): Speech speed multiplier (may not be used in CosyVoice2)
159
+
160
  Yields:
161
  tuple: (sample_rate, audio_data) pairs for each segment
162
  """
163
  logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
164
+
165
  # Check if CosyVoice2 is available
166
  if not COSYVOICE2_AVAILABLE:
167
  logger.error("CosyVoice2 TTS engine is not available")
168
  return
169
+
170
  # Ensure model is loaded
171
  if not self._ensure_model():
172
  logger.error("Failed to load CosyVoice2 model")
173
  return
174
+
175
  try:
176
  import torch
177
+
178
+ # Generate audio using CosyVoice2
179
+ try:
180
+ # Use the inference method from CosyVoice
181
+ output_audio_tensor = self.model.inference_sft(text, '中文女')
182
+
183
+ # Convert tensor to numpy array
184
+ if isinstance(output_audio_tensor, torch.Tensor):
185
+ output_audio_np = output_audio_tensor.cpu().numpy()
186
+ else:
187
+ output_audio_np = output_audio_tensor
188
+ except Exception as api_error:
189
+ # Try alternative API if the first one fails
190
+ try:
191
+ output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
192
+ if isinstance(output_audio_tensor, torch.Tensor):
193
+ output_audio_np = output_audio_tensor.cpu().numpy()
194
+ else:
195
+ output_audio_np = output_audio_tensor
196
+ except Exception as alt_error:
197
+ logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
198
+ return
199
+
200
  if output_audio_np is not None:
201
  logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
202
  yield DEFAULT_SAMPLE_RATE, output_audio_np
203
  else:
204
  logger.error("CosyVoice2 model returned None for audio output")
205
  return
206
+
207
  except Exception as e:
208
  logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
209
  return