import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Configure logging
logger = logging.getLogger(__name__)

# Flag to track CosyVoice2 availability
COSYVOICE2_AVAILABLE = False
DEFAULT_SAMPLE_RATE = 24000
# Try to import CosyVoice2 dependencies
try:
    import torch
    import torchaudio
    # Import CosyVoice from the correct package
    # Based on https://github.com/FunAudioLLM/CosyVoice
    from cosyvoice.cli.cosyvoice import CosyVoice
    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")
except ImportError as e:
    # ModuleNotFoundError is a subclass of ImportError, so this handler covers both
    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {e}")
    COSYVOICE2_AVAILABLE = False
def _get_model():
    """Lazy-load the CosyVoice2 model.

    Returns:
        CosyVoice or None: The loaded model, or None if it is not available.
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None
    try:
        import torch
        import torchaudio
        from cosyvoice.cli.cosyvoice import CosyVoice

        # Initialize the model from the local pretrained checkpoint directory
        model = CosyVoice('pretrained_models/CosyVoice-300M')
        logger.info("CosyVoice2 model successfully loaded")
        return model
    except ImportError as e:
        logger.error(f"Failed to import CosyVoice2 dependencies: {e}")
        return None
    except FileNotFoundError as e:
        logger.error(f"Failed to load CosyVoice2 model files: {e}")
        return None
    except Exception as e:
        logger.error(f"Failed to initialize CosyVoice2 model: {e}")
        return None
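
# Helper to normalize CosyVoice inference output. This is a hedged sketch: depending on
# the installed CosyVoice version, inference_sft()/inference_zero_shot() may return a
# dict of the form {'tts_speech': tensor}, a generator yielding such dicts (streaming
# API), or a raw tensor. The 'tts_speech' key follows the upstream CosyVoice examples;
# treat the dict/generator handling here as an assumption to verify against the
# installed release.
def _to_numpy_audio(output):
    """Convert a CosyVoice inference result to a 1-D numpy audio array, or None."""
    import torch
    import types

    def _tensor_to_np(t):
        # Tensors come back shaped (1, num_samples); squeeze to 1-D for soundfile
        return t.squeeze().cpu().numpy() if isinstance(t, torch.Tensor) else np.asarray(t)

    if output is None:
        return None
    # Newer CosyVoice releases return a generator of {'tts_speech': tensor} chunks
    if isinstance(output, types.GeneratorType):
        chunks = [_tensor_to_np(item.get('tts_speech', item) if isinstance(item, dict) else item)
                  for item in output]
        return np.concatenate(chunks) if chunks else None
    # Older releases return a single dict
    if isinstance(output, dict):
        speech = output.get('tts_speech')
        return _tensor_to_np(speech) if speech is not None else None
    # Fall back to treating the result as a tensor/array directly
    return _tensor_to_np(output)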
class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation.

    This engine uses the CosyVoice2 model for TTS generation.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine.

        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        self.model = None

    def _ensure_model(self):
        """Ensure the model is loaded.

        Returns:
            bool: True if the model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()
        return self.model is not None
    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using the CosyVoice2 TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (not currently used by CosyVoice2)
            speed (float): Speech speed multiplier (not currently used by CosyVoice2)

        Returns:
            Optional[str]: Path to the generated audio file, or None if generation fails
        """
        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return None

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return None

        try:
            # Generate unique output path
            output_path = self._generate_output_path(prefix="cosyvoice2")

            # Generate audio using CosyVoice2
            try:
                # Use the pre-trained speaker (SFT) inference API
                output_audio_np = _to_numpy_audio(self.model.inference_sft(text, '中文女'))
            except Exception as api_error:
                # Fall back to the zero-shot API if SFT inference fails; note that some
                # CosyVoice versions expect prompt audio rather than a speaker name here
                try:
                    output_audio_np = _to_numpy_audio(
                        self.model.inference_zero_shot(text, '请输入提示文本', '中文女'))
                except Exception as alt_error:
                    logger.error(f"CosyVoice2 inference failed: {api_error}; fallback failed: {alt_error}")
                    return None

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                # Prefer the model's reported sample rate when it exposes one
                sample_rate = getattr(self.model, 'sample_rate', DEFAULT_SAMPLE_RATE)
                sf.write(output_path, output_audio_np, sample_rate)
                logger.info(f"CosyVoice2 audio generation complete: {output_path}")
                return output_path
            else:
                logger.error("CosyVoice2 model returned None for audio output")
                return None
        except Exception as e:
            logger.error(f"Error generating speech with CosyVoice2: {e}", exc_info=True)
            return None
    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate a speech stream using the CosyVoice2 TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (not currently used by CosyVoice2)
            speed (float): Speech speed multiplier (not currently used by CosyVoice2)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return

        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return

        try:
            # Generate audio using CosyVoice2
            try:
                # Use the pre-trained speaker (SFT) inference API
                output_audio_np = _to_numpy_audio(self.model.inference_sft(text, '中文女'))
            except Exception as api_error:
                # Fall back to the zero-shot API if SFT inference fails; note that some
                # CosyVoice versions expect prompt audio rather than a speaker name here
                try:
                    output_audio_np = _to_numpy_audio(
                        self.model.inference_zero_shot(text, '请输入提示文本', '中文女'))
                except Exception as alt_error:
                    logger.error(f"CosyVoice2 inference failed: {api_error}; fallback failed: {alt_error}")
                    return

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                # The whole utterance is yielded as a single segment
                sample_rate = getattr(self.model, 'sample_rate', DEFAULT_SAMPLE_RATE)
                yield sample_rate, output_audio_np
            else:
                logger.error("CosyVoice2 model returned None for audio output")
                return
        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {e}", exc_info=True)
            return
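
# Minimal usage sketch (assumes the pretrained checkpoint directory referenced above
# exists locally and that utils.tts_base.TTSBase provides _generate_output_path; both
# come from the surrounding project, not from this module):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    tts = CosyVoice2TTS(lang_code='z')

    # One-shot synthesis to a file on disk
    path = tts.generate_speech("你好，世界")
    print(f"Generated file: {path}")

    # Streaming interface: iterate over (sample_rate, audio_data) segments
    for sample_rate, audio in tts.generate_speech_stream("你好，世界"):
        print(f"Segment: {len(audio)} samples at {sample_rate} Hz")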