Spaces:

HumeAI
/

expressive-tts-arena

Running

expressive-tts-arena / src /integrations /elevenlabs_api.py

zach

Improve ElevenLabs integration documentation

f8ddf74 5 months ago

5.73 kB

	"""
	elevenlabs_api.py

	This file defines the interaction with the ElevenLabs text-to-speech (TTS) API using the ElevenLabs Python SDK.
	It includes functionality for API request handling and processing API responses.

	Key Features:
	- Encapsulates all logic related to the ElevenLabs TTS API.
	- Implements retry logic using Tenacity for handling transient API errors.
	- Handles received audio and processes it for playback on the web.
	- Provides detailed logging for debugging and error tracking.
	- Utilizes robust error handling (EAFP) to validate API responses.

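	Classes:
	- ElevenLabsVoice: Enum of the available voices, pairing each display name with its ElevenLabs voice ID.
	- ElevenLabsConfig: Immutable configuration for interacting with ElevenLabs' TTS API.
	- ElevenLabsError: Custom exception for ElevenLabs API-related errors.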

	Functions:
	- text_to_speech_with_elevenlabs: Synthesizes speech from text using ElevenLabs' TTS API.
	"""

	# Standard Library Imports
	from dataclasses import dataclass
	from enum import Enum
	import logging
	import random
	from typing import Literal, Optional, Tuple

	# Third-Party Library Imports
	from elevenlabs import ElevenLabs
	from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_log

	# Local Application Imports
	from src.config import logger
	from src.utils import validate_env_var


	# Display names of the supported voices; keep in sync with the ElevenLabsVoice members.
	ElevenlabsVoiceName = Literal['Adam', 'Antoni', 'Rachel', 'Matilda']


	class ElevenLabsVoice(Enum):
	    """
	    Voices that can be used for speech synthesis.

	    Each member's value is a ``(display name, ElevenLabs voice ID)`` tuple.
	    """
	    ADAM = ('Adam', 'pNInz6obpgDQGcFmaJgB')
	    ANTONI = ('Antoni', 'ErXwobaYiN019PkySvjV')
	    RACHEL = ('Rachel', '21m00Tcm4TlvDq8ikWAM')
	    MATILDA = ('Matilda', 'XrExE9yKIg1WjnnlVkGX')

	    @property
	    def voice_name(self) -> ElevenlabsVoiceName:
	        """Display name of the voice (first element of the member value)."""
	        display_name, _ = self.value
	        return display_name

	    @property
	    def voice_id(self) -> str:
	        """ElevenLabs voice ID (second element of the member value)."""
	        _, identifier = self.value
	        return identifier


	@dataclass(frozen=True)
	class ElevenLabsConfig:
	    """
	    Immutable configuration for interacting with the ElevenLabs TTS API.

	    The ``api_key`` default is evaluated once, when this class body is executed
	    (i.e. at import time), not on each instantiation.

	    NOTE(review): the dataclass-generated ``__repr__`` includes ``api_key``;
	    avoid logging instances of this class.
	    """
	    # Default comes from validate_env_var (src.utils), called at class-definition time.
	    api_key: str = validate_env_var('ELEVENLABS_API_KEY')
	    model_id: str = 'eleven_multilingual_v2'  # ElevenLabs' most emotionally expressive model
	    output_format: str = 'mp3_44100_128'  # Output format of the generated audio

	    def __post_init__(self):
	        """
	        Validate that every required attribute is non-empty.

	        Raises:
	            ValueError: If ``api_key``, ``model_id`` or ``output_format`` is falsy.
	        """
	        if not self.api_key:
	            raise ValueError('ElevenLabs API key is not set.')
	        if not self.model_id:
	            raise ValueError('ElevenLabs Model ID is not set.')
	        if not self.output_format:
	            raise ValueError('ElevenLabs Output Format is not set.')

	    @property
	    def client(self) -> ElevenLabs:
	        """
	        Lazily create the ElevenLabs client and reuse it on later accesses.

	        The client is built on first access and cached on the instance, so
	        repeated calls (including retries) share a single client instead of
	        constructing a new one each time.

	        Returns:
	            ElevenLabs: Configured client instance.
	        """
	        cached = self.__dict__.get('_client')
	        if cached is None:
	            cached = ElevenLabs(api_key=self.api_key)
	            # The dataclass is frozen, so bypass its generated __setattr__ to store
	            # the cache. '_client' is not a dataclass field, so equality, hashing
	            # and repr are unaffected.
	            object.__setattr__(self, '_client', cached)
	        return cached

	    @property
	    def random_voice(self) -> ElevenLabsVoice:
	        """
	        Selects a random ElevenLabs voice.

	        A new choice is made on every access.

	        Returns:
	            ElevenLabsVoice: A randomly selected voice enum member.
	        """
	        return random.choice(list(ElevenLabsVoice))


	class ElevenLabsError(Exception):
	    """
	    Raised when a request to the ElevenLabs TTS API, or handling of its response, fails.

	    Attributes:
	        original_exception: The underlying exception that caused this error, or None.
	    """

	    def __init__(self, message: str, original_exception: Optional[Exception] = None):
	        # Keep a handle on the underlying failure so callers can inspect it.
	        self.original_exception = original_exception
	        super().__init__(message)


	# Module-level ElevenLabs configuration used by text_to_speech_with_elevenlabs.
	# Instantiating it runs ElevenLabsConfig.__post_init__, so an empty API key, model ID
	# or output format raises ValueError when this module is imported. API clients are
	# obtained from it through its `client` property.
	elevenlabs_config = ElevenLabsConfig()


	@retry(
	    stop=stop_after_attempt(3),
	    wait=wait_fixed(2),
	    before=before_log(logger, logging.DEBUG),
	    after=after_log(logger, logging.DEBUG),
	    reraise=True
	)
	def text_to_speech_with_elevenlabs(text: str) -> Tuple[ElevenlabsVoiceName, bytes]:
	    """
	    Synthesizes text to speech using the ElevenLabs TTS API.

	    The call is retried by Tenacity: up to 3 attempts with a fixed 2-second wait,
	    re-raising the last exception when all attempts fail. A voice is picked at
	    random inside the function, so each attempt may use a different voice.

	    Args:
	        text (str): The text to be synthesized to speech.

	    Returns:
	        Tuple[ElevenlabsVoiceName, bytes]: A tuple containing the voice name used for speech synthesis
	        and the raw binary audio data for playback.

	    Raises:
	        ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the
	            response. Each failure is wrapped exactly once; `original_exception` holds the
	            underlying exception when there is one.
	    """
	    logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')

	    # Get a random voice as an enum member.
	    voice = elevenlabs_config.random_voice
	    logger.debug(f"Selected voice: {voice.voice_name}")

	    try:
	        # Synthesize speech using the ElevenLabs SDK
	        audio_iterator = elevenlabs_config.client.text_to_speech.convert(
	            text=text,
	            voice_id=voice.voice_id,
	            model_id=elevenlabs_config.model_id,
	            output_format=elevenlabs_config.output_format,
	        )

	        # Attempt to combine chunks into a single bytes object.
	        # If audio_iterator is not iterable or invalid, an exception will be raised.
	        try:
	            audio = b''.join(audio_iterator)
	        except Exception as iter_error:
	            # Log with traceback here, since this error is no longer re-logged below.
	            logger.exception('Invalid audio iterator response.')
	            raise ElevenLabsError(
	                'Invalid audio iterator received from ElevenLabs API.',
	                original_exception=iter_error,
	            ) from iter_error

	        # Validate audio
	        if not audio:
	            logger.error('No audio data received from ElevenLabs API.')
	            raise ElevenLabsError('Empty audio data received from ElevenLabs API.')

	        logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
	        return voice.voice_name, audio

	    except ElevenLabsError:
	        # Already logged and wrapped above; propagate unchanged instead of
	        # wrapping an ElevenLabsError inside another ElevenLabsError.
	        raise
	    except Exception as e:
	        logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
	        raise ElevenLabsError(
	            message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
	            original_exception=e,
	        ) from e