Spaces:

HumeAI
/

expressive-tts-arena

Running

File size: 5,265 Bytes

a375dbf
 
 
c8f7e68
a375dbf
 
 
 
f8ddf74
a375dbf
 
f8ddf74
a375dbf
 
f8ddf74
5a007ca
a375dbf
 
f8ddf74
a375dbf
 
 
 
 
681c05f
0e508c8
5a007ca
a375dbf
0e508c8
234af57
a375dbf
5a007ca
a375dbf
 
0e508c8
36b195f
 
a375dbf
 
 
d1ed6b1
 
0e508c8
a375dbf
 
 
 
d1ed6b1
 
a375dbf
 
 
 
 
 
 
 
 
 
 
63ef86b
a375dbf
d1ed6b1
a375dbf
 
 
234af57
 
 
 
 
 
 
 
a375dbf
 
 
 
 
 
 
 
 
 
 
d1ed6b1
a375dbf
5bf19b3
a375dbf
0e508c8
a375dbf
 
5bf19b3
8047063
a375dbf
 
5bf19b3
 
 
a375dbf
 
63ef86b
a375dbf
d1ed6b1
7f25817
d1ed6b1
a375dbf
 
bc5091e
7f25817
5bf19b3
a375dbf
0e508c8
a375dbf
 
7f25817
 
 
 
 
a375dbf
0e508c8
d4b2b49
0e508c8
d4b2b49
0e508c8
 
 
5bf19b3
a375dbf
 
3885d80
234af57
 
 
 
 
 
63ef86b
7f25817
a375dbf
7f25817

"""
elevenlabs_api.py

This file defines the interaction with the ElevenLabs text-to-speech (TTS) API using the ElevenLabs Python SDK.
It includes functionality for API request handling and processing API responses.

Key Features:
- Encapsulates all logic related to the ElevenLabs TTS API.
- Implements retry logic using Tenacity for handling transient API errors.
- Handles received audio and processes it for playback on the web.
- Provides detailed logging for debugging and error tracking.
- Utilizes robust error handling (EAFP) to validate API responses.

Classes:
- ElevenLabsConfig: Immutable configuration for interacting with ElevenLabs' TTS API.
- ElevenLabsError: Custom exception for ElevenLabs API-related errors.
- UnretryableElevenLabsError: ElevenLabsError subclass for API errors that should not be retried.

Functions:
- text_to_speech_with_elevenlabs: Synthesizes speech from text using ElevenLabs' TTS API.
"""

# Standard Library Imports
from dataclasses import dataclass
from functools import cached_property
import logging
import random
from typing import Optional, Tuple

# Third-Party Library Imports
from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
from elevenlabs.core import ApiError
from tenacity import (
    after_log,
    before_log,
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_fixed,
)

# Local Application Imports
from src.config import logger
from src.utils import save_base64_audio_to_file, validate_env_var


@dataclass(frozen=True)
class ElevenLabsConfig:
    """
    Immutable configuration for interacting with the ElevenLabs TTS API.

    Attributes:
        api_key (str): ElevenLabs API key. The default is obtained from
            validate_env_var("ELEVENLABS_API_KEY") once, when the class body is evaluated.
        output_format (TextToVoiceCreatePreviewsRequestOutputFormat): Audio output format
            passed to the API; defaults to "mp3_44100_128".

    Raises:
        ValueError: If `api_key` is empty when an instance is created.
    """

    api_key: str = validate_env_var("ELEVENLABS_API_KEY")
    output_format: TextToVoiceCreatePreviewsRequestOutputFormat = "mp3_44100_128"

    def __post_init__(self):
        # Validate that required attributes are set
        if not self.api_key:
            raise ValueError("ElevenLabs API key is not set.")

    # cached_property stores its value directly in the instance __dict__, so it works
    # on a frozen dataclass (the frozen __setattr__ is never invoked) and the client
    # is constructed once per config instead of once per attribute access.
    @cached_property
    def client(self) -> ElevenLabs:
        """
        Lazy initialization of the ElevenLabs client.

        The client is created on first access and reused on subsequent accesses.

        Returns:
            ElevenLabs: Configured client instance.
        """
        return ElevenLabs(api_key=self.api_key)


class ElevenLabsError(Exception):
    """
    Exception raised for failures in the ElevenLabs TTS integration.

    Attributes:
        message (str): Description of the failure; also passed to Exception as its sole argument.
        original_exception (Optional[Exception]): The underlying exception, or None if there is none.
    """

    def __init__(self, message: str, original_exception: Optional[Exception] = None):
        # Record the details on the instance, then let Exception store the message in args.
        self.message = message
        self.original_exception = original_exception
        super().__init__(message)


class UnretryableElevenLabsError(ElevenLabsError):
    """
    ElevenLabsError variant for ElevenLabs TTS API errors that should not be retried.

    Takes the same arguments as ElevenLabsError and forwards them unchanged.
    """

    def __init__(self, message: str, original_exception: Optional[Exception] = None):
        super().__init__(message=message, original_exception=original_exception)


# Module-level ElevenLabs configuration instance, created at import time.
# Its `client` and `output_format` are read by text_to_speech_with_elevenlabs below.
elevenlabs_config = ElevenLabsConfig()


def _should_retry_elevenlabs(error: BaseException) -> bool:
    """Return True for ordinary exceptions, except those explicitly marked as unretryable."""
    # Restricting to Exception keeps tenacity's default of not retrying
    # KeyboardInterrupt / SystemExit and other non-Exception BaseExceptions.
    return isinstance(error, Exception) and not isinstance(
        error, UnretryableElevenLabsError
    )


def _describe_api_error(error: ApiError) -> str:
    """Return the message nested at body['detail']['message'], or str(error) if the body has another shape."""
    try:
        return str(error.body["detail"]["message"])
    except (KeyError, IndexError, TypeError):
        # The body may be None, a plain string, or have a non-dict "detail".
        return str(error)


@retry(
    # Without an explicit predicate tenacity retries on every exception, which would
    # also retry the client errors (4xx) raised as UnretryableElevenLabsError.
    retry=retry_if_exception(_should_retry_elevenlabs),
    stop=stop_after_attempt(3),
    wait=wait_fixed(2),
    before=before_log(logger, logging.DEBUG),
    after=after_log(logger, logging.DEBUG),
    reraise=True,
)
def text_to_speech_with_elevenlabs(character_description: str, text: str) -> Tuple[None, str]:
    """
    Synthesizes text to speech using the ElevenLabs TTS API, processes audio data, and writes audio to a file.

    The call is attempted up to 3 times with a fixed 2 second wait between attempts.
    Errors raised as UnretryableElevenLabsError (API errors with a 4xx status code) are
    not retried.

    Args:
        character_description (str): The original user character description used as the voice description.
        text (str): The text to be synthesized to speech.

    Returns:
        Tuple[None, str]: A tuple containing:
            - generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
            - file_path (str): The value returned by save_base64_audio_to_file for the saved audio.

    Raises:
        UnretryableElevenLabsError: If the ElevenLabs API responds with a 4xx status code.
        ElevenLabsError: If there is any other error communicating with the ElevenLabs API or processing the response.
    """
    logger.debug(
        f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters."
    )

    try:
        # Synthesize speech using the ElevenLabs SDK
        response = elevenlabs_config.client.text_to_voice.create_previews(
            voice_description=character_description,
            text=text,
            output_format=elevenlabs_config.output_format,
        )

        previews = response.previews
        if not previews:
            msg = "No previews returned by ElevenLabs API."
            logger.error(msg)
            raise ElevenLabsError(message=msg)

        # Extract the base64 encoded audio and generated voice ID from a randomly chosen preview
        preview = random.choice(previews)
        generated_voice_id = preview.generated_voice_id
        base64_audio = preview.audio_base_64
        filename = f"{generated_voice_id}.mp3"

        # Write audio to file and return the relative path
        return None, save_base64_audio_to_file(base64_audio, filename)

    except Exception as e:
        logger.exception(f"Error generating text with the ElevenLabs API: {str(e)}")
        if isinstance(e, ApiError):
            status_code = e.status_code
            # status_code may be absent; only a definite 4xx is treated as a client error.
            if status_code is not None and 400 <= status_code < 500:
                detail = _describe_api_error(e)
                raise UnretryableElevenLabsError(
                    message=f'Failed to synthesize speech with ElevenLabs: "{detail}"',
                    original_exception=e,
                ) from e
        raise ElevenLabsError(
            message=f"Failed to synthesize speech with ElevenLabs: {e}",
            original_exception=e,
        ) from e