Spaces:

HumeAI
/

expressive-tts-arena

Running

File size: 8,963 Bytes

adecb62
 
 
c8f7e68
 
adecb62
 
 
 
 
 
 
 
bc5091e
5a007ca
adecb62
 
bc5091e
adecb62
 
 
a375dbf
fc85b67
 
5a007ca
adecb62
 
2f050a8
048c3fc
5a007ca
adecb62
1ed6720
048c3fc
1ed6720
0e508c8
 
fc85b67
adecb62
 
 
 
e9bcee8
d1ed6b1
fc85b67
 
 
 
 
d4b2b49
0e508c8
adecb62
fc85b67
 
d4b2b49
d1ed6b1
0e508c8
 
e9bcee8
fc85b67
 
 
 
 
 
 
 
 
 
adecb62
 
a6d4367
adecb62
d1ed6b1
fc85b67
2f050a8
 
 
 
 
 
 
 
fc85b67
 
adecb62
 
 
a5cafbd
7f25817
a5cafbd
 
 
d1ed6b1
a5cafbd
ba3994f
fc85b67
 
 
 
ba3994f
adecb62
0e508c8
adecb62
ba3994f
 
 
 
 
 
adecb62
ba3994f
 
 
fc85b67
ba3994f
fc85b67
 
adecb62
 
ba3994f
048c3fc
 
adecb62
 
ba3994f
 
 
048c3fc
 
adecb62
d1ed6b1
048c3fc
 
d1ed6b1
adecb62
ba3994f
 
 
1ed6720
5bf19b3
ee8b196
fc85b67
ba3994f
5bf19b3
adecb62
 
bc5091e
adecb62
d4b2b49
adecb62
 
 
7f25817
 
8047063
0e508c8
d4b2b49
0e508c8
 
 
 
fc85b67
ba3994f
1ed6720
0e508c8
ba3994f
 
 
 
1ed6720
ba3994f
0e508c8
 
fc85b67
 
 
 
 
048c3fc
 
 
 
 
 
 
 
 
ba3994f
 
1ed6720
ba3994f
 
 
 
 
 
 
 
 
 
 
 
 
fc85b67
ba3994f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ed6720
ba3994f

"""
hume_api.py

This file defines the interaction with the Hume text-to-speech (TTS) API.
It includes functionality for API request handling and processing API responses.

Key Features:
- Encapsulates all logic related to the Hume TTS API.
- Implements retry logic for handling transient API errors.
- Handles received audio and processes it for playback on the web.
- Provides detailed logging for debugging and error tracking.

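Classes:
- HumeConfig: Immutable configuration for interacting with Hume's TTS API.
- HumeError: Custom exception for Hume API-related errors.
- UnretryableHumeError: HumeError subclass for errors that should not be retried.

Functions:
- text_to_speech_with_hume: Synthesizes speech from text using Hume's TTS API.
- parse_hume_tts_generation: Extracts the generation ID and audio from one generation and saves the audio to a file.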
"""

# Standard Library Imports
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, Literal, Tuple, Union

# Third-Party Library Imports
import requests
from requests.exceptions import HTTPError
from tenacity import (
    after_log,
    before_log,
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_fixed,
)

# Local Application Imports
from src.config import Config, logger
from src.constants import CLIENT_ERROR_CODE, SERVER_ERROR_CODE
from src.utils import save_base64_audio_to_file, validate_env_var

# Closed set of format names that HumeConfig.file_format may hold; the chosen value
# is sent to the API as the request body's format "type".
HumeSupportedFileFormat = Literal["mp3", "pcm", "wav"]
"""Supported audio file formats for the Hume TTS API"""


@dataclass(frozen=True)
class HumeConfig:
    """Immutable configuration for interacting with the Hume TTS API.

    The caller may supply ``url`` and ``file_format``; ``api_key`` and ``headers``
    are computed in ``__post_init__`` from the ``HUME_API_KEY`` environment variable
    and cannot be passed to the constructor.

    Raises:
        ValueError: If ``url`` or ``file_format`` is empty.
        Exception: Whatever ``validate_env_var`` raises when ``HUME_API_KEY``
            cannot be validated.
    """

    # Computed fields.
    # Both carry the secret API key, so they are kept out of the generated repr to
    # avoid leaking it into logs and tracebacks.
    api_key: str = field(init=False, repr=False)
    # `headers` is derived purely from `api_key`; excluding it from comparison keeps
    # equality unchanged in practice and stops the unhashable dict from breaking
    # the generated __hash__ of this frozen dataclass.
    headers: Dict[str, str] = field(init=False, repr=False, compare=False)

    # Provided fields.
    url: str = "https://test-api.hume.ai/v0/tts/octave"
    file_format: HumeSupportedFileFormat = "mp3"

    def __post_init__(self) -> None:
        # Validate required attributes before touching the environment.
        if not self.url:
            raise ValueError("Hume TTS endpoint URL is not set.")
        if not self.file_format:
            raise ValueError("Hume TTS file format is not set.")

        # The dataclass is frozen, so computed attributes must be written through
        # object.__setattr__ rather than normal assignment.
        computed_api_key = validate_env_var("HUME_API_KEY")
        object.__setattr__(self, "api_key", computed_api_key)

        computed_headers = {
            "X-Hume-Api-Key": f"{computed_api_key}",
            "Content-Type": "application/json",
        }
        object.__setattr__(self, "headers", computed_headers)


class HumeError(Exception):
    """Exception raised for failures related to the Hume TTS API.

    Attributes:
        message: The error text; also passed to ``Exception`` so ``str(err)`` returns it.
        original_exception: The underlying exception that caused this error, or
            ``None`` when there is none.
    """

    def __init__(self, message: str, original_exception: Union[Exception, None] = None) -> None:
        self.message = message
        self.original_exception = original_exception
        super().__init__(message)


class UnretryableHumeError(HumeError):
    """Hume TTS API error that signals the request should not be retried.

    Carries the same ``message`` and ``original_exception`` attributes as
    ``HumeError``; both are populated by the base-class initializer.
    """

    def __init__(self, message: str, original_exception: Union[Exception, None] = None) -> None:
        # HumeError.__init__ stores both values, so no extra assignment is needed.
        super().__init__(message=message, original_exception=original_exception)


# (connect, read) timeouts in seconds for the Hume TTS request. Without a timeout,
# `requests` can block forever on a stalled connection. The read timeout is kept
# generous because synthesis happens server-side before the response is sent.
_HUME_REQUEST_TIMEOUT_SECONDS: Tuple[float, float] = (10, 120)


def _should_retry_hume_call(exc: BaseException) -> bool:
    """Return True only for Hume failures that may succeed on a later attempt."""
    return isinstance(exc, HumeError) and not isinstance(exc, UnretryableHumeError)


@retry(
    # Retry only retryable Hume failures: UnretryableHumeError (4xx responses) and
    # argument errors such as ValueError are raised immediately instead of being
    # attempted three times.
    retry=retry_if_exception(_should_retry_hume_call),
    stop=stop_after_attempt(3),
    wait=wait_fixed(2),
    before=before_log(logger, logging.DEBUG),
    after=after_log(logger, logging.DEBUG),
    reraise=True,
)
def text_to_speech_with_hume(
    character_description: str,
    text: str,
    num_generations: int,
    config: Config,
) -> Union[Tuple[str, str], Tuple[str, str, str, str]]:
    """
    Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.

    This function sends a POST request to the Hume TTS API with a character description and text
    to be converted to speech. Depending on the specified number of generations (allowed values: 1 or 2),
    the API returns one or two generations. For each generation, the function extracts the base64-encoded
    audio and the generation ID, saves the audio via `parse_hume_tts_generation`, and returns the
    relevant details.

    Retryable failures (`HumeError` that is not `UnretryableHumeError`) are attempted up to three
    times with a fixed two-second wait; the last error is re-raised.

    Args:
        character_description (str): A description of the character, which is used as contextual input
            for generating the voice. An empty description is sent as null.
        text (str): The text to be converted to speech.
        num_generations (int): The number of audio generations to request from the API.
            Allowed values are 1 or 2. If 1, only a single generation is processed; if 2, a second
            generation is expected in the API response.
        config (Config): The application configuration containing Hume API settings.

    Returns:
        Union[Tuple[str, str], Tuple[str, str, str, str]]:
            - If num_generations == 1: (generation_a_id, audio_a_path).
            - If num_generations == 2: (generation_a_id, audio_a_path, generation_b_id, audio_b_path).

    Raises:
        ValueError: If num_generations is not 1 or 2 (not retried).
        UnretryableHumeError: If a client-side HTTP error (status code in the 4xx range) is
            encountered (not retried).
        HumeError: If there is any other error communicating with the Hume TTS API or processing
            its response, including a timeout or fewer generations than requested. Other
            exceptions raised during the request or processing are wrapped in HumeError.
    """
    logger.debug(
        f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. "
        f"Text length: {len(text)} characters."
    )

    if num_generations < 1 or num_generations > 2:
        raise ValueError("Invalid number of generations specified. Must be 1 or 2.")

    hume_config = config.hume_config
    request_body = {
        "utterances": [{"text": text, "description": character_description or None}],
        "format": {"type": hume_config.file_format},
        "num_generations": num_generations,
    }

    try:
        # Synthesize speech using the Hume TTS API.
        response = requests.post(
            url=hume_config.url,
            headers=hume_config.headers,
            json=request_body,
            timeout=_HUME_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        response_data = response.json()

        generations = response_data.get("generations")
        if not generations:
            msg = "No generations returned by Hume API."
            logger.error(msg)
            raise HumeError(msg)

        # Fail with a clear message instead of an opaque IndexError when the API
        # returns fewer generations than were requested.
        if len(generations) < num_generations:
            msg = f"Hume API returned {len(generations)} generation(s); expected {num_generations}."
            logger.error(msg)
            raise HumeError(msg)

        # Extract the base64 encoded audio and generation ID from each generation.
        generation_a_id, audio_a_path = parse_hume_tts_generation(generations[0], config)
        if num_generations == 1:
            return (generation_a_id, audio_a_path)

        generation_b_id, audio_b_path = parse_hume_tts_generation(generations[1], config)
        return (generation_a_id, audio_a_path, generation_b_id, audio_b_path)

    except HumeError:
        # Already a domain error raised above; do not wrap it a second time.
        raise

    except HTTPError as e:
        # 4xx responses indicate a problem with the request itself, so retrying
        # the same request cannot help.
        if e.response is not None and CLIENT_ERROR_CODE <= e.response.status_code < SERVER_ERROR_CODE:
            raise UnretryableHumeError(
                message=f"{e.response.text}",
                original_exception=e,
            ) from e

        raise HumeError(
            message=f"{e}",
            original_exception=e,
        ) from e

    except Exception as e:
        raise HumeError(
            message=f"{e}",
            original_exception=e,
        ) from e


def parse_hume_tts_generation(generation: Dict[str, Any], config: Config) -> Tuple[str, str]:
    """
    Parse a Hume TTS generation response and save the decoded audio to a file.

    This function extracts the generation ID and the base64-encoded audio from the provided
    dictionary. It then hands the audio to `save_base64_audio_to_file`, naming the file
    `<generation_id>.<file_format>`, where the extension is the format configured in
    `config.hume_config.file_format` (the same format requested from the API). Finally, it
    returns a tuple containing the generation ID and the file path of the saved audio.

    Args:
        generation (Dict[str, Any]): A dictionary representing the TTS generation response from Hume.
            Expected keys are:
                - "generation_id" (str): A unique identifier for the generated audio.
                - "audio" (str): A base64 encoded string of the audio data.
        config (Config): The application configuration used for saving the audio file and for
            determining the audio file extension.

    Returns:
        Tuple[str, str]: A tuple containing:
            - generation_id (str): The unique identifier for the audio generation.
            - audio_path (str): The path returned by `save_base64_audio_to_file`.

    Raises:
        KeyError: If the "generation_id" or "audio" key is missing (or None) in the generation dictionary.
        Exception: Propagates any exceptions raised by save_base64_audio_to_file.
    """
    generation_id = generation.get("generation_id")
    if generation_id is None:
        raise KeyError("The generation dictionary is missing the 'generation_id' key.")

    base64_audio = generation.get("audio")
    if base64_audio is None:
        raise KeyError("The generation dictionary is missing the 'audio' key.")

    # Use the configured format as the extension so that wav/pcm audio is not
    # written to a file mislabelled as ".mp3".
    filename = f"{generation_id}.{config.hume_config.file_format}"
    audio_file_path = save_base64_audio_to_file(base64_audio, filename, config)
    return generation_id, audio_file_path