Spaces:

HumeAI
/

expressive-tts-arena

Running

App Files Files Community

zach committed on Feb 8

Commit

ba3994f

1 Parent(s): 2f050a8

Move business logic out of app.py; refactor code to call Hume once, specifying 2 generations, instead of calling Hume twice

Browse files

Files changed (5) hide show

src/app.py +70 -54
src/integrations/elevenlabs_api.py +10 -7
src/integrations/hume_api.py +79 -21
src/types.py +20 -1
src/utils.py +107 -8

src/app.py CHANGED Viewed

@@ -30,8 +30,12 @@ from src.integrations import (
     text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
-from src.types import ComparisonType, OptionMap, VotingResults
-from src.utils import validate_character_description_length
 def generate_text(
@@ -73,73 +77,85 @@ def generate_text(
 def text_to_speech(
     character_description: str, text: str, generated_text_state: str
-) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
     """
-    Synthesizes two text to speech outputs, loads the two audio players with the
-    output audio, and updates related UI state components.
-        - 50% chance to synthesize one Hume and one Elevenlabs output.
-        - 50% chance to synthesize two Hume outputs.
     Args:
-        character_description (str): The original character_description.
-        text (str): The text to synthesize to speech.
     Returns:
-        A tuple of:
-         - Update for first audio player (with autoplay)
-         - Update for second audio player
-         - A dictionary mapping options to providers
-         - The raw audio value for option B
     Raises:
-        gr.Error: On API or unexpected errors.
     """
     if not text:
         logger.warning("Skipping text-to-speech due to empty text.")
         raise gr.Error("Please generate or enter text to synthesize.")
-    # Hume AI always included in comparison
-    provider_a = constants.HUME_AI
-    # If not using generated text, then only compare Hume to Hume
     text_modified = text != generated_text_state
-    provider_b: constants.TTSProviderName = (
-        constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
-    )
     try:
-        with ThreadPoolExecutor(max_workers=2) as executor:
-            future_audio_a = executor.submit(
-                text_to_speech_with_hume, character_description, text
-            )
-            match provider_b:
-                case constants.HUME_AI:
-                    comparison_type: ComparisonType = constants.HUME_TO_HUME
-                    future_audio_b = executor.submit(
-                        text_to_speech_with_hume, character_description, text
-                    )
-                case constants.ELEVENLABS:
-                    comparison_type: ComparisonType = constants.HUME_TO_ELEVENLABS
-                    future_audio_b = executor.submit(
-                        text_to_speech_with_elevenlabs, character_description, text
-                    )
-                case _:
-                    raise ValueError(f"Unsupported provider: {provider_b}")
-            generation_id_a, audio_a = future_audio_a.result()
-            generation_id_b, audio_b = future_audio_b.result()
-        options = [
-            (provider_a, audio_a, generation_id_a),
-            (provider_b, audio_b, generation_id_b),
-        ]
-        random.shuffle(options)
-        options_map: OptionMap = {
-            constants.OPTION_A: options[0][0],
-            constants.OPTION_B: options[1][0],
-        }
-        option_a_audio, option_b_audio = options[0][1], options[1][1]
-        option_a_generation_id, option_b_generation_id = options[0][2], options[1][2]
         return (
             gr.update(value=option_a_audio, visible=True, autoplay=True),

     text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
+from src.types import ComparisonType, Option, OptionMap, VotingResults
+from src.utils import (
+    choose_providers,
+    create_shuffled_tts_options,
+    validate_character_description_length,
+)
 def generate_text(
 def text_to_speech(
     character_description: str, text: str, generated_text_state: str
+) -> Tuple[gr.update, gr.update, dict, str, ComparisonType, str, str, bool, str, str]:
     """
+    Synthesizes two text-to-speech outputs, updates UI state components, and returns additional TTS metadata.
+    This function generates TTS outputs using different providers based on the input text and its modification
+    state. Depending on the selected providers, it may:
+      - Synthesize one Hume and one ElevenLabs output (50% chance), or
+      - Synthesize two Hume outputs (50% chance).
+    The outputs are processed and shuffled, and the corresponding UI components for two audio players are updated.
+    Additional metadata such as the generation IDs, comparison type, and state information are also returned.
     Args:
+        character_description (str): The description of the character used for generating the voice.
+        text (str): The text content to be synthesized into speech.
+        generated_text_state (str): The previously generated text state, used to determine if the text has been modified.
     Returns:
+        Tuple containing:
+            - gr.update: Update for the first audio player (with autoplay enabled).
+            - gr.update: Update for the second audio player.
+            - dict: A mapping of option constants to their corresponding TTS providers.
+            - str: The raw audio value (relative file path) for option B.
+            - ComparisonType: The comparison type between the selected TTS providers.
+            - str: Generation ID for option A.
+            - str: Generation ID for option B.
+            - bool: Flag indicating whether the text was modified.
+            - str: The original text that was synthesized.
+            - str: The original character description.
     Raises:
+        gr.Error: If any API or unexpected errors occur during the TTS synthesis process.
     """
     if not text:
         logger.warning("Skipping text-to-speech due to empty text.")
         raise gr.Error("Please generate or enter text to synthesize.")
+    # Select 2 TTS providers based on whether the text has been modified.
     text_modified = text != generated_text_state
+    comparison_type, provider_a, provider_b = choose_providers(text_modified)
     try:
+        if provider_b == constants.HUME_AI:
+            # If generating 2 Hume outputs, do so in a single API call
+            (
+                generation_id_a,
+                audio_a,
+                generation_id_b,
+                audio_b,
+            ) = text_to_speech_with_hume(character_description, text, 2)
+        else:
+            with ThreadPoolExecutor(max_workers=2) as executor:
+                # Generate a single Hume output
+                future_audio_a = executor.submit(
+                    text_to_speech_with_hume, character_description, text
+                )
+                # Generate a second TTS output from the second provider
+                match provider_b:
+                    case constants.ELEVENLABS:
+                        future_audio_b = executor.submit(
+                            text_to_speech_with_elevenlabs, character_description, text
+                        )
+                    case _:
+                        # Additional TTS Providers can be added here
+                        raise ValueError(f"Unsupported provider: {provider_b}")
+                generation_id_a, audio_a = future_audio_a.result()
+                generation_id_b, audio_b = future_audio_b.result()
+        # Shuffle options so that placement of options in the UI will always be random
+        (
+            option_a_audio,
+            option_b_audio,
+            option_a_generation_id,
+            option_b_generation_id,
+            options_map,
+        ) = create_shuffled_tts_options(
+            provider_a, audio_a, generation_id_a, provider_b, audio_b, generation_id_b
+        )
         return (
             gr.update(value=option_a_audio, visible=True, autoplay=True),

src/integrations/elevenlabs_api.py CHANGED Viewed

@@ -23,7 +23,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import Optional
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
@@ -85,18 +85,20 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def text_to_speech_with_elevenlabs(character_description: str, text: str) -> bytes:
     """
-    Synthesizes text to speech using the ElevenLabs TTS API, processes audio data, and writes audio to a file.
     Args:
-        character_description (str): The original user character description used as the voice description.
-        text (str): The text to be synthesized to speech.
     Returns:
         Tuple[None, str]: A tuple containing:
             - generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
-            - file_path (str): The relative path to the file where the synthesized audio was saved.
     Raises:
         ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
@@ -124,9 +126,10 @@ def text_to_speech_with_elevenlabs(character_description: str, text: str) -> byt
         generated_voice_id = preview.generated_voice_id
         base64_audio = preview.audio_base_64
         filename = f"{generated_voice_id}.mp3"
         # Write audio to file and return the relative path
-        return None, save_base64_audio_to_file(base64_audio, filename)
     except Exception as e:
         if isinstance(e, ApiError):

 from dataclasses import dataclass
 import logging
 import random
+from typing import Optional, Tuple, Union
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
+def text_to_speech_with_elevenlabs(
+    character_description: str, text: str
+) -> Tuple[None, str]:
     """
+    Synthesizes text to speech using the ElevenLabs TTS API, processes the audio data, and writes it to a file.
     Args:
+        character_description (str): The character description used as the voice description.
+        text (str): The text to be synthesized into speech.
     Returns:
         Tuple[None, str]: A tuple containing:
             - generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
+            - file_path (str): The relative file path to the audio file where the synthesized speech was saved.
     Raises:
         ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
         generated_voice_id = preview.generated_voice_id
         base64_audio = preview.audio_base_64
         filename = f"{generated_voice_id}.mp3"
+        audio_file_path = save_base64_audio_to_file(base64_audio, filename)
         # Write audio to file and return the relative path
+        return None, audio_file_path
     except Exception as e:
         if isinstance(e, ApiError):

src/integrations/hume_api.py CHANGED Viewed

@@ -23,7 +23,7 @@ from dataclasses import dataclass
 import logging
 import os
 import random
-from typing import Literal, Optional
 # Third-Party Library Imports
 import requests
@@ -96,28 +96,50 @@ hume_config = HumeConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def text_to_speech_with_hume(character_description: str, text: str) -> bytes:
     """
     Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
     Args:
-        character_description (str): The original user character description to use as the description for generating the voice.
-        text (str): The generated text to be converted to speech.
     Returns:
-        Tuple[str, str]: A tuple containing:
-            - generation_id (str): The generation ID returned from the Hume API.
-            - file_path (str): The relative path to the file where the synthesized audio was saved.
     Raises:
-        HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
     """
     logger.debug(
         f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. Text length: {len(text)} characters."
     )
     request_body = {
-        "utterances": [{"text": text, "description": character_description}]
     }
     try:
@@ -137,22 +159,58 @@ def text_to_speech_with_hume(character_description: str, text: str) -> bytes:
             raise HumeError(msg)
         # Extract the base64 encoded audio and generation ID from the generation
-        generation = generations[0]
-        generation_id = generation.get("generation_id")
-        base64_audio = generation.get("audio")
-        filename = f"{generation_id}.mp3"
-        # Write audio to file and return the relative path
-        return generation_id, save_base64_audio_to_file(base64_audio, filename)
     except Exception as e:
         if isinstance(e, HTTPError):
             if e.response.status_code >= 400 and e.response.status_code < 500:
                 raise UnretryableHumeError(
-                    message=f'"{e.response.text}"',
-                    original_exception=e,
                 ) from e
-        raise HumeError(
-            message=f"{e}",
-            original_exception=e,
-        ) from e

 import logging
 import os
 import random
+from typing import Any, Dict, Literal, Optional, Tuple, Union
 # Third-Party Library Imports
 import requests
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
+def text_to_speech_with_hume(
+    character_description: str, text: str, num_generations: int = 1
+) -> Union[Tuple[str, str], Tuple[str, str, str, str]]:
     """
     Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
+    This function sends a POST request to the Hume TTS API with a character description and text
+    to be converted to speech. Depending on the specified number of generations (allowed values: 1 or 2),
+    the API returns one or two generations. For each generation, the function extracts the base64-encoded
+    audio and the generation ID, saves the audio as an MP3 file via the `save_base64_audio_to_file` helper,
+    and returns the relevant details.
     Args:
+        character_description (str): A description of the character, which is used as contextual input
+            for generating the voice.
+        text (str): The text to be converted to speech.
+        num_generations (int, optional): The number of audio generations to request from the API.
+            Allowed values are 1 or 2. If 1, only a single generation is processed; if 2, a second
+            generation is expected in the API response. Defaults to 1.
     Returns:
+        Union[Tuple[str, str], Tuple[str, str, str, str]]:
+            - If num_generations == 1: A tuple in the form (generation_a_id, audio_a_path).
+            - If num_generations == 2: A tuple in the form (generation_a_id, audio_a_path, generation_b_id, audio_b_path).
     Raises:
+        ValueError: If num_generations is not 1 or 2.
+        HumeError: If there is an error communicating with the Hume TTS API or parsing its response.
+        UnretryableHumeError: If a client-side HTTP error (status code in the 4xx range) is encountered.
+        Exception: Any other exceptions raised during the request or processing will be wrapped and re-raised as HumeError.
     """
     logger.debug(
         f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. Text length: {len(text)} characters."
     )
+    if num_generations < 1 or num_generations > 2:
+        raise ValueError("Invalid number of generations specified. Must be 1 or 2.")
     request_body = {
+        "utterances": [{"text": text, "description": character_description}],
+        "format": {
+            "type": hume_config.file_format,
+        },
+        "num_generations": num_generations,
     }
     try:
             raise HumeError(msg)
         # Extract the base64 encoded audio and generation ID from the generation
+        generation_a = generations[0]
+        generation_a_id, audio_a_path = parse_hume_tts_generation(generation_a)
+        if num_generations == 1:
+            return (generation_a_id, audio_a_path)
+        generation_b = generations[1]
+        generation_b_id, audio_b_path = parse_hume_tts_generation(generation_b)
+        return (generation_a_id, audio_a_path, generation_b_id, audio_b_path)
     except Exception as e:
         if isinstance(e, HTTPError):
             if e.response.status_code >= 400 and e.response.status_code < 500:
                 raise UnretryableHumeError(
+                    message=f'"{e.response.text}"', original_exception=e
                 ) from e
+        raise HumeError(message=f"{e}", original_exception=e) from e
+def parse_hume_tts_generation(generation: Dict[str, Any]) -> Tuple[str, str]:
+    """
+    Persist a single Hume TTS generation to disk and report where it was written.
+    The generation ID and the base64-encoded audio are read from the supplied dictionary.
+    The audio is handed to `save_base64_audio_to_file` under the file name
+    "<generation_id>.mp3", and the path returned by that helper is passed back to the
+    caller together with the generation ID.
+    Args:
+        generation (Dict[str, Any]): One generation entry from the Hume TTS response.
+            The following keys are read:
+                - "generation_id": identifier of the generated audio.
+                - "audio": base64-encoded audio data.
+    Returns:
+        Tuple[str, str]: A tuple containing:
+            - generation_id (str): The identifier read from the generation entry.
+            - audio_path (str): The path returned by `save_base64_audio_to_file`.
+    Raises:
+        KeyError: If "generation_id" or "audio" is absent from the dictionary or is None.
+        Exception: Anything raised by `save_base64_audio_to_file` propagates unchanged.
+    """
+    # The ID is validated first because it is also needed to name the output file.
+    gen_id = generation.get("generation_id")
+    if gen_id is None:
+        raise KeyError("The generation dictionary is missing the 'generation_id' key.")
+    encoded_audio = generation.get("audio")
+    if encoded_audio is None:
+        raise KeyError("The generation dictionary is missing the 'audio' key.")
+    saved_path = save_base64_audio_to_file(encoded_audio, f"{gen_id}.mp3")
+    return gen_id, saved_path

src/types.py CHANGED Viewed

@@ -5,7 +5,7 @@ This module defines custom types for the application.
 """
 # Standard Library Imports
-from typing import Dict, Literal, TypedDict
 TTSProviderName = Literal["Hume AI", "ElevenLabs"]
@@ -24,6 +24,25 @@ OptionMap = Dict[OptionKey, TTSProviderName]
 """OptionMap defines the structure of the options mapping, where each key is an OptionKey and the value is a TTS provider."""
 class VotingResults(TypedDict):
     """Voting results data structure representing values we want to persist to the votes DB"""

 """
 # Standard Library Imports
+from typing import Dict, Literal, NamedTuple, TypedDict
 TTSProviderName = Literal["Hume AI", "ElevenLabs"]
 """OptionMap defines the structure of the options mapping, where each key is an OptionKey and the value is a TTS provider."""
+class Option(NamedTuple):
+    """
+    Represents a text-to-speech generation option.
+    This type bundles the details of one generated text-to-speech (TTS) output: the provider
+    that produced the audio, the relative file path to the generated audio file, and the
+    generation identifier associated with the output.
+    Attributes:
+        provider (TTSProviderName): The TTS provider that generated the audio.
+        audio (str): The relative file path to the audio file produced by the TTS provider.
+        generation_id (str | None): The identifier for this TTS generation, or None when the
+            provider integration does not record one (the ElevenLabs integration returns
+            None as its generation ID).
+    """
+    provider: TTSProviderName
+    audio: str
+    # Widened from `str`: the ElevenLabs integration supplies None here.
+    generation_id: str | None
 class VotingResults(TypedDict):
     """Voting results data structure representing values we want to persist to the votes DB"""

src/utils.py CHANGED Viewed

@@ -13,13 +13,13 @@ Functions:
 # Standard Library Imports
 import base64
 import os
 # Local Application Imports
 from src.config import AUDIO_DIR, logger
-from src.constants import (
-    CHARACTER_DESCRIPTION_MIN_LENGTH,
-    CHARACTER_DESCRIPTION_MAX_LENGTH,
-)
 def truncate_text(text: str, max_length: int = 50) -> str:
@@ -108,14 +108,14 @@ def validate_character_description_length(character_description: str) -> None:
         f"Voice description length being validated: {character_description_length} characters"
     )
-    if character_description_length < CHARACTER_DESCRIPTION_MIN_LENGTH:
         raise ValueError(
-            f"Your character description is too short. Please enter at least {CHARACTER_DESCRIPTION_MIN_LENGTH} characters. "
             f"(Current length: {character_description_length})"
         )
-    if character_description_length > CHARACTER_DESCRIPTION_MAX_LENGTH:
         raise ValueError(
-            f"Your character description is too long. Please limit it to {CHARACTER_DESCRIPTION_MAX_LENGTH} characters. "
             f"(Current length: {character_description_length})"
         )
     logger.debug(
@@ -162,3 +162,102 @@ def save_base64_audio_to_file(base64_audio: str, filename: str) -> str:
     logger.debug(f"Audio file relative path: {relative_path}")
     return relative_path

 # Standard Library Imports
 import base64
 import os
+import random
+from typing import Tuple
 # Local Application Imports
+from src import constants
 from src.config import AUDIO_DIR, logger
+from src.types import ComparisonType, Option, OptionMap, TTSProviderName
 def truncate_text(text: str, max_length: int = 50) -> str:
         f"Voice description length being validated: {character_description_length} characters"
     )
+    if character_description_length < constants.CHARACTER_DESCRIPTION_MIN_LENGTH:
         raise ValueError(
+            f"Your character description is too short. Please enter at least {constants.CHARACTER_DESCRIPTION_MIN_LENGTH} characters. "
             f"(Current length: {character_description_length})"
         )
+    if character_description_length > constants.CHARACTER_DESCRIPTION_MAX_LENGTH:
         raise ValueError(
+            f"Your character description is too long. Please limit it to {constants.CHARACTER_DESCRIPTION_MAX_LENGTH} characters. "
             f"(Current length: {character_description_length})"
         )
     logger.debug(
     logger.debug(f"Audio file relative path: {relative_path}")
     return relative_path
+def choose_providers(
+    text_modified: bool,
+) -> Tuple[ComparisonType, TTSProviderName, TTSProviderName]:
+    """
+    Select the two TTS providers to compare, along with the matching comparison type.
+    The first provider is always "Hume AI". For the second provider, the function
+    selects "Hume AI" if the text has been modified; otherwise, it randomly chooses one from
+    the TTS_PROVIDERS list. The comparison type is derived from the second provider.
+    Args:
+        text_modified (bool): A flag indicating whether the text has been modified.
+            - If True, both providers will be "Hume AI".
+            - If False, the second provider is randomly selected from TTS_PROVIDERS.
+    Returns:
+        Tuple[ComparisonType, TTSProviderName, TTSProviderName]: A tuple containing:
+            - comparison_type: HUME_TO_HUME when the second provider is "Hume AI",
+              HUME_TO_ELEVENLABS when it is ElevenLabs.
+            - provider_a: Always "Hume AI".
+            - provider_b: The second provider, chosen as described above.
+    Raises:
+        ValueError: If the selected second provider has no associated comparison type.
+    """
+    provider_a = constants.HUME_AI
+    provider_b = (
+        constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
+    )
+    match provider_b:
+        case constants.HUME_AI:
+            comparison_type = constants.HUME_TO_HUME
+        case constants.ELEVENLABS:
+            comparison_type = constants.HUME_TO_ELEVENLABS
+        case _:
+            # Without this branch, an unrecognized entry in TTS_PROVIDERS would leave
+            # comparison_type unassigned and fail below with an UnboundLocalError.
+            raise ValueError(f"Unsupported provider: {provider_b}")
+    return comparison_type, provider_a, provider_b
+def create_shuffled_tts_options(
+    provider_a: TTSProviderName,
+    audio_a: str,
+    generation_id_a: str | None,
+    provider_b: TTSProviderName,
+    audio_b: str,
+    generation_id_b: str | None,
+) -> Tuple[str, str, str | None, str | None, OptionMap]:
+    """
+    Create and shuffle TTS generation options.
+    This function creates two Option instances from the provided TTS details, shuffles them,
+    and returns the audio file paths and generation IDs in shuffled order, together with a
+    mapping from option constants to the corresponding TTS providers.
+    Args:
+        provider_a (TTSProviderName): The TTS provider for the first generation.
+        audio_a (str): The relative file path to the audio file for the first generation.
+        generation_id_a (str | None): The generation ID for the first generation, or None
+            when the provider integration does not record one.
+        provider_b (TTSProviderName): The TTS provider for the second generation.
+        audio_b (str): The relative file path to the audio file for the second generation.
+        generation_id_b (str | None): The generation ID for the second generation, or None
+            when the provider integration does not record one (the ElevenLabs integration
+            returns None as its generation ID).
+    Returns:
+        Tuple[str, str, str | None, str | None, OptionMap]:
+            A tuple containing:
+            - option_a_audio (str): Audio file path for the first shuffled option.
+            - option_b_audio (str): Audio file path for the second shuffled option.
+            - option_a_generation_id (str | None): Generation ID for the first shuffled option.
+            - option_b_generation_id (str | None): Generation ID for the second shuffled option.
+            - options_map (OptionMap): Mapping from option constants to their TTS providers.
+    """
+    options = [
+        Option(provider=provider_a, audio=audio_a, generation_id=generation_id_a),
+        Option(provider=provider_b, audio=audio_b, generation_id=generation_id_b),
+    ]
+    # Shuffle in place so that neither provider is tied to a fixed slot in the UI.
+    random.shuffle(options)
+    option_a, option_b = options
+    # Record which provider ended up behind each option label after the shuffle.
+    options_map: OptionMap = {
+        constants.OPTION_A: option_a.provider,
+        constants.OPTION_B: option_b.provider,
+    }
+    return (
+        option_a.audio,
+        option_b.audio,
+        option_a.generation_id,
+        option_b.generation_id,
+        options_map,
+    )