zach committed on
Commit
7f25817
·
1 Parent(s): 6431bab

Update Hume integration to use the OCTAVE TTS endpoint; update ElevenLabs integration to use the voice design endpoint. Voices are no longer specified explicitly, since they are now generated from the prompt.

Browse files
src/app.py CHANGED
@@ -35,7 +35,6 @@ from src.integrations import (
35
  AnthropicError,
36
  ElevenLabsError,
37
  generate_text_with_claude,
38
- get_random_hume_voice_names,
39
  HumeError,
40
  text_to_speech_with_elevenlabs,
41
  text_to_speech_with_hume,
@@ -114,34 +113,29 @@ def text_to_speech(
114
  random.random() < 0.5
115
  )
116
 
117
- # Pre-select two Hume voices pre-emptively in case we compare Hume to Hume to ensure we do not select the same voice twice.
118
- hume_voice_a, hume_voice_b = get_random_hume_voice_names()
119
-
120
  try:
121
  with ThreadPoolExecutor(max_workers=2) as executor:
122
  provider_a = HUME_AI
123
- future_audio_a = executor.submit(
124
- text_to_speech_with_hume, prompt, text, hume_voice_a
125
- )
126
 
127
  if compare_hume_with_elevenlabs:
128
  provider_b = ELEVENLABS
129
- future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text)
130
- else:
131
- provider_b = HUME_AI
132
  future_audio_b = executor.submit(
133
- text_to_speech_with_hume, prompt, text, hume_voice_b
134
  )
 
 
 
135
 
136
- voice_a, audio_a = future_audio_a.result()
137
- voice_b, audio_b = future_audio_b.result()
138
 
139
  logger.info(
140
  f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
141
  )
142
  options = [
143
- (audio_a, {"provider": provider_a, "voice": voice_a}),
144
- (audio_b, {"provider": provider_b, "voice": voice_b}),
145
  ]
146
  random.shuffle(options)
147
  option_a_audio, option_b_audio = options[0][0], options[1][0]
@@ -179,16 +173,16 @@ def vote(
179
  option_map (OptionMap): A dictionary mapping option labels to their details.
180
  Expected structure:
181
  {
182
- 'Option A': '{"provider": "Hume AI", "voice": "<voice_name>"}',
183
- 'Option B': '{"provider": "ElevenLabs", "voice": "<voice_name>"}'
184
  }
185
  selected_button (str): The button that was clicked.
186
 
187
  Returns:
188
  A tuple of:
189
  - A boolean indicating if the vote was accepted.
190
- - An update for the selected vote button (showing provider, voice, and trophy emoji).
191
- - An update for the unselected vote button (showing provider and voice).
192
  - An update for enabling vote interactions.
193
  """
194
  if not option_map or vote_submitted:
@@ -198,20 +192,12 @@ def vote(
198
  selected_option, other_option = (
199
  (OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
200
  )
201
-
202
- # Parse selected option details from options map
203
- selected_details = option_map.get(selected_option, {})
204
- selected_provider = selected_details.get("provider", UNKNOWN_PROVIDER)
205
- selected_voice = selected_details.get("voice", "")
206
-
207
- # Parse other option details from options map
208
- other_details = option_map.get(other_option, {})
209
- other_provider = other_details.get("provider", UNKNOWN_PROVIDER)
210
- other_voice = other_details.get("voice", "")
211
 
212
  # Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
213
- selected_label = f"{selected_provider} | Voice: {selected_voice} {TROPHY_EMOJI}"
214
- other_label = f"{other_provider} | Voice: {other_voice}"
215
 
216
  return (
217
  True,
@@ -245,7 +231,7 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
245
  """
246
  return (
247
  gr.update(value=None),
248
- gr.update(value=None),
249
  gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
250
  gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
251
  None,
@@ -398,9 +384,13 @@ def build_gradio_interface() -> gr.Blocks:
398
  # 3. Synthesize speech, load audio players, and display vote button
399
  # 4. Enable the "Synthesize speech" button and display vote buttons
400
  synthesize_speech_button.click(
401
- fn=lambda: gr.update(interactive=False),
 
 
 
 
402
  inputs=[],
403
- outputs=[synthesize_speech_button],
404
  ).then(
405
  fn=reset_ui,
406
  inputs=[],
 
35
  AnthropicError,
36
  ElevenLabsError,
37
  generate_text_with_claude,
 
38
  HumeError,
39
  text_to_speech_with_elevenlabs,
40
  text_to_speech_with_hume,
 
113
  random.random() < 0.5
114
  )
115
 
 
 
 
116
  try:
117
  with ThreadPoolExecutor(max_workers=2) as executor:
118
  provider_a = HUME_AI
119
+ future_audio_a = executor.submit(text_to_speech_with_hume, prompt, text)
 
 
120
 
121
  if compare_hume_with_elevenlabs:
122
  provider_b = ELEVENLABS
 
 
 
123
  future_audio_b = executor.submit(
124
+ text_to_speech_with_elevenlabs, prompt, text
125
  )
126
+ else:
127
+ provider_b = HUME_AI
128
+ future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text)
129
 
130
+ audio_a = future_audio_a.result()
131
+ audio_b = future_audio_b.result()
132
 
133
  logger.info(
134
  f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
135
  )
136
  options = [
137
+ (audio_a, provider_a),
138
+ (audio_b, provider_b),
139
  ]
140
  random.shuffle(options)
141
  option_a_audio, option_b_audio = options[0][0], options[1][0]
 
173
  option_map (OptionMap): A dictionary mapping option labels to their details.
174
  Expected structure:
175
  {
176
+ 'Option A': 'Hume AI',
177
+ 'Option B': 'ElevenLabs',
178
  }
179
  selected_button (str): The button that was clicked.
180
 
181
  Returns:
182
  A tuple of:
183
  - A boolean indicating if the vote was accepted.
184
+ - An update for the selected vote button (showing provider and trophy emoji).
185
+ - An update for the unselected vote button (showing provider).
186
  - An update for enabling vote interactions.
187
  """
188
  if not option_map or vote_submitted:
 
192
  selected_option, other_option = (
193
  (OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
194
  )
195
+ selected_provider = option_map.get(selected_option)
196
+ other_provider = option_map.get(other_option)
 
 
 
 
 
 
 
 
197
 
198
  # Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
199
+ selected_label = f"{selected_provider} {TROPHY_EMOJI}"
200
+ other_label = f"{other_provider}"
201
 
202
  return (
203
  True,
 
231
  """
232
  return (
233
  gr.update(value=None),
234
+ gr.update(value=None, autoplay=False),
235
  gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
236
  gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
237
  None,
 
384
  # 3. Synthesize speech, load audio players, and display vote button
385
  # 4. Enable the "Synthesize speech" button and display vote buttons
386
  synthesize_speech_button.click(
387
+ fn=lambda: (
388
+ gr.update(interactive=False),
389
+ gr.update(interactive=False),
390
+ gr.update(interactive=False),
391
+ ),
392
  inputs=[],
393
+ outputs=[synthesize_speech_button, vote_button_a, vote_button_b],
394
  ).then(
395
  fn=reset_ui,
396
  inputs=[],
src/integrations/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  from .anthropic_api import generate_text_with_claude, AnthropicError
2
  from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
3
- from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
 
1
  from .anthropic_api import generate_text_with_claude, AnthropicError
2
  from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
3
+ from .hume_api import text_to_speech_with_hume, HumeError
src/integrations/elevenlabs_api.py CHANGED
@@ -114,58 +114,44 @@ elevenlabs_config = ElevenLabsConfig()
114
  after=after_log(logger, logging.DEBUG),
115
  reraise=True,
116
  )
117
- def text_to_speech_with_elevenlabs(text: str) -> Tuple[ElevenlabsVoiceName, bytes]:
118
  """
119
  Synthesizes text to speech using the ElevenLabs TTS API.
120
 
121
  Args:
 
122
  text (str): The text to be synthesized to speech.
123
 
124
  Returns:
125
- Tuple[ElevenlabsVoiceName, bytes]: A tuple containing the voice name used for speech synthesis
126
- and the raw binary audio data for playback.
127
 
128
  Raises:
129
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
130
  """
131
  logger.debug(
132
- f"Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters."
133
  )
134
 
135
- # Get a random voice as an enum member.
136
- voice = elevenlabs_config.random_voice
137
- logger.debug(f"Selected voice: {voice.voice_name}")
138
-
139
  try:
140
  # Synthesize speech using the ElevenLabs SDK
141
- audio_iterator = elevenlabs_config.client.text_to_speech.convert(
 
142
  text=text,
143
- voice_id=voice.voice_id,
144
- model_id=elevenlabs_config.model_id,
145
- output_format=elevenlabs_config.output_format,
146
  )
147
 
148
- # Attempt to combine chunks into a single bytes object.
149
- # If audio_iterator is not iterable or invalid, an exception will be raised.
150
- try:
151
- audio = b"".join(chunk for chunk in audio_iterator)
152
- except Exception as iter_error:
153
- logger.error("Invalid audio iterator response.")
154
- raise ElevenLabsError(
155
- "Invalid audio iterator received from ElevenLabs API."
156
- ) from iter_error
157
-
158
- # Validate audio
159
- if not audio:
160
- logger.error("No audio data received from ElevenLabs API.")
161
- raise ElevenLabsError("Empty audio data received from ElevenLabs API.")
162
 
163
- logger.info(f"Received ElevenLabs audio ({len(audio)} bytes).")
164
- return voice.voice_name, audio
 
165
 
166
  except Exception as e:
167
- logger.exception(f"Error synthesizing speech from text with Elevenlabs: {e}")
168
  raise ElevenLabsError(
169
- message=f"Failed to synthesize speech from text with ElevenLabs: {e}",
170
  original_exception=e,
171
- )
 
114
  after=after_log(logger, logging.DEBUG),
115
  reraise=True,
116
  )
117
+ def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
118
  """
119
  Synthesizes text to speech using the ElevenLabs TTS API.
120
 
121
  Args:
122
+ prompt (str): The original user prompt used as the voice description.
123
  text (str): The text to be synthesized to speech.
124
 
125
  Returns:
126
+ bytes: The raw binary audio data for playback.
 
127
 
128
  Raises:
129
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
130
  """
131
  logger.debug(
132
+ f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters."
133
  )
134
 
 
 
 
 
135
  try:
136
  # Synthesize speech using the ElevenLabs SDK
137
+ response = elevenlabs_config.client.text_to_voice.create_previews(
138
+ voice_description=prompt,
139
  text=text,
 
 
 
140
  )
141
 
142
+ previews = response.previews
143
+ if not previews:
144
+ msg = "No previews returned by ElevenLabs API."
145
+ logger.error(msg)
146
+ raise ElevenLabsError(message=msg)
 
 
 
 
 
 
 
 
 
147
 
148
+ base64_audio = previews[0].audio_base64
149
+ audio = base64.b64decode(base64_audio)
150
+ return audio
151
 
152
  except Exception as e:
153
+ logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
154
  raise ElevenLabsError(
155
+ message=f"Failed to synthesize speech with ElevenLabs: {e}",
156
  original_exception=e,
157
+ ) from e
src/integrations/hume_api.py CHANGED
@@ -19,6 +19,7 @@ Functions:
19
  """
20
 
21
  # Standard Library Imports
 
22
  from dataclasses import dataclass
23
  import logging
24
  import random
@@ -33,17 +34,12 @@ from src.config import logger
33
  from src.utils import validate_env_var, truncate_text
34
 
35
 
36
- HumeVoiceName = Literal["ITO", "KORA", "STELLA", "DACHER"]
37
-
38
-
39
  @dataclass(frozen=True)
40
  class HumeConfig:
41
  """Immutable configuration for interacting with the Hume TTS API."""
42
 
43
  api_key: str = validate_env_var("HUME_API_KEY")
44
- tts_endpoint_url: str = "https://api.hume.ai/v0/tts"
45
- voice_names: List[HumeVoiceName] = ("ITO", "KORA", "STELLA", "DACHER")
46
- audio_format: str = "wav"
47
  headers: dict = None
48
 
49
  def __post_init__(self):
@@ -52,10 +48,6 @@ class HumeConfig:
52
  raise ValueError("Hume API key is not set.")
53
  if not self.tts_endpoint_url:
54
  raise ValueError("Hume TTS endpoint URL is not set.")
55
- if not self.voice_names:
56
- raise ValueError("Hume voice names list is not set.")
57
- if not self.audio_format:
58
- raise ValueError("Hume audio format is not set.")
59
 
60
  # Set headers dynamically after validation
61
  object.__setattr__(
@@ -81,38 +73,31 @@ hume_config = HumeConfig()
81
 
82
 
83
  @retry(
84
- stop=stop_after_attempt(1),
85
  wait=wait_fixed(2),
86
  before=before_log(logger, logging.DEBUG),
87
  after=after_log(logger, logging.DEBUG),
88
  reraise=True,
89
  )
90
- def text_to_speech_with_hume(
91
- prompt: str, text: str, voice_name: HumeVoiceName
92
- ) -> bytes:
93
  """
94
  Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
95
 
96
  Args:
97
- prompt (str): The original user prompt (for debugging).
98
  text (str): The generated text to be converted to speech.
99
- voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
100
 
101
  Returns:
102
- voice_name: The name of the voice used for speech synthesis.
103
  bytes: The raw binary audio data for playback.
104
 
105
  Raises:
106
- HumeError: If there is an error communicating with the Hume TTS API.
107
  """
108
  logger.debug(
109
  f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
110
  )
111
 
112
- request_body = {
113
- "text": text,
114
- "voice": {"name": voice_name},
115
- }
116
 
117
  try:
118
  # Synthesize speech using the Hume TTS API
@@ -121,42 +106,30 @@ def text_to_speech_with_hume(
121
  headers=hume_config.headers,
122
  json=request_body,
123
  )
 
 
 
 
124
 
125
- # Validate response
126
- if response.status_code != 200:
127
- logger.error(
128
- f"Hume TTS API Error: {response.status_code} - {response.text[:200]}... (truncated)"
129
- )
130
- raise HumeError(
131
- f"Hume TTS API responded with status {response.status_code}: {response.text[:200]}"
132
- )
133
-
134
- # Process response audio
135
- if response.headers.get("Content-Type", "").startswith("audio/"):
136
- audio = response.content # Raw binary audio data
137
- logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
138
- return voice_name, audio
139
-
140
- raise HumeError(
141
- f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}'
142
- )
143
-
144
- except Exception as e:
145
- logger.exception(f"Error synthesizing speech from text with Hume: {e}")
146
- raise HumeError(
147
- message=f"Failed to synthesize speech from text with Hume: {e}",
148
- original_exception=e,
149
- )
150
-
151
-
152
- def get_random_hume_voice_names() -> Tuple[HumeVoiceName, HumeVoiceName]:
153
- """
154
- Get two random Hume voice names.
155
 
156
- Voices:
157
- - ITO
158
- - KORA
159
- - STELLA
160
- - DACHER
161
- """
162
- return tuple(random.sample(hume_config.voice_names, 2))
 
 
 
 
 
 
 
 
 
19
  """
20
 
21
  # Standard Library Imports
22
+ import base64
23
  from dataclasses import dataclass
24
  import logging
25
  import random
 
34
  from src.utils import validate_env_var, truncate_text
35
 
36
 
 
 
 
37
  @dataclass(frozen=True)
38
  class HumeConfig:
39
  """Immutable configuration for interacting with the Hume TTS API."""
40
 
41
  api_key: str = validate_env_var("HUME_API_KEY")
42
+ tts_endpoint_url: str = "https://test-api.hume.ai/v0/tts/octave"
 
 
43
  headers: dict = None
44
 
45
  def __post_init__(self):
 
48
  raise ValueError("Hume API key is not set.")
49
  if not self.tts_endpoint_url:
50
  raise ValueError("Hume TTS endpoint URL is not set.")
 
 
 
 
51
 
52
  # Set headers dynamically after validation
53
  object.__setattr__(
 
73
 
74
 
75
  @retry(
76
+ stop=stop_after_attempt(3),
77
  wait=wait_fixed(2),
78
  before=before_log(logger, logging.DEBUG),
79
  after=after_log(logger, logging.DEBUG),
80
  reraise=True,
81
  )
82
+ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
 
 
83
  """
84
  Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
85
 
86
  Args:
87
+ prompt (str): The original user prompt to use as the description for generating the voice.
88
  text (str): The generated text to be converted to speech.
 
89
 
90
  Returns:
 
91
  bytes: The raw binary audio data for playback.
92
 
93
  Raises:
94
+ HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
95
  """
96
  logger.debug(
97
  f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
98
  )
99
 
100
+ request_body = {"utterances": [{"text": text, "description": prompt}]}
 
 
 
101
 
102
  try:
103
  # Synthesize speech using the Hume TTS API
 
106
  headers=hume_config.headers,
107
  json=request_body,
108
  )
109
+ response.raise_for_status()
110
+ except requests.RequestException as re:
111
+ logger.exception(f"Error communicating with Hume TTS API: {re}")
112
+ raise HumeError(f"Error communicating with Hume TTS API: {re}") from re
113
 
114
+ try:
115
+ # Parse JSON response
116
+ response_data = response.json()
117
+ except ValueError as ve:
118
+ logger.exception("Invalid JSON response from Hume TTS API")
119
+ raise HumeError("Invalid JSON response from Hume TTS API") from ve
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ try:
122
+ # Safely extract the generation result from the response JSON
123
+ generations = response_data.get("generations", [])
124
+ if not generations or "audio" not in generations[0]:
125
+ logger.error("Missing 'audio' data in the response.")
126
+ raise HumeError("Missing audio data in response from Hume TTS API")
127
+ base64_audio = generations[0]["audio"]
128
+ # Decode base64 encoded audio
129
+ audio = base64.b64decode(base64_audio)
130
+ except (KeyError, TypeError, base64.binascii.Error) as ae:
131
+ logger.exception(f"Error processing audio data: {ae}")
132
+ raise HumeError(f"Error processing audio data from Hume TTS API: {ae}") from ae
133
+
134
+ logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
135
+ return audio
src/types.py CHANGED
@@ -9,27 +9,14 @@ has a consistent structure including both the provider and the associated voice.
9
  from typing import TypedDict, Literal, Dict
10
 
11
 
12
- TTSProviderName = Literal["Hume AI", "ElevenLabs", "Unknown"]
13
  """TTSProviderName represents the allowed provider names for TTS services."""
14
 
15
 
16
- class OptionDetails(TypedDict):
17
- """
18
- A typed dictionary representing the details of an option.
19
-
20
- Attributes:
21
- provider (TTSProviderName): The name of the provider (either 'Hume AI' or 'ElevenLabs').
22
- voice (str): The name of the voice associated with the option.
23
- """
24
-
25
- provider: TTSProviderName
26
- voice: str
27
-
28
-
29
  OptionKey = Literal["Option A", "Option B"]
30
  """OptionKey is restricted to the literal values 'Option A' or 'Option B'."""
31
 
32
 
33
- OptionMap = Dict[OptionKey, OptionDetails]
34
  """OptionMap defines the structure of the options mapping, where each key is an OptionKey
35
  and the value is an OptionDetails dictionary."""
 
9
  from typing import TypedDict, Literal, Dict
10
 
11
 
12
+ TTSProviderName = Literal["Hume AI", "ElevenLabs"]
13
  """TTSProviderName represents the allowed provider names for TTS services."""
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  OptionKey = Literal["Option A", "Option B"]
17
  """OptionKey is restricted to the literal values 'Option A' or 'Option B'."""
18
 
19
 
20
+ OptionMap = Dict[OptionKey, TTSProviderName]
21
  """OptionMap defines the structure of the options mapping, where each key is an OptionKey
22
  and the value is an OptionDetails dictionary."""