Spaces:

HumeAI
/

expressive-tts-arena

Running

App Files Files Community

zach committed on Feb 3

Commit

36b195f

1 Parent(s): 514de3d

Restore encapsulation for ElevenLabs integration, update TTS functions to return the voice name in addition to the audio

Browse files

Files changed (4) hide show

src/app.py +4 -6
src/integrations/__init__.py +1 -1
src/integrations/elevenlabs_api.py +46 -45
src/integrations/hume_api.py +3 -2

src/app.py CHANGED Viewed

@@ -35,7 +35,6 @@ from src.integrations import (
     AnthropicError,
     ElevenLabsError,
     generate_text_with_claude,
-    get_random_elevenlabs_voice_id,
     get_random_hume_voice_names,
     HumeError,
     text_to_speech_with_elevenlabs,
@@ -106,9 +105,7 @@ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[g
     # If not using generated text, then only compare Hume to Hume
     compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
-    elevenlabs_voice = get_random_elevenlabs_voice_id()
-    # Get two Hume voices preemptively in case we compare Hume with Hume
-    # to remove chance synthesizing speech twice with the same voice
     hume_voice_a, hume_voice_b = get_random_hume_voice_names()
     try:
@@ -118,12 +115,13 @@ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[g
             if compare_hume_with_elevenlabs:
                 provider_b = ELEVENLABS
-                future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text, elevenlabs_voice)
             else:
                 provider_b = HUME_AI
                 future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
-            audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
         logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
         options = [(audio_a, provider_a), (audio_b, provider_b)]

     AnthropicError,
     ElevenLabsError,
     generate_text_with_claude,
     get_random_hume_voice_names,
     HumeError,
     text_to_speech_with_elevenlabs,
     # If not using generated text, then only compare Hume to Hume
     compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
+    # Pre-select two distinct Hume voices so that a Hume-to-Hume comparison never uses the same voice twice.
     hume_voice_a, hume_voice_b = get_random_hume_voice_names()
     try:
             if compare_hume_with_elevenlabs:
                 provider_b = ELEVENLABS
+                future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text)
             else:
                 provider_b = HUME_AI
                 future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
+            voice_a, audio_a = future_audio_a.result()
+            voice_b, audio_b = future_audio_b.result()
         logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
         options = [(audio_a, provider_a), (audio_b, provider_b)]

src/integrations/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 from .anthropic_api import generate_text_with_claude, AnthropicError
-from .elevenlabs_api import text_to_speech_with_elevenlabs, get_random_elevenlabs_voice_id, ElevenLabsError
 from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError

 from .anthropic_api import generate_text_with_claude, AnthropicError
+from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
 from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError

src/integrations/elevenlabs_api.py CHANGED Viewed

@@ -20,9 +20,10 @@ Functions:
 # Standard Library Imports
 from dataclasses import dataclass
 import logging
 import random
-from typing import Literal, Optional
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs
@@ -30,27 +31,34 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_lo
 # Local Application Imports
 from src.config import logger
-from src.utils import validate_env_var, truncate_text
-ElevenlabsVoiceId = Literal[
-    "pNInz6obpgDQGcFmaJgB",
-    "ErXwobaYiN019PkySvjV",
-    "21m00Tcm4TlvDq8ikWAM",
-    "XrExE9yKIg1WjnnlVkGX"
-]
 @dataclass(frozen=True)
 class ElevenLabsConfig:
     """Immutable configuration for interacting with the ElevenLabs TTS API."""
     api_key: str = validate_env_var('ELEVENLABS_API_KEY')
-    model_id: str = 'eleven_multilingual_v2' # ElevenLab's most emotionally expressive model
-    output_format: str = 'mp3_44100_128' # Output format of the generated audio
-    voice_ids: list[ElevenlabsVoiceId] = (
-        'pNInz6obpgDQGcFmaJgB',  # Adam
-        'ErXwobaYiN019PkySvjV',  # Antoni
-        '21m00Tcm4TlvDq8ikWAM',  # Rachel
-        'XrExE9yKIg1WjnnlVkGX',  # Matilda
-    )
     def __post_init__(self):
         # Validate that required attributes are set
@@ -60,8 +68,6 @@ class ElevenLabsConfig:
             raise ValueError('ElevenLabs Model ID is not set.')
         if not self.output_format:
             raise ValueError('ElevenLabs Output Format is not set.')
-        if not self.voice_ids:
-            raise ValueError('ElevenLabs Voice IDs are not set.')
     @property
     def client(self) -> ElevenLabs:
@@ -74,11 +80,14 @@ class ElevenLabsConfig:
         return ElevenLabs(api_key=self.api_key)
     @property
-    def random_voice_id(self) -> str:
         """
-        Randomly selects a voice ID from the top default voices, ensuring different voices across calls.
         """
-        return random.choice(self.voice_ids)
 class ElevenLabsError(Exception):
@@ -99,38 +108,42 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
     """
     Synthesizes text to speech using the ElevenLabs TTS API.
     Args:
         text (str): The text to be synthesized to speech.
-        voice_id (str): The voice ID for Elevenlabs to use when synthesizing speech.
     Returns:
-        bytes: The raw binary audio data for playback.
     Raises:
         ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
     """
     logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')
     try:
         # Synthesize speech using the ElevenLabs SDK
         audio_iterator = elevenlabs_config.client.text_to_speech.convert(
             text=text,
-            voice_id=voice_id,
             model_id=elevenlabs_config.model_id,
             output_format=elevenlabs_config.output_format,
         )
-       # Ensure the response is an iterator
-        if not hasattr(audio_iterator, '__iter__') or not hasattr(audio_iterator, '__next__'):
             logger.error('Invalid audio iterator response.')
-            raise ElevenLabsError('Invalid audio iterator received from ElevenLabs API.')
-        # Combine chunks into a single bytes object
-        audio = b''.join(chunk for chunk in audio_iterator)
         # Validate audio
         if not audio:
@@ -138,23 +151,11 @@ def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> by
             raise ElevenLabsError('Empty audio data received from ElevenLabs API.')
         logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
-        return audio
     except Exception as e:
         logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
         raise ElevenLabsError(
             message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
             original_exception=e,
-        )
-def get_random_elevenlabs_voice_id() -> ElevenlabsVoiceId:
-    """
-    Get a random Elevenlabs voice ID.
-    Voices:
-        - pNInz6obpgDQGcFmaJgB (Adam)
-        - ErXwobaYiN019PkySvjV (Antoni)
-        - 21m00Tcm4TlvDq8ikWAM (Rachel)
-        - XrExE9yKIg1WjnnlVkGX (Matilda)
-    """
-    return elevenlabs_config.random_voice_id

 # Standard Library Imports
 from dataclasses import dataclass
+from enum import Enum
 import logging
 import random
+from typing import Literal, Optional, Tuple
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs
 # Local Application Imports
 from src.config import logger
+from src.utils import validate_env_var
+ElevenlabsVoiceName = Literal['Adam', 'Antoni', 'Rachel', 'Matilda']
+class ElevenLabsVoice(Enum):
+    ADAM = ('Adam', 'pNInz6obpgDQGcFmaJgB')
+    ANTONI = ('Antoni', 'ErXwobaYiN019PkySvjV')
+    RACHEL = ('Rachel', '21m00Tcm4TlvDq8ikWAM')
+    MATILDA = ('Matilda', 'XrExE9yKIg1WjnnlVkGX')
+    @property
+    def voice_name(self) -> ElevenlabsVoiceName:
+        """Returns the display name of the voice."""
+        return self.value[0]
+    @property
+    def voice_id(self) -> str:
+        """Returns the ElevenLabs voice ID."""
+        return self.value[1]
 @dataclass(frozen=True)
 class ElevenLabsConfig:
     """Immutable configuration for interacting with the ElevenLabs TTS API."""
     api_key: str = validate_env_var('ELEVENLABS_API_KEY')
+    model_id: str = 'eleven_multilingual_v2'  # ElevenLabs' most emotionally expressive model
+    output_format: str = 'mp3_44100_128'  # Output format of the generated audio
     def __post_init__(self):
         # Validate that required attributes are set
             raise ValueError('ElevenLabs Model ID is not set.')
         if not self.output_format:
             raise ValueError('ElevenLabs Output Format is not set.')
     @property
     def client(self) -> ElevenLabs:
         return ElevenLabs(api_key=self.api_key)
     @property
+    def random_voice(self) -> ElevenLabsVoice:
         """
+        Selects a random ElevenLabs voice.
+        Returns:
+            ElevenLabsVoice: A randomly selected voice enum member.
         """
+        return random.choice(list(ElevenLabsVoice))
 class ElevenLabsError(Exception):
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
+def text_to_speech_with_elevenlabs(text: str) -> Tuple[ElevenlabsVoiceName, bytes]:
     """
     Synthesizes text to speech using the ElevenLabs TTS API.
     Args:
         text (str): The text to be synthesized to speech.
     Returns:
+        Tuple[ElevenlabsVoiceName, bytes]: A tuple containing the voice name used for speech synthesis
+                                           and the raw binary audio data for playback.
     Raises:
         ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
     """
     logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')
+    # Get a random voice as an enum member.
+    voice = elevenlabs_config.random_voice
+    logger.debug(f"Selected voice: {voice.voice_name}")
     try:
         # Synthesize speech using the ElevenLabs SDK
         audio_iterator = elevenlabs_config.client.text_to_speech.convert(
             text=text,
+            voice_id=voice.voice_id,
             model_id=elevenlabs_config.model_id,
             output_format=elevenlabs_config.output_format,
         )
+        # Attempt to combine chunks into a single bytes object.
+        # If audio_iterator is not iterable or invalid, an exception will be raised.
+        try:
+            audio = b''.join(chunk for chunk in audio_iterator)
+        except Exception as iter_error:
             logger.error('Invalid audio iterator response.')
+            raise ElevenLabsError('Invalid audio iterator received from ElevenLabs API.') from iter_error
         # Validate audio
         if not audio:
             raise ElevenLabsError('Empty audio data received from ElevenLabs API.')
         logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
+        return voice.voice_name, audio
     except Exception as e:
         logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
         raise ElevenLabsError(
             message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
             original_exception=e,
+        )

src/integrations/hume_api.py CHANGED Viewed

@@ -90,6 +90,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
         voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
     Returns:
         bytes: The raw binary audio data for playback.
     Raises:
@@ -121,7 +122,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
         if response.headers.get('Content-Type', '').startswith('audio/'):
             audio = response.content  # Raw binary audio data
             logger.info(f'Received audio data from Hume ({len(audio)} bytes).')
-            return audio
         raise HumeError(f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}')
@@ -132,7 +133,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
             original_exception=e,
         )
-def get_random_hume_voice_names() -> Tuple[str, str]:
     """
     Get two random Hume voice names.

         voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
     Returns:
+        voice_name: The name of the voice used for speech synthesis (first element of the returned tuple).
         bytes: The raw binary audio data for playback.
     Raises:
         if response.headers.get('Content-Type', '').startswith('audio/'):
             audio = response.content  # Raw binary audio data
             logger.info(f'Received audio data from Hume ({len(audio)} bytes).')
+            return voice_name, audio
         raise HumeError(f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}')
             original_exception=e,
         )
+def get_random_hume_voice_names() -> Tuple[HumeVoiceName, HumeVoiceName]:
     """
     Get two random Hume voice names.