Commit 8047063 · zach committed
Parent(s): bc5091e

Update UI to compare Hume vs Elevenlabs 50% of the time, and Hume vs Hume 50% of the time
Files changed:
- src/app.py +71 -53
- src/constants.py +11 -6
- src/integrations/__init__.py +2 -2
- src/integrations/elevenlabs_api.py +29 -10
- src/integrations/hume_api.py +26 -19
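In outline, the comparison flow this commit introduces looks like the following condensed, standalone sketch (synth_hume and synth_elevenlabs are hypothetical stand-ins for the real integration calls in src/integrations; the voice names are illustrative):

import random

# Stand-ins for text_to_speech_with_hume / text_to_speech_with_elevenlabs.
def synth_hume(text: str, voice: str) -> bytes: return f'<hume:{voice}>'.encode()
def synth_elevenlabs(text: str, voice: str) -> bytes: return f'<elevenlabs:{voice}>'.encode()

HUME_AI, ELEVENLABS = 'Hume', 'ElevenLabs'
OPTION_A, OPTION_B = 'Option A', 'Option B'
text = 'Once upon a time...'

# The first synthesis is always Hume; the second is a coin flip between
# ElevenLabs and a second Hume voice (the flip is hard-coded to False in this commit).
compare_hume_with_elevenlabs = random.random() < 0.5

audio_a, provider_a = synth_hume(text, 'ITO'), HUME_AI
if compare_hume_with_elevenlabs:
    audio_b, provider_b = synth_elevenlabs(text, '21m00Tcm4TlvDq8ikWAM'), ELEVENLABS
else:
    audio_b, provider_b = synth_hume(text, 'KORA'), HUME_AI

# Shuffle so the UI's Option A / Option B ordering doesn't reveal the provider;
# the mapping is kept in state so the vote handler can attribute the result.
options = [(audio_a, provider_a), (audio_b, provider_b)]
random.shuffle(options)
options_map = {OPTION_A: options[0][1], OPTION_B: options[1][1]}
print(options_map)  # e.g. {'Option A': 'ElevenLabs', 'Option B': 'Hume'}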
src/app.py CHANGED
@@ -19,23 +19,27 @@ import gradio as gr
 # Local Application Imports
 from src.config import logger
 from src.constants import (
+    ELEVENLABS,
+    HUME_AI,
+    OPTION_A,
+    OPTION_B,
     PROMPT_MAX_LENGTH,
     PROMPT_MIN_LENGTH,
     SAMPLE_PROMPTS,
     TROPHY_EMOJI,
     UNKNOWN_PROVIDER,
+    VOTE_FOR_OPTION_A,
+    VOTE_FOR_OPTION_B,
 )
 from src.integrations import (
     AnthropicError,
     ElevenLabsError,
     generate_text_with_claude,
+    get_random_elevenlabs_voice_id,
+    get_random_hume_voice_names,
     HumeError,
     text_to_speech_with_elevenlabs,
-    text_to_speech_with_hume
+    text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
 from src.utils import truncate_text, validate_prompt_length
@@ -76,7 +80,9 @@ def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
 
 def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
     """
+    Synthesizes two text to speech outputs and loads the two audio players in the UI with the output audio.
+    - 50% of the time one Hume tts output and one Elevenlabs output will be synthesized.
+    - 50% of the time two Hume tts outputs will be synthesized.
 
     Args:
         prompt (str): The original prompt.
@@ -96,25 +102,37 @@ def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.upda
         logger.warning('Skipping text-to-speech due to empty text.')
         return gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
+    # compare_hume_with_elevenlabs = random.random() < 0.5
+    compare_hume_with_elevenlabs = False
+
+    elevenlabs_voice = get_random_elevenlabs_voice_id()
+    hume_voice_a, hume_voice_b = get_random_hume_voice_names()  # We get two Hume voices preemptively in case we compare Hume with Hume
+
     try:
         with ThreadPoolExecutor(max_workers=2) as executor:
+            provider_a = HUME_AI
+            future_audio_a = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_a)
+
+            if compare_hume_with_elevenlabs:
+                provider_b = ELEVENLABS
+                future_audio_b = executor.submit(text_to_speech_with_elevenlabs, generated_text, elevenlabs_voice)
+            else:
+                provider_b = HUME_AI
+                future_audio_b = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_b)
 
-            elevenlabs_audio = future_elevenlabs.result()
+        audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
 
+        logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
+        options = [(audio_a, provider_a), (audio_b, provider_b)]
         random.shuffle(options)
+        option_a_audio, option_b_audio = options[0][0], options[1][0]
+        options_map = { OPTION_A: options[0][1], OPTION_B: options[1][1] }
 
         return (
+            gr.update(value=option_a_audio, autoplay=True),
+            gr.update(value=option_b_audio),
             options_map,
+            option_b_audio,
        )
     except ElevenLabsError as ee:
         logger.error(f'ElevenLabsError while synthesizing speech from text: {str(ee)}')
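Both synthesis calls are network-bound, so submitting them to a two-worker executor means overall latency tracks the slower provider rather than the sum of both round-trips. A standalone sketch, with a hypothetical fake_tts standing in for the real API calls:

import time
from concurrent.futures import ThreadPoolExecutor

def fake_tts(delay: float) -> bytes:
    """Hypothetical stand-in for a TTS HTTP round-trip."""
    time.sleep(delay)
    return b'audio'

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=2) as executor:
    future_a = executor.submit(fake_tts, 1.0)
    future_b = executor.submit(fake_tts, 1.0)
    audio_a, audio_b = future_a.result(), future_b.result()  # blocks until both finish
print(f'elapsed: {time.perf_counter() - start:.1f}s')  # ~1.0s, not ~2.0s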
@@ -145,16 +163,16 @@ def vote(vote_submitted: bool, option_mapping: dict, selected_button: str) -> Tu
     if not option_mapping or vote_submitted:
         return gr.skip(), gr.skip(), gr.skip()
 
+    is_option_a = selected_button == VOTE_FOR_OPTION_A
+    selected_option, other_option = (OPTION_A, OPTION_B) if is_option_a else (OPTION_B, OPTION_A)
     selected_provider = option_mapping.get(selected_option, UNKNOWN_PROVIDER)
     other_provider = option_mapping.get(other_option, UNKNOWN_PROVIDER)
 
     return (
         True,
+        gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary') if is_option_a
         else gr.update(value=other_provider, variant='secondary'),
+        gr.update(value=other_provider, variant='secondary') if is_option_a
         else gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary'),
     )
 
@@ -164,10 +182,10 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
 
     Returns:
         A tuple of updates for:
+        - option_a_audio_player (clear audio)
+        - option_b_audio_player (clear audio)
+        - vote_button_a (disable and reset button text)
+        - vote_button_b (disable and reset button text)
         - option_mapping_state (reset option map state)
         - option2_audio_state (reset option 2 audio state)
         - vote_submitted_state (reset submitted vote state)
@@ -175,8 +193,8 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
     return (
         gr.update(value=None),
         gr.update(value=None),
+        gr.update(interactive=False, value=VOTE_FOR_OPTION_A, variant='secondary'),
+        gr.update(interactive=False, value=VOTE_FOR_OPTION_B, variant='secondary'),
         None,
         None,
         False,
@@ -204,7 +222,7 @@ def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Butt
         max_length=PROMPT_MAX_LENGTH,
         show_copy_button=True,
     )
-    generate_button = gr.Button('Generate', variant='primary')
+    generate_button = gr.Button('Generate text', variant='primary')
     return instructions, sample_prompt_dropdown, prompt_input, generate_button
 
 
@@ -212,7 +230,7 @@ def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, g
     """Builds the output section including generated text, audio players, and vote buttons."""
     with gr.Column(variant='compact'):
         generated_text = gr.Textbox(
+            label='Text',
             interactive=False,
             autoscroll=False,
             lines=5,
@@ -221,12 +239,12 @@ def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, g
             show_copy_button=True,
         )
     with gr.Row(equal_height=True):
+        option_a_audio_player = gr.Audio(label=OPTION_A, type='filepath', interactive=False)
+        option_b_audio_player = gr.Audio(label=OPTION_B, type='filepath', interactive=False)
     with gr.Row():
+        vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
+        vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
+    return generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b
 
 
 def build_gradio_interface() -> gr.Blocks:
@@ -250,7 +268,7 @@ def build_gradio_interface() -> gr.Blocks:
         instructions, sample_prompt_dropdown, prompt_input, generate_button = build_input_section()
 
         # Build output section
+        generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b = build_output_section()
 
         # UI state components
         option_mapping_state = gr.State()  # Track option map (option 1 and option 2 are randomized)
@@ -280,10 +298,10 @@ def build_gradio_interface() -> gr.Blocks:
             fn=reset_ui,
             inputs=[],
             outputs=[
+                option_a_audio_player,
+                option_b_audio_player,
+                vote_button_a,
+                vote_button_b,
                 option_mapping_state,
                 option2_audio_state,
                 vote_submitted_state,
@@ -296,10 +314,10 @@ def build_gradio_interface() -> gr.Blocks:
             fn=text_to_speech,
             inputs=[prompt_input, generated_text],
             outputs=[
+                option_a_audio_player,
+                option_b_audio_player,
                 option_mapping_state,
+                option2_audio_state,
             ],
         ).then(
             fn=lambda: gr.update(interactive=True),  # Re-enable the button
@@ -308,33 +326,33 @@ def build_gradio_interface() -> gr.Blocks:
         )
 
         # Vote button click handlers
+        vote_button_a.click(
             fn=vote,
+            inputs=[vote_submitted_state, option_mapping_state, vote_button_a],
+            outputs=[vote_submitted_state, vote_button_a, vote_button_b],
         )
+        vote_button_b.click(
             fn=vote,
+            inputs=[vote_submitted_state, option_mapping_state, vote_button_b],
+            outputs=[vote_submitted_state, vote_button_a, vote_button_b],
         )
 
         # Auto-play second audio after first finishes (workaround for playing audio back-to-back)
+        option_a_audio_player.stop(
             fn=lambda _: gr.update(value=None),
             inputs=[],
+            outputs=[option_b_audio_player],
         ).then(
             fn=lambda audio: gr.update(value=audio, autoplay=True),
             inputs=[option2_audio_state],
+            outputs=[option_b_audio_player],
         )
 
         # Enable voting after second audio option playback finishes
+        option_b_audio_player.stop(
             fn=lambda _: (gr.update(interactive=True), gr.update(interactive=True), gr.update(autoplay=False)),
             inputs=[],
+            outputs=[vote_button_a, vote_button_b, option_b_audio_player],
        )
 
         logger.debug('Gradio interface built successfully')
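One detail worth noting in the vote wiring above: the buttons themselves are passed as inputs, and a gr.Button used as an input supplies its label string as its value, which is how vote can compare selected_button == VOTE_FOR_OPTION_A. A minimal standalone sketch of that behavior (hypothetical labels, assuming standard Gradio Blocks semantics):

import gradio as gr

with gr.Blocks() as demo:
    button = gr.Button('Vote for option A')
    result = gr.Textbox()
    # The button is both the trigger and an input: the handler receives its label.
    button.click(fn=lambda label: f'clicked: {label}', inputs=[button], outputs=[result])

# demo.launch()  # clicking the button would display 'clicked: Vote for option A'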
src/constants.py CHANGED
@@ -5,14 +5,19 @@ This module defines global constants used throughout the project.
 """
 
 # UI constants
+HUME_AI: str = 'Hume'
+ELEVENLABS: str = 'ElevenLabs'
+UNKNOWN_PROVIDER: str = 'Unknown'
+
 PROMPT_MIN_LENGTH: int = 10
 PROMPT_MAX_LENGTH: int = 400
+
+OPTION_A: str = 'Option A'
+OPTION_B: str = 'Option B'
+TROPHY_EMOJI: str = '🏆'
+VOTE_FOR_OPTION_A: str = 'Vote for option A'
+VOTE_FOR_OPTION_B: str = 'Vote for option B'
+
 
 # A collection of pre-defined prompts categorized by theme, used to provide users with
 # inspiration for generating creative text for expressive TTS.
src/integrations/__init__.py CHANGED
@@ -1,3 +1,3 @@
 from .anthropic_api import generate_text_with_claude, AnthropicError
-from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
-from .hume_api import text_to_speech_with_hume, HumeError
+from .elevenlabs_api import text_to_speech_with_elevenlabs, get_random_elevenlabs_voice_id, ElevenLabsError
+from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
src/integrations/elevenlabs_api.py CHANGED
@@ -22,7 +22,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import Optional
+from typing import Literal, Optional
 
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs
@@ -32,6 +32,12 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_lo
 from src.config import logger
 from src.utils import validate_env_var, truncate_text
 
+ElevenlabsVoiceId = Literal[
+    "pNInz6obpgDQGcFmaJgB",
+    "ErXwobaYiN019PkySvjV",
+    "21m00Tcm4TlvDq8ikWAM",
+    "XrExE9yKIg1WjnnlVkGX"
+]
 
 @dataclass(frozen=True)
 class ElevenLabsConfig:
@@ -39,7 +45,7 @@ class ElevenLabsConfig:
     api_key: str = validate_env_var('ELEVENLABS_API_KEY')
     model_id: str = 'eleven_multilingual_v2'  # ElevenLab's most emotionally expressive model
     output_format: str = 'mp3_44100_128'  # Output format of the generated audio
+    voice_ids: list[ElevenlabsVoiceId] = (
         'pNInz6obpgDQGcFmaJgB',  # Adam
         'ErXwobaYiN019PkySvjV',  # Antoni
         '21m00Tcm4TlvDq8ikWAM',  # Rachel
@@ -54,8 +60,8 @@ class ElevenLabsConfig:
             raise ValueError('ElevenLabs Model ID is not set.')
         if not self.output_format:
             raise ValueError('ElevenLabs Output Format is not set.')
+        if not self.voice_ids:
+            raise ValueError('ElevenLabs Voice IDs are not set.')
 
     @property
     def client(self) -> ElevenLabs:
@@ -72,7 +78,7 @@ class ElevenLabsConfig:
         """
         Randomly selects a voice ID from the top default voices, ensuring different voices across calls.
         """
+        return random.choice(self.voice_ids)
 
 
 class ElevenLabsError(Exception):
@@ -93,12 +99,13 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_elevenlabs(text: str) -> bytes:
+def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
     """
+    Synthesizes text to speech using the ElevenLabs TTS API.
 
     Args:
+        text (str): The text to be synthesized to speech.
+        voice_id (str): The voice ID for Elevenlabs to use when synthesizing speech.
 
     Returns:
         bytes: The raw binary audio data for playback.
@@ -112,7 +119,7 @@ def text_to_speech_with_elevenlabs(text: str) -> bytes:
     # Synthesize speech using the ElevenLabs SDK
     audio_iterator = elevenlabs_config.client.text_to_speech.convert(
         text=text,
+        voice_id=voice_id,
         model_id=elevenlabs_config.model_id,
         output_format=elevenlabs_config.output_format,
     )
@@ -138,4 +145,16 @@ def text_to_speech_with_elevenlabs(text: str) -> bytes:
     raise ElevenLabsError(
         message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
         original_exception=e,
-    )
+    )
+
+def get_random_elevenlabs_voice_id() -> ElevenlabsVoiceId:
+    """
+    Get a random Elevenlabs voice ID.
+
+    Voices:
+        - pNInz6obpgDQGcFmaJgB (Adam)
+        - ErXwobaYiN019PkySvjV (Antoni)
+        - 21m00Tcm4TlvDq8ikWAM (Rachel)
+        - XrExE9yKIg1WjnnlVkGX (Matilda)
+    """
+    return elevenlabs_config.random_voice_id
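For reference, a hypothetical usage of the two new ElevenLabs helpers (assumes ELEVENLABS_API_KEY is set in the environment; the voice ID in the comment is illustrative):

from src.integrations import get_random_elevenlabs_voice_id, text_to_speech_with_elevenlabs

voice_id = get_random_elevenlabs_voice_id()  # e.g. '21m00Tcm4TlvDq8ikWAM' (Rachel)
audio = text_to_speech_with_elevenlabs('Hello there!', voice_id)

# output_format is mp3_44100_128, so the raw bytes are a playable MP3
with open('sample.mp3', 'wb') as f:
    f.write(audio)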
src/integrations/hume_api.py CHANGED
@@ -22,7 +22,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import List, Optional
+from typing import List, Literal, Optional, Tuple
 
 # Third-Party Library Imports
 import requests
@@ -33,12 +33,14 @@ from src.config import logger
 from src.utils import validate_env_var, truncate_text
 
 
+HumeVoiceName = Literal['ITO', 'KORA', 'STELLA', 'DACHER']
+
 @dataclass(frozen=True)
 class HumeConfig:
     """Immutable configuration for interacting with the Hume TTS API."""
-    tts_endpoint_url: str = 'https://api.hume.ai/v0/tts'
     api_key: str = validate_env_var('HUME_API_KEY')
+    tts_endpoint_url: str = 'https://api.hume.ai/v0/tts'
+    voice_names: List[HumeVoiceName] = ('ITO', 'KORA', 'STELLA', 'DACHER')
     audio_format: str = 'wav'
     headers: dict = None
@@ -46,8 +48,10 @@ class HumeConfig:
         # Validate required attributes
         if not self.api_key:
             raise ValueError('Hume API key is not set.')
+        if not self.tts_endpoint_url:
+            raise ValueError('Hume TTS endpoint URL is not set.')
+        if not self.voice_names:
+            raise ValueError('Hume voice names list is not set.')
         if not self.audio_format:
             raise ValueError('Hume audio format is not set.')
@@ -57,16 +61,6 @@ class HumeConfig:
             'Content-Type': 'application/json',
         })
 
-    @property
-    def random_voice(self) -> str:
-        """
-        Randomly selects a voice from the available voices.
-
-        Returns:
-            str: A randomly chosen voice name.
-        """
-        return random.choice(self.voices)
-
 
 class HumeError(Exception):
     """Custom exception for errors related to the Hume TTS API."""
@@ -86,13 +80,14 @@ hume_config = HumeConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
+def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName) -> bytes:
     """
+    Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
 
     Args:
         prompt (str): The original user prompt (for debugging).
         text (str): The generated text to be converted to speech.
+        voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
 
     Returns:
         bytes: The raw binary audio data for playback.
@@ -105,7 +100,7 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
     request_body = {
         'text': text,
         'voice': {
+            'name': voice_name
         },
     }
@@ -135,4 +130,16 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
     raise HumeError(
         message=f'Failed to synthesize speech from text with Hume: {e}',
         original_exception=e,
-    )
+    )
+
+def get_random_hume_voice_names() -> Tuple[str, str]:
+    """
+    Get two random Hume voice names.
+
+    Voices:
+        - ITO
+        - KORA
+        - STELLA
+        - DACHER
+    """
+    return tuple(random.sample(hume_config.voice_names, 2))
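And a hypothetical usage of the Hume helpers (assumes HUME_API_KEY is set; random.sample picks without replacement, so the two returned names are always distinct voices):

from src.integrations import get_random_hume_voice_names, text_to_speech_with_hume

prompt = 'Write a cheerful greeting.'      # illustrative values
text = 'Hello there, wonderful world!'

voice_a, voice_b = get_random_hume_voice_names()  # e.g. ('STELLA', 'ITO')
audio_a = text_to_speech_with_hume(prompt, text, voice_a)
audio_b = text_to_speech_with_hume(prompt, text, voice_b)  # same text, different voice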