zach committed on
Commit 8047063 · 1 Parent(s): bc5091e

Update UI to compare Hume vs ElevenLabs 50% of the time, and Hume vs Hume 50% of the time

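In short: option A is always Hume, and a coin flip decides whether option B is ElevenLabs or a second Hume voice. A minimal sketch of the pairing logic introduced in src/app.py below (provider constants inlined, synthesis calls omitted):

    import random

    HUME_AI, ELEVENLABS = 'Hume', 'ElevenLabs'

    def pick_providers() -> tuple[str, str]:
        # Option A is always Hume; option B is ElevenLabs half the time
        compare_hume_with_elevenlabs = random.random() < 0.5
        provider_b = ELEVENLABS if compare_hume_with_elevenlabs else HUME_AI
        return HUME_AI, provider_b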
src/app.py CHANGED
@@ -19,23 +19,27 @@ import gradio as gr
 # Local Application Imports
 from src.config import logger
 from src.constants import (
-    OPTION_ONE,
-    OPTION_TWO,
+    ELEVENLABS,
+    HUME_AI,
+    OPTION_A,
+    OPTION_B,
     PROMPT_MAX_LENGTH,
     PROMPT_MIN_LENGTH,
     SAMPLE_PROMPTS,
     TROPHY_EMOJI,
     UNKNOWN_PROVIDER,
-    VOTE_FOR_OPTION_ONE,
-    VOTE_FOR_OPTION_TWO
+    VOTE_FOR_OPTION_A,
+    VOTE_FOR_OPTION_B,
 )
 from src.integrations import (
     AnthropicError,
     ElevenLabsError,
     generate_text_with_claude,
+    get_random_elevenlabs_voice_id,
+    get_random_hume_voice_names,
     HumeError,
     text_to_speech_with_elevenlabs,
-    text_to_speech_with_hume
+    text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
 from src.utils import truncate_text, validate_prompt_length
@@ -76,7 +80,9 @@ def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
 
 def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
     """
-    Synthesizes generated text to speech using Hume and ElevenLabs APIs in parallel.
+    Synthesizes two text-to-speech outputs and loads the two audio players in the UI with the output audio.
+    - 50% of the time one Hume TTS output and one ElevenLabs output will be synthesized.
+    - 50% of the time two Hume TTS outputs will be synthesized.
 
     Args:
         prompt (str): The original prompt.
@@ -96,25 +102,37 @@ def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
         logger.warning('Skipping text-to-speech due to empty text.')
         return gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
+    # Randomly decide whether to compare Hume with ElevenLabs or Hume with itself
+    compare_hume_with_elevenlabs = random.random() < 0.5
+
+    elevenlabs_voice = get_random_elevenlabs_voice_id()
+    hume_voice_a, hume_voice_b = get_random_hume_voice_names()  # We get two Hume voices preemptively in case we compare Hume with Hume
+
     try:
         with ThreadPoolExecutor(max_workers=2) as executor:
-            future_hume = executor.submit(text_to_speech_with_hume, prompt, generated_text)
-            future_elevenlabs = executor.submit(text_to_speech_with_elevenlabs, generated_text)
+            provider_a = HUME_AI
+            future_audio_a = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_a)
+
+            if compare_hume_with_elevenlabs:
+                provider_b = ELEVENLABS
+                future_audio_b = executor.submit(text_to_speech_with_elevenlabs, generated_text, elevenlabs_voice)
+            else:
+                provider_b = HUME_AI
+                future_audio_b = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_b)
 
-            hume_audio = future_hume.result()
-            elevenlabs_audio = future_elevenlabs.result()
+            audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
 
-        logger.info(f'TTS generated: Hume={len(hume_audio)} bytes, ElevenLabs={len(elevenlabs_audio)} bytes')
-        options = [(hume_audio, 'Hume AI'), (elevenlabs_audio, 'ElevenLabs')]
+        logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
+        options = [(audio_a, provider_a), (audio_b, provider_b)]
         random.shuffle(options)
-        option_1_audio, option_2_audio = options[0][0], options[1][0]
-        options_map = { OPTION_ONE: options[0][1], OPTION_TWO: options[1][1] }
+        option_a_audio, option_b_audio = options[0][0], options[1][0]
+        options_map = { OPTION_A: options[0][1], OPTION_B: options[1][1] }
 
         return (
-            gr.update(value=option_1_audio, autoplay=True),
-            gr.update(value=option_2_audio),
+            gr.update(value=option_a_audio, autoplay=True),
+            gr.update(value=option_b_audio),
             options_map,
-            option_2_audio,
+            option_b_audio,
         )
     except ElevenLabsError as ee:
         logger.error(f'ElevenLabsError while synthesizing speech from text: {str(ee)}')
@@ -145,16 +163,16 @@ def vote(vote_submitted: bool, option_mapping: dict, selected_button: str) -> Tuple[bool, gr.update, gr.update]:
     if not option_mapping or vote_submitted:
         return gr.skip(), gr.skip(), gr.skip()
 
-    is_option_1 = selected_button == VOTE_FOR_OPTION_ONE
-    selected_option, other_option = (OPTION_ONE, OPTION_TWO) if is_option_1 else (OPTION_TWO, OPTION_ONE)
+    is_option_a = selected_button == VOTE_FOR_OPTION_A
+    selected_option, other_option = (OPTION_A, OPTION_B) if is_option_a else (OPTION_B, OPTION_A)
     selected_provider = option_mapping.get(selected_option, UNKNOWN_PROVIDER)
     other_provider = option_mapping.get(other_option, UNKNOWN_PROVIDER)
 
     return (
         True,
-        gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary') if is_option_1
+        gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary') if is_option_a
         else gr.update(value=other_provider, variant='secondary'),
-        gr.update(value=other_provider, variant='secondary') if is_option_1
+        gr.update(value=other_provider, variant='secondary') if is_option_a
         else gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary'),
     )
 
@@ -164,10 +182,10 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None, bool]:
 
     Returns:
         A tuple of updates for:
-        - option1_audio_player (clear audio)
-        - option2_audio_player (clear audio)
-        - vote_button_1 (disable and reset button text)
-        - vote_button_2 (disable and reset button text)
+        - option_a_audio_player (clear audio)
+        - option_b_audio_player (clear audio)
+        - vote_button_a (disable and reset button text)
+        - vote_button_b (disable and reset button text)
         - option_mapping_state (reset option map state)
        - option2_audio_state (reset option 2 audio state)
        - vote_submitted_state (reset submitted vote state)
@@ -175,8 +193,8 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None, bool]:
     return (
         gr.update(value=None),
         gr.update(value=None),
-        gr.update(interactive=False, value=VOTE_FOR_OPTION_ONE, variant='secondary'),
-        gr.update(interactive=False, value=VOTE_FOR_OPTION_TWO, variant='secondary'),
+        gr.update(interactive=False, value=VOTE_FOR_OPTION_A, variant='secondary'),
+        gr.update(interactive=False, value=VOTE_FOR_OPTION_B, variant='secondary'),
         None,
         None,
         False,
@@ -204,7 +222,7 @@ def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
         max_length=PROMPT_MAX_LENGTH,
         show_copy_button=True,
     )
-    generate_button = gr.Button('Generate', variant='primary')
+    generate_button = gr.Button('Generate text', variant='primary')
     return instructions, sample_prompt_dropdown, prompt_input, generate_button
 
 
@@ -212,7 +230,7 @@ def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, gr.Button]:
     """Builds the output section including generated text, audio players, and vote buttons."""
     with gr.Column(variant='compact'):
         generated_text = gr.Textbox(
-            label='Generated text',
+            label='Text',
            interactive=False,
            autoscroll=False,
            lines=5,
@@ -221,12 +239,12 @@ def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, gr.Button]:
             show_copy_button=True,
         )
         with gr.Row(equal_height=True):
-            option1_audio_player = gr.Audio(label=OPTION_ONE, type='filepath', interactive=False)
-            option2_audio_player = gr.Audio(label=OPTION_TWO, type='filepath', interactive=False)
+            option_a_audio_player = gr.Audio(label=OPTION_A, type='filepath', interactive=False)
+            option_b_audio_player = gr.Audio(label=OPTION_B, type='filepath', interactive=False)
         with gr.Row():
-            vote_button_1 = gr.Button(VOTE_FOR_OPTION_ONE, interactive=False)
-            vote_button_2 = gr.Button(VOTE_FOR_OPTION_TWO, interactive=False)
-    return generated_text, option1_audio_player, option2_audio_player, vote_button_1, vote_button_2
+            vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
+            vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
+    return generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b
 
 
 def build_gradio_interface() -> gr.Blocks:
@@ -250,7 +268,7 @@ def build_gradio_interface() -> gr.Blocks:
         instructions, sample_prompt_dropdown, prompt_input, generate_button = build_input_section()
 
         # Build output section
-        generated_text, option1_audio_player, option2_audio_player, vote_button_1, vote_button_2 = build_output_section()
+        generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b = build_output_section()
 
         # UI state components
         option_mapping_state = gr.State()  # Track option map (option 1 and option 2 are randomized)
@@ -280,10 +298,10 @@ def build_gradio_interface() -> gr.Blocks:
             fn=reset_ui,
             inputs=[],
             outputs=[
-                option1_audio_player,
-                option2_audio_player,
-                vote_button_1,
-                vote_button_2,
+                option_a_audio_player,
+                option_b_audio_player,
+                vote_button_a,
+                vote_button_b,
                 option_mapping_state,
                 option2_audio_state,
                 vote_submitted_state,
@@ -296,10 +314,10 @@ def build_gradio_interface() -> gr.Blocks:
             fn=text_to_speech,
             inputs=[prompt_input, generated_text],
             outputs=[
-                option1_audio_player,
-                option2_audio_player,
+                option_a_audio_player,
+                option_b_audio_player,
                 option_mapping_state,
-                option2_audio_state
+                option2_audio_state,
             ],
         ).then(
             fn=lambda: gr.update(interactive=True),  # Re-enable the button
@@ -308,33 +326,33 @@ def build_gradio_interface() -> gr.Blocks:
         )
 
         # Vote button click handlers
-        vote_button_1.click(
+        vote_button_a.click(
             fn=vote,
-            inputs=[vote_submitted_state, option_mapping_state, vote_button_1],
-            outputs=[vote_submitted_state, vote_button_1, vote_button_2],
+            inputs=[vote_submitted_state, option_mapping_state, vote_button_a],
+            outputs=[vote_submitted_state, vote_button_a, vote_button_b],
         )
-        vote_button_2.click(
+        vote_button_b.click(
             fn=vote,
-            inputs=[vote_submitted_state, option_mapping_state, vote_button_2],
-            outputs=[vote_submitted_state, vote_button_1, vote_button_2],
+            inputs=[vote_submitted_state, option_mapping_state, vote_button_b],
+            outputs=[vote_submitted_state, vote_button_a, vote_button_b],
         )
 
         # Auto-play second audio after first finishes (workaround for playing audio back-to-back)
-        option1_audio_player.stop(
+        option_a_audio_player.stop(
             fn=lambda _: gr.update(value=None),
             inputs=[],
-            outputs=[option2_audio_player],
+            outputs=[option_b_audio_player],
         ).then(
             fn=lambda audio: gr.update(value=audio, autoplay=True),
             inputs=[option2_audio_state],
-            outputs=[option2_audio_player],
+            outputs=[option_b_audio_player],
         )
 
         # Enable voting after second audio option playback finishes
-        option2_audio_player.stop(
+        option_b_audio_player.stop(
             fn=lambda _: (gr.update(interactive=True), gr.update(interactive=True), gr.update(autoplay=False)),
             inputs=[],
-            outputs=[vote_button_1, vote_button_2, option2_audio_player],
+            outputs=[vote_button_a, vote_button_b, option_b_audio_player],
         )
 
         logger.debug('Gradio interface built successfully')
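
Why a dict keyed by option label is enough state to attribute votes: after the shuffle, only options_map remembers which provider landed in which player, and the vote handler recovers the winner from the clicked button alone. A condensed sketch of that round trip (constants inlined, the gr.update plumbing omitted):

    import random

    OPTION_A, OPTION_B = 'Option A', 'Option B'

    # After synthesis, the shuffle hides which provider fills which player
    options = [(b'audio-bytes-a', 'Hume'), (b'audio-bytes-b', 'ElevenLabs')]
    random.shuffle(options)
    options_map = {OPTION_A: options[0][1], OPTION_B: options[1][1]}

    def winner(selected_button: str) -> str:
        # The button label alone identifies the voted-for provider
        is_option_a = selected_button == 'Vote for option A'
        return options_map[OPTION_A if is_option_a else OPTION_B]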
src/constants.py CHANGED
@@ -5,14 +5,19 @@ This module defines global constants used throughout the project.
 """
 
 # UI constants
+HUME_AI: str = 'Hume'
+ELEVENLABS: str = 'ElevenLabs'
+UNKNOWN_PROVIDER: str = 'Unknown'
+
 PROMPT_MIN_LENGTH: int = 10
 PROMPT_MAX_LENGTH: int = 400
-OPTION_ONE: str = "Option 1"
-OPTION_TWO: str = "Option 2"
-TROPHY_EMOJI: str = "🏆"
-UNKNOWN_PROVIDER: str = "Unknown"
-VOTE_FOR_OPTION_ONE: str = "Vote for option 1"
-VOTE_FOR_OPTION_TWO: str = "Vote for option 2"
+
+OPTION_A: str = 'Option A'
+OPTION_B: str = 'Option B'
+TROPHY_EMOJI: str = '🏆'
+VOTE_FOR_OPTION_A: str = 'Vote for option A'
+VOTE_FOR_OPTION_B: str = 'Vote for option B'
+
 
 # A collection of pre-defined prompts categorized by theme, used to provide users with
 # inspiration for generating creative text for expressive TTS.
src/integrations/__init__.py CHANGED
@@ -1,3 +1,3 @@
 from .anthropic_api import generate_text_with_claude, AnthropicError
-from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
-from .hume_api import text_to_speech_with_hume, HumeError
+from .elevenlabs_api import text_to_speech_with_elevenlabs, get_random_elevenlabs_voice_id, ElevenLabsError
+from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
src/integrations/elevenlabs_api.py CHANGED
@@ -22,7 +22,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import Optional
+from typing import Literal, Optional
 
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs
@@ -32,6 +32,12 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_log
 from src.config import logger
 from src.utils import validate_env_var, truncate_text
 
+ElevenlabsVoiceId = Literal[
+    "pNInz6obpgDQGcFmaJgB",
+    "ErXwobaYiN019PkySvjV",
+    "21m00Tcm4TlvDq8ikWAM",
+    "XrExE9yKIg1WjnnlVkGX"
+]
 
 @dataclass(frozen=True)
 class ElevenLabsConfig:
@@ -39,7 +45,7 @@ class ElevenLabsConfig:
     api_key: str = validate_env_var('ELEVENLABS_API_KEY')
     model_id: str = 'eleven_multilingual_v2'  # ElevenLabs' most emotionally expressive model
     output_format: str = 'mp3_44100_128'  # Output format of the generated audio
-    top_voices: list[str] = (
+    voice_ids: list[ElevenlabsVoiceId] = (
         'pNInz6obpgDQGcFmaJgB',  # Adam
         'ErXwobaYiN019PkySvjV',  # Antoni
         '21m00Tcm4TlvDq8ikWAM',  # Rachel
@@ -54,8 +60,8 @@ class ElevenLabsConfig:
             raise ValueError('ElevenLabs Model ID is not set.')
         if not self.output_format:
             raise ValueError('ElevenLabs Output Format is not set.')
-        if not self.top_voices:
-            raise ValueError('ElevenLabs Top Voices are not set.')
+        if not self.voice_ids:
+            raise ValueError('ElevenLabs Voice IDs are not set.')
 
     @property
     def client(self) -> ElevenLabs:
@@ -72,7 +78,7 @@ class ElevenLabsConfig:
         """
         Randomly selects a voice ID from the top default voices, ensuring different voices across calls.
         """
-        return random.choice(self.top_voices)
+        return random.choice(self.voice_ids)
 
 
 class ElevenLabsError(Exception):
@@ -93,12 +99,13 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_elevenlabs(text: str) -> bytes:
+def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
     """
-    Converts text to speech using the ElevenLabs TTS API.
+    Synthesizes text to speech using the ElevenLabs TTS API.
 
     Args:
-        text (str): The text to be converted to speech.
+        text (str): The text to be synthesized into speech.
+        voice_id (str): The voice ID for ElevenLabs to use when synthesizing speech.
 
     Returns:
         bytes: The raw binary audio data for playback.
@@ -112,7 +119,7 @@ def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
         # Synthesize speech using the ElevenLabs SDK
         audio_iterator = elevenlabs_config.client.text_to_speech.convert(
             text=text,
-            voice_id=elevenlabs_config.random_voice_id,
+            voice_id=voice_id,
             model_id=elevenlabs_config.model_id,
            output_format=elevenlabs_config.output_format,
        )
@@ -138,4 +145,16 @@ def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
         raise ElevenLabsError(
             message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
             original_exception=e,
-        )
+        )
+
+def get_random_elevenlabs_voice_id() -> ElevenlabsVoiceId:
+    """
+    Get a random ElevenLabs voice ID.
+
+    Voices:
+    - pNInz6obpgDQGcFmaJgB (Adam)
+    - ErXwobaYiN019PkySvjV (Antoni)
+    - 21m00Tcm4TlvDq8ikWAM (Rachel)
+    - XrExE9yKIg1WjnnlVkGX (Matilda)
+    """
+    return elevenlabs_config.random_voice_id
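
Callers are now expected to pick a voice up front and pass it in. A usage sketch mirroring the app.py call site (assumes the repo root is on the import path and ELEVENLABS_API_KEY is set in the environment):

    from src.integrations import (
        get_random_elevenlabs_voice_id,
        text_to_speech_with_elevenlabs,
    )

    voice_id = get_random_elevenlabs_voice_id()  # e.g. '21m00Tcm4TlvDq8ikWAM' (Rachel)
    audio = text_to_speech_with_elevenlabs('Hello there!', voice_id)  # raw mp3 bytes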
src/integrations/hume_api.py CHANGED
@@ -22,7 +22,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import List, Optional
+from typing import List, Literal, Optional, Tuple
 
 # Third-Party Library Imports
 import requests
@@ -33,12 +33,14 @@ from src.config import logger
 from src.utils import validate_env_var, truncate_text
 
 
+HumeVoiceName = Literal['ITO', 'KORA', 'STELLA', 'DACHER']
+
 @dataclass(frozen=True)
 class HumeConfig:
     """Immutable configuration for interacting with the Hume TTS API."""
-    tts_endpoint_url: str = 'https://api.hume.ai/v0/tts'
     api_key: str = validate_env_var('HUME_API_KEY')
-    voices: List[str] = ('ITO', 'KORA', 'STELLA')
+    tts_endpoint_url: str = 'https://api.hume.ai/v0/tts'
+    voice_names: List[HumeVoiceName] = ('ITO', 'KORA', 'STELLA', 'DACHER')
     audio_format: str = 'wav'
     headers: dict = None
 
@@ -46,8 +48,10 @@ class HumeConfig:
         # Validate required attributes
         if not self.api_key:
             raise ValueError('Hume API key is not set.')
-        if not self.voices:
-            raise ValueError('Hume voices list is empty. Please provide at least one voice.')
+        if not self.tts_endpoint_url:
+            raise ValueError('Hume TTS endpoint URL is not set.')
+        if not self.voice_names:
+            raise ValueError('Hume voice names list is not set.')
         if not self.audio_format:
             raise ValueError('Hume audio format is not set.')
 
@@ -57,16 +61,6 @@ class HumeConfig:
             'Content-Type': 'application/json',
         })
 
-    @property
-    def random_voice(self) -> str:
-        """
-        Randomly selects a voice from the available voices.
-
-        Returns:
-            str: A randomly chosen voice name.
-        """
-        return random.choice(self.voices)
-
 
 class HumeError(Exception):
     """Custom exception for errors related to the Hume TTS API."""
@@ -86,13 +80,14 @@ hume_config = HumeConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
+def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName) -> bytes:
     """
-    Converts text to speech using the Hume TTS API and processes raw binary audio data.
+    Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
 
     Args:
         prompt (str): The original user prompt (for debugging).
         text (str): The generated text to be converted to speech.
+        voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
 
     Returns:
         bytes: The raw binary audio data for playback.
@@ -105,7 +100,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName) -> bytes:
     request_body = {
         'text': text,
         'voice': {
-            'name': hume_config.random_voice
+            'name': voice_name
         },
     }
 
@@ -135,4 +130,16 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName) -> bytes:
         raise HumeError(
             message=f'Failed to synthesize speech from text with Hume: {e}',
             original_exception=e,
-        )
+        )
+
+def get_random_hume_voice_names() -> Tuple[str, str]:
+    """
+    Get two random Hume voice names.
+
+    Voices:
+    - ITO
+    - KORA
+    - STELLA
+    - DACHER
+    """
+    return tuple(random.sample(hume_config.voice_names, 2))
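
Note that random.sample draws without replacement, so the two names returned by get_random_hume_voice_names are always distinct; that is what keeps a Hume-vs-Hume round from pairing a voice against itself. A quick illustration:

    import random

    voice_names = ('ITO', 'KORA', 'STELLA', 'DACHER')
    voice_a, voice_b = tuple(random.sample(voice_names, 2))
    assert voice_a != voice_b  # sample() never repeats an element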