zach committed on
Commit
7f25817
·
1 Parent(s): 6431bab

Update Hume integration to use the OCTAVE TTS endpoint; update ElevenLabs integration to use the voice design endpoint. Voices are no longer specified explicitly, since they are now generated from the prompt.

Browse files
src/app.py CHANGED
@@ -35,7 +35,6 @@ from src.integrations import (
35
  AnthropicError,
36
  ElevenLabsError,
37
  generate_text_with_claude,
38
- get_random_hume_voice_names,
39
  HumeError,
40
  text_to_speech_with_elevenlabs,
41
  text_to_speech_with_hume,
@@ -114,34 +113,29 @@ def text_to_speech(
114
  random.random() < 0.5
115
  )
116
 
117
- # Pre-select two Hume voices pre-emptively in case we compare Hume to Hume to ensure we do not select the same voice twice.
118
- hume_voice_a, hume_voice_b = get_random_hume_voice_names()
119
-
120
  try:
121
  with ThreadPoolExecutor(max_workers=2) as executor:
122
  provider_a = HUME_AI
123
- future_audio_a = executor.submit(
124
- text_to_speech_with_hume, prompt, text, hume_voice_a
125
- )
126
 
127
  if compare_hume_with_elevenlabs:
128
  provider_b = ELEVENLABS
129
- future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text)
130
- else:
131
- provider_b = HUME_AI
132
  future_audio_b = executor.submit(
133
- text_to_speech_with_hume, prompt, text, hume_voice_b
134
  )
 
 
 
135
 
136
- voice_a, audio_a = future_audio_a.result()
137
- voice_b, audio_b = future_audio_b.result()
138
 
139
  logger.info(
140
  f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
141
  )
142
  options = [
143
- (audio_a, {"provider": provider_a, "voice": voice_a}),
144
- (audio_b, {"provider": provider_b, "voice": voice_b}),
145
  ]
146
  random.shuffle(options)
147
  option_a_audio, option_b_audio = options[0][0], options[1][0]
@@ -179,16 +173,16 @@ def vote(
179
  option_map (OptionMap): A dictionary mapping option labels to their details.
180
  Expected structure:
181
  {
182
- 'Option A': '{"provider": "Hume AI", "voice": "<voice_name>"}',
183
- 'Option B': '{"provider": "ElevenLabs", "voice": "<voice_name>"}'
184
  }
185
  selected_button (str): The button that was clicked.
186
 
187
  Returns:
188
  A tuple of:
189
  - A boolean indicating if the vote was accepted.
190
- - An update for the selected vote button (showing provider, voice, and trophy emoji).
191
- - An update for the unselected vote button (showing provider and voice).
192
  - An update for enabling vote interactions.
193
  """
194
  if not option_map or vote_submitted:
@@ -198,20 +192,12 @@ def vote(
198
  selected_option, other_option = (
199
  (OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
200
  )
201
-
202
- # Parse selected option details from options map
203
- selected_details = option_map.get(selected_option, {})
204
- selected_provider = selected_details.get("provider", UNKNOWN_PROVIDER)
205
- selected_voice = selected_details.get("voice", "")
206
-
207
- # Parse other option details from options map
208
- other_details = option_map.get(other_option, {})
209
- other_provider = other_details.get("provider", UNKNOWN_PROVIDER)
210
- other_voice = other_details.get("voice", "")
211
 
212
  # Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
213
- selected_label = f"{selected_provider} | Voice: {selected_voice} {TROPHY_EMOJI}"
214
- other_label = f"{other_provider} | Voice: {other_voice}"
215
 
216
  return (
217
  True,
@@ -245,7 +231,7 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
245
  """
246
  return (
247
  gr.update(value=None),
248
- gr.update(value=None),
249
  gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
250
  gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
251
  None,
@@ -398,9 +384,13 @@ def build_gradio_interface() -> gr.Blocks:
398
  # 3. Synthesize speech, load audio players, and display vote button
399
  # 4. Enable the "Synthesize speech" button and display vote buttons
400
  synthesize_speech_button.click(
401
- fn=lambda: gr.update(interactive=False),
 
 
 
 
402
  inputs=[],
403
- outputs=[synthesize_speech_button],
404
  ).then(
405
  fn=reset_ui,
406
  inputs=[],
 
35
  AnthropicError,
36
  ElevenLabsError,
37
  generate_text_with_claude,
 
38
  HumeError,
39
  text_to_speech_with_elevenlabs,
40
  text_to_speech_with_hume,
 
113
  random.random() < 0.5
114
  )
115
 
 
 
 
116
  try:
117
  with ThreadPoolExecutor(max_workers=2) as executor:
118
  provider_a = HUME_AI
119
+ future_audio_a = executor.submit(text_to_speech_with_hume, prompt, text)
 
 
120
 
121
  if compare_hume_with_elevenlabs:
122
  provider_b = ELEVENLABS
 
 
 
123
  future_audio_b = executor.submit(
124
+ text_to_speech_with_elevenlabs, prompt, text
125
  )
126
+ else:
127
+ provider_b = HUME_AI
128
+ future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text)
129
 
130
+ audio_a = future_audio_a.result()
131
+ audio_b = future_audio_b.result()
132
 
133
  logger.info(
134
  f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
135
  )
136
  options = [
137
+ (audio_a, provider_a),
138
+ (audio_b, provider_b),
139
  ]
140
  random.shuffle(options)
141
  option_a_audio, option_b_audio = options[0][0], options[1][0]
 
173
  option_map (OptionMap): A dictionary mapping option labels to their details.
174
  Expected structure:
175
  {
176
+ 'Option A': 'Hume AI',
177
+ 'Option B': 'ElevenLabs',
178
  }
179
  selected_button (str): The button that was clicked.
180
 
181
  Returns:
182
  A tuple of:
183
  - A boolean indicating if the vote was accepted.
184
+ - An update for the selected vote button (showing provider and trophy emoji).
185
+ - An update for the unselected vote button (showing provider).
186
  - An update for enabling vote interactions.
187
  """
188
  if not option_map or vote_submitted:
 
192
  selected_option, other_option = (
193
  (OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
194
  )
195
+ selected_provider = option_map.get(selected_option)
196
+ other_provider = option_map.get(other_option)
 
 
 
 
 
 
 
 
197
 
198
  # Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
199
+ selected_label = f"{selected_provider} {TROPHY_EMOJI}"
200
+ other_label = f"{other_provider}"
201
 
202
  return (
203
  True,
 
231
  """
232
  return (
233
  gr.update(value=None),
234
+ gr.update(value=None, autoplay=False),
235
  gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
236
  gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
237
  None,
 
384
  # 3. Synthesize speech, load audio players, and display vote button
385
  # 4. Enable the "Synthesize speech" button and display vote buttons
386
  synthesize_speech_button.click(
387
+ fn=lambda: (
388
+ gr.update(interactive=False),
389
+ gr.update(interactive=False),
390
+ gr.update(interactive=False),
391
+ ),
392
  inputs=[],
393
+ outputs=[synthesize_speech_button, vote_button_a, vote_button_b],
394
  ).then(
395
  fn=reset_ui,
396
  inputs=[],
src/integrations/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  from .anthropic_api import generate_text_with_claude, AnthropicError
2
  from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
3
- from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
 
1
  from .anthropic_api import generate_text_with_claude, AnthropicError
2
  from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
3
+ from .hume_api import text_to_speech_with_hume, HumeError
src/integrations/elevenlabs_api.py CHANGED
@@ -114,58 +114,44 @@ elevenlabs_config = ElevenLabsConfig()
114
  after=after_log(logger, logging.DEBUG),
115
  reraise=True,
116
  )
117
- def text_to_speech_with_elevenlabs(text: str) -> Tuple[ElevenlabsVoiceName, bytes]:
118
  """
119
  Synthesizes text to speech using the ElevenLabs TTS API.
120
 
121
  Args:
 
122
  text (str): The text to be synthesized to speech.
123
 
124
  Returns:
125
- Tuple[ElevenlabsVoiceName, bytes]: A tuple containing the voice name used for speech synthesis
126
- and the raw binary audio data for playback.
127
 
128
  Raises:
129
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
130
  """
131
  logger.debug(
132
- f"Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters."
133
  )
134
 
135
- # Get a random voice as an enum member.
136
- voice = elevenlabs_config.random_voice
137
- logger.debug(f"Selected voice: {voice.voice_name}")
138
-
139
  try:
140
  # Synthesize speech using the ElevenLabs SDK
141
- audio_iterator = elevenlabs_config.client.text_to_speech.convert(
 
142
  text=text,
143
- voice_id=voice.voice_id,
144
- model_id=elevenlabs_config.model_id,
145
- output_format=elevenlabs_config.output_format,
146
  )
147
 
148
- # Attempt to combine chunks into a single bytes object.
149
- # If audio_iterator is not iterable or invalid, an exception will be raised.
150
- try:
151
- audio = b"".join(chunk for chunk in audio_iterator)
152
- except Exception as iter_error:
153
- logger.error("Invalid audio iterator response.")
154
- raise ElevenLabsError(
155
- "Invalid audio iterator received from ElevenLabs API."
156
- ) from iter_error
157
-
158
- # Validate audio
159
- if not audio:
160
- logger.error("No audio data received from ElevenLabs API.")
161
- raise ElevenLabsError("Empty audio data received from ElevenLabs API.")
162
 
163
- logger.info(f"Received ElevenLabs audio ({len(audio)} bytes).")
164
- return voice.voice_name, audio
 
165
 
166
  except Exception as e:
167
- logger.exception(f"Error synthesizing speech from text with Elevenlabs: {e}")
168
  raise ElevenLabsError(
169
- message=f"Failed to synthesize speech from text with ElevenLabs: {e}",
170
  original_exception=e,
171
- )
 
114
  after=after_log(logger, logging.DEBUG),
115
  reraise=True,
116
  )
117
+ def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
118
  """
119
  Synthesizes text to speech using the ElevenLabs TTS API.
120
 
121
  Args:
122
+ prompt (str): The original user prompt used as the voice description.
123
  text (str): The text to be synthesized to speech.
124
 
125
  Returns:
126
+ bytes: The raw binary audio data for playback.
 
127
 
128
  Raises:
129
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
130
  """
131
  logger.debug(
132
+ f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters."
133
  )
134
 
 
 
 
 
135
  try:
136
  # Synthesize speech using the ElevenLabs SDK
137
+ response = elevenlabs_config.client.text_to_voice.create_previews(
138
+ voice_description=prompt,
139
  text=text,
 
 
 
140
  )
141
 
142
+ previews = response.previews
143
+ if not previews:
144
+ msg = "No previews returned by ElevenLabs API."
145
+ logger.error(msg)
146
+ raise ElevenLabsError(message=msg)
 
 
 
 
 
 
 
 
 
147
 
148
+ base64_audio = previews[0].audio_base64
149
+ audio = base64.b64decode(base64_audio)
150
+ return audio
151
 
152
  except Exception as e:
153
+ logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
154
  raise ElevenLabsError(
155
+ message=f"Failed to synthesize speech with ElevenLabs: {e}",
156
  original_exception=e,
157
+ ) from e
src/integrations/hume_api.py CHANGED
@@ -19,6 +19,7 @@ Functions:
19
  """
20
 
21
  # Standard Library Imports
 
22
  from dataclasses import dataclass
23
  import logging
24
  import random
@@ -33,17 +34,12 @@ from src.config import logger
33
  from src.utils import validate_env_var, truncate_text
34
 
35
 
36
- HumeVoiceName = Literal["ITO", "KORA", "STELLA", "DACHER"]
37
-
38
-
39
  @dataclass(frozen=True)
40
  class HumeConfig:
41
  """Immutable configuration for interacting with the Hume TTS API."""
42
 
43
  api_key: str = validate_env_var("HUME_API_KEY")
44
- tts_endpoint_url: str = "https://api.hume.ai/v0/tts"
45
- voice_names: List[HumeVoiceName] = ("ITO", "KORA", "STELLA", "DACHER")
46
- audio_format: str = "wav"
47
  headers: dict = None
48
 
49
  def __post_init__(self):
@@ -52,10 +48,6 @@ class HumeConfig:
52
  raise ValueError("Hume API key is not set.")
53
  if not self.tts_endpoint_url:
54
  raise ValueError("Hume TTS endpoint URL is not set.")
55
- if not self.voice_names:
56
- raise ValueError("Hume voice names list is not set.")
57
- if not self.audio_format:
58
- raise ValueError("Hume audio format is not set.")
59
 
60
  # Set headers dynamically after validation
61
  object.__setattr__(
@@ -81,38 +73,31 @@ hume_config = HumeConfig()
81
 
82
 
83
  @retry(
84
- stop=stop_after_attempt(1),
85
  wait=wait_fixed(2),
86
  before=before_log(logger, logging.DEBUG),
87
  after=after_log(logger, logging.DEBUG),
88
  reraise=True,
89
  )
90
- def text_to_speech_with_hume(
91
- prompt: str, text: str, voice_name: HumeVoiceName
92
- ) -> bytes:
93
  """
94
  Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
95
 
96
  Args:
97
- prompt (str): The original user prompt (for debugging).
98
  text (str): The generated text to be converted to speech.
99
- voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
100
 
101
  Returns:
102
- voice_name: The name of the voice used for speech synthesis.
103
  bytes: The raw binary audio data for playback.
104
 
105
  Raises:
106
- HumeError: If there is an error communicating with the Hume TTS API.
107
  """
108
  logger.debug(
109
  f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
110
  )
111
 
112
- request_body = {
113
- "text": text,
114
- "voice": {"name": voice_name},
115
- }
116
 
117
  try:
118
  # Synthesize speech using the Hume TTS API
@@ -121,42 +106,30 @@ def text_to_speech_with_hume(
121
  headers=hume_config.headers,
122
  json=request_body,
123
  )
 
 
 
 
124
 
125
- # Validate response
126
- if response.status_code != 200:
127
- logger.error(
128
- f"Hume TTS API Error: {response.status_code} - {response.text[:200]}... (truncated)"
129
- )
130
- raise HumeError(
131
- f"Hume TTS API responded with status {response.status_code}: {response.text[:200]}"
132
- )
133
-
134
- # Process response audio
135
- if response.headers.get("Content-Type", "").startswith("audio/"):
136
- audio = response.content # Raw binary audio data
137
- logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
138
- return voice_name, audio
139
-
140
- raise HumeError(
141
- f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}'
142
- )
143
-
144
- except Exception as e:
145
- logger.exception(f"Error synthesizing speech from text with Hume: {e}")
146
- raise HumeError(
147
- message=f"Failed to synthesize speech from text with Hume: {e}",
148
- original_exception=e,
149
- )
150
-
151
-
152
- def get_random_hume_voice_names() -> Tuple[HumeVoiceName, HumeVoiceName]:
153
- """
154
- Get two random Hume voice names.
155
 
156
- Voices:
157
- - ITO
158
- - KORA
159
- - STELLA
160
- - DACHER
161
- """
162
- return tuple(random.sample(hume_config.voice_names, 2))
 
 
 
 
 
 
 
 
 
19
  """
20
 
21
  # Standard Library Imports
22
+ import base64
23
  from dataclasses import dataclass
24
  import logging
25
  import random
 
34
  from src.utils import validate_env_var, truncate_text
35
 
36
 
 
 
 
37
  @dataclass(frozen=True)
38
  class HumeConfig:
39
  """Immutable configuration for interacting with the Hume TTS API."""
40
 
41
  api_key: str = validate_env_var("HUME_API_KEY")
42
+ tts_endpoint_url: str = "https://test-api.hume.ai/v0/tts/octave"
 
 
43
  headers: dict = None
44
 
45
  def __post_init__(self):
 
48
  raise ValueError("Hume API key is not set.")
49
  if not self.tts_endpoint_url:
50
  raise ValueError("Hume TTS endpoint URL is not set.")
 
 
 
 
51
 
52
  # Set headers dynamically after validation
53
  object.__setattr__(
 
73
 
74
 
75
  @retry(
76
+ stop=stop_after_attempt(3),
77
  wait=wait_fixed(2),
78
  before=before_log(logger, logging.DEBUG),
79
  after=after_log(logger, logging.DEBUG),
80
  reraise=True,
81
  )
82
+ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
 
 
83
  """
84
  Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
85
 
86
  Args:
87
+ prompt (str): The original user prompt to use as the description for generating the voice.
88
  text (str): The generated text to be converted to speech.
 
89
 
90
  Returns:
 
91
  bytes: The raw binary audio data for playback.
92
 
93
  Raises:
94
+ HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
95
  """
96
  logger.debug(
97
  f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
98
  )
99
 
100
+ request_body = {"utterances": [{"text": text, "description": prompt}]}
 
 
 
101
 
102
  try:
103
  # Synthesize speech using the Hume TTS API
 
106
  headers=hume_config.headers,
107
  json=request_body,
108
  )
109
+ response.raise_for_status()
110
+ except requests.RequestException as re:
111
+ logger.exception(f"Error communicating with Hume TTS API: {re}")
112
+ raise HumeError(f"Error communicating with Hume TTS API: {re}") from re
113
 
114
+ try:
115
+ # Parse JSON response
116
+ response_data = response.json()
117
+ except ValueError as ve:
118
+ logger.exception("Invalid JSON response from Hume TTS API")
119
+ raise HumeError("Invalid JSON response from Hume TTS API") from ve
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ try:
122
+ # Safely extract the generation result from the response JSON
123
+ generations = response_data.get("generations", [])
124
+ if not generations or "audio" not in generations[0]:
125
+ logger.error("Missing 'audio' data in the response.")
126
+ raise HumeError("Missing audio data in response from Hume TTS API")
127
+ base64_audio = generations[0]["audio"]
128
+ # Decode base64 encoded audio
129
+ audio = base64.b64decode(base64_audio)
130
+ except (KeyError, TypeError, base64.binascii.Error) as ae:
131
+ logger.exception(f"Error processing audio data: {ae}")
132
+ raise HumeError(f"Error processing audio data from Hume TTS API: {ae}") from ae
133
+
134
+ logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
135
+ return audio
src/types.py CHANGED
@@ -9,27 +9,14 @@ has a consistent structure including both the provider and the associated voice.
9
  from typing import TypedDict, Literal, Dict
10
 
11
 
12
- TTSProviderName = Literal["Hume AI", "ElevenLabs", "Unknown"]
13
  """TTSProviderName represents the allowed provider names for TTS services."""
14
 
15
 
16
- class OptionDetails(TypedDict):
17
- """
18
- A typed dictionary representing the details of an option.
19
-
20
- Attributes:
21
- provider (TTSProviderName): The name of the provider (either 'Hume AI' or 'ElevenLabs').
22
- voice (str): The name of the voice associated with the option.
23
- """
24
-
25
- provider: TTSProviderName
26
- voice: str
27
-
28
-
29
  OptionKey = Literal["Option A", "Option B"]
30
  """OptionKey is restricted to the literal values 'Option A' or 'Option B'."""
31
 
32
 
33
- OptionMap = Dict[OptionKey, OptionDetails]
34
  """OptionMap defines the structure of the options mapping, where each key is an OptionKey
35
  and the value is an OptionDetails dictionary."""
 
9
  from typing import TypedDict, Literal, Dict
10
 
11
 
12
+ TTSProviderName = Literal["Hume AI", "ElevenLabs"]
13
  """TTSProviderName represents the allowed provider names for TTS services."""
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  OptionKey = Literal["Option A", "Option B"]
17
  """OptionKey is restricted to the literal values 'Option A' or 'Option B'."""
18
 
19
 
20
+ OptionMap = Dict[OptionKey, TTSProviderName]
21
  """OptionMap defines the structure of the options mapping, where each key is an OptionKey
22
  and the value is an OptionDetails dictionary."""