zach committed on
Commit
36b195f
·
1 Parent(s): 514de3d

Restore encapsulation for ElevenLabs integration, update TTS functions to return the voice name in addition to the audio

Browse files
src/app.py CHANGED
@@ -35,7 +35,6 @@ from src.integrations import (
35
  AnthropicError,
36
  ElevenLabsError,
37
  generate_text_with_claude,
38
- get_random_elevenlabs_voice_id,
39
  get_random_hume_voice_names,
40
  HumeError,
41
  text_to_speech_with_elevenlabs,
@@ -106,9 +105,7 @@ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[g
106
  # If not using generated text, then only compare Hume to Hume
107
  compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
108
 
109
- elevenlabs_voice = get_random_elevenlabs_voice_id()
110
- # Get two Hume voices preemptively in case we compare Hume with Hume
111
- # to remove chance synthesizing speech twice with the same voice
112
  hume_voice_a, hume_voice_b = get_random_hume_voice_names()
113
 
114
  try:
@@ -118,12 +115,13 @@ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[g
118
 
119
  if compare_hume_with_elevenlabs:
120
  provider_b = ELEVENLABS
121
- future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text, elevenlabs_voice)
122
  else:
123
  provider_b = HUME_AI
124
  future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
125
 
126
- audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
 
127
 
128
  logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
129
  options = [(audio_a, provider_a), (audio_b, provider_b)]
 
35
  AnthropicError,
36
  ElevenLabsError,
37
  generate_text_with_claude,
 
38
  get_random_hume_voice_names,
39
  HumeError,
40
  text_to_speech_with_elevenlabs,
 
105
  # If not using generated text, then only compare Hume to Hume
106
  compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
107
 
108
+ # Pre-select two Hume voices in case we compare Hume to Hume, ensuring we do not select the same voice twice.
 
 
109
  hume_voice_a, hume_voice_b = get_random_hume_voice_names()
110
 
111
  try:
 
115
 
116
  if compare_hume_with_elevenlabs:
117
  provider_b = ELEVENLABS
118
+ future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text)
119
  else:
120
  provider_b = HUME_AI
121
  future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
122
 
123
+ voice_a, audio_a = future_audio_a.result()
124
+ voice_b, audio_b = future_audio_b.result()
125
 
126
  logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
127
  options = [(audio_a, provider_a), (audio_b, provider_b)]
src/integrations/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  from .anthropic_api import generate_text_with_claude, AnthropicError
2
- from .elevenlabs_api import text_to_speech_with_elevenlabs, get_random_elevenlabs_voice_id, ElevenLabsError
3
  from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
 
1
  from .anthropic_api import generate_text_with_claude, AnthropicError
2
+ from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
3
  from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
src/integrations/elevenlabs_api.py CHANGED
@@ -20,9 +20,10 @@ Functions:
20
 
21
  # Standard Library Imports
22
  from dataclasses import dataclass
 
23
  import logging
24
  import random
25
- from typing import Literal, Optional
26
 
27
  # Third-Party Library Imports
28
  from elevenlabs import ElevenLabs
@@ -30,27 +31,34 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_lo
30
 
31
  # Local Application Imports
32
  from src.config import logger
33
- from src.utils import validate_env_var, truncate_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- ElevenlabsVoiceId = Literal[
36
- "pNInz6obpgDQGcFmaJgB",
37
- "ErXwobaYiN019PkySvjV",
38
- "21m00Tcm4TlvDq8ikWAM",
39
- "XrExE9yKIg1WjnnlVkGX"
40
- ]
41
 
42
  @dataclass(frozen=True)
43
  class ElevenLabsConfig:
44
  """Immutable configuration for interacting with the ElevenLabs TTS API."""
45
  api_key: str = validate_env_var('ELEVENLABS_API_KEY')
46
- model_id: str = 'eleven_multilingual_v2' # ElevenLab's most emotionally expressive model
47
- output_format: str = 'mp3_44100_128' # Output format of the generated audio
48
- voice_ids: list[ElevenlabsVoiceId] = (
49
- 'pNInz6obpgDQGcFmaJgB', # Adam
50
- 'ErXwobaYiN019PkySvjV', # Antoni
51
- '21m00Tcm4TlvDq8ikWAM', # Rachel
52
- 'XrExE9yKIg1WjnnlVkGX', # Matilda
53
- )
54
 
55
  def __post_init__(self):
56
  # Validate that required attributes are set
@@ -60,8 +68,6 @@ class ElevenLabsConfig:
60
  raise ValueError('ElevenLabs Model ID is not set.')
61
  if not self.output_format:
62
  raise ValueError('ElevenLabs Output Format is not set.')
63
- if not self.voice_ids:
64
- raise ValueError('ElevenLabs Voice IDs are not set.')
65
 
66
  @property
67
  def client(self) -> ElevenLabs:
@@ -74,11 +80,14 @@ class ElevenLabsConfig:
74
  return ElevenLabs(api_key=self.api_key)
75
 
76
  @property
77
- def random_voice_id(self) -> str:
78
  """
79
- Randomly selects a voice ID from the top default voices, ensuring different voices across calls.
 
 
 
80
  """
81
- return random.choice(self.voice_ids)
82
 
83
 
84
  class ElevenLabsError(Exception):
@@ -99,38 +108,42 @@ elevenlabs_config = ElevenLabsConfig()
99
  after=after_log(logger, logging.DEBUG),
100
  reraise=True
101
  )
102
- def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
103
  """
104
  Synthesizes text to speech using the ElevenLabs TTS API.
105
 
106
  Args:
107
  text (str): The text to be synthesized to speech.
108
- voice_id (str): The voice ID for Elevenlabs to use when synthesizing speech.
109
 
110
  Returns:
111
- bytes: The raw binary audio data for playback.
 
112
 
113
  Raises:
114
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
115
  """
116
  logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')
117
 
 
 
 
 
118
  try:
119
  # Synthesize speech using the ElevenLabs SDK
120
  audio_iterator = elevenlabs_config.client.text_to_speech.convert(
121
  text=text,
122
- voice_id=voice_id,
123
  model_id=elevenlabs_config.model_id,
124
  output_format=elevenlabs_config.output_format,
125
  )
126
 
127
- # Ensure the response is an iterator
128
- if not hasattr(audio_iterator, '__iter__') or not hasattr(audio_iterator, '__next__'):
 
 
 
129
  logger.error('Invalid audio iterator response.')
130
- raise ElevenLabsError('Invalid audio iterator received from ElevenLabs API.')
131
-
132
- # Combine chunks into a single bytes object
133
- audio = b''.join(chunk for chunk in audio_iterator)
134
 
135
  # Validate audio
136
  if not audio:
@@ -138,23 +151,11 @@ def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> by
138
  raise ElevenLabsError('Empty audio data received from ElevenLabs API.')
139
 
140
  logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
141
- return audio
142
 
143
  except Exception as e:
144
  logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
145
  raise ElevenLabsError(
146
  message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
147
  original_exception=e,
148
- )
149
-
150
- def get_random_elevenlabs_voice_id() -> ElevenlabsVoiceId:
151
- """
152
- Get a random Elevenlabs voice ID.
153
-
154
- Voices:
155
- - pNInz6obpgDQGcFmaJgB (Adam)
156
- - ErXwobaYiN019PkySvjV (Antoni)
157
- - 21m00Tcm4TlvDq8ikWAM (Rachel)
158
- - XrExE9yKIg1WjnnlVkGX (Matilda)
159
- """
160
- return elevenlabs_config.random_voice_id
 
20
 
21
  # Standard Library Imports
22
  from dataclasses import dataclass
23
+ from enum import Enum
24
  import logging
25
  import random
26
+ from typing import Literal, Optional, Tuple
27
 
28
  # Third-Party Library Imports
29
  from elevenlabs import ElevenLabs
 
31
 
32
  # Local Application Imports
33
  from src.config import logger
34
+ from src.utils import validate_env_var
35
+
36
+
37
+ ElevenlabsVoiceName = Literal['Adam', 'Antoni', 'Rachel', 'Matilda']
38
+
39
+ class ElevenLabsVoice(Enum):
40
+ ADAM = ('Adam', 'pNInz6obpgDQGcFmaJgB')
41
+ ANTONI = ('Antoni', 'ErXwobaYiN019PkySvjV')
42
+ RACHEL = ('Rachel', '21m00Tcm4TlvDq8ikWAM')
43
+ MATILDA = ('Matilda', 'XrExE9yKIg1WjnnlVkGX')
44
+
45
+ @property
46
+ def voice_name(self) -> ElevenlabsVoiceName:
47
+ """Returns the display name of the voice."""
48
+ return self.value[0]
49
+
50
+ @property
51
+ def voice_id(self) -> str:
52
+ """Returns the ElevenLabs voice ID."""
53
+ return self.value[1]
54
 
 
 
 
 
 
 
55
 
56
  @dataclass(frozen=True)
57
  class ElevenLabsConfig:
58
  """Immutable configuration for interacting with the ElevenLabs TTS API."""
59
  api_key: str = validate_env_var('ELEVENLABS_API_KEY')
60
+ model_id: str = 'eleven_multilingual_v2' # ElevenLab's most emotionally expressive model
61
+ output_format: str = 'mp3_44100_128' # Output format of the generated audio
 
 
 
 
 
 
62
 
63
  def __post_init__(self):
64
  # Validate that required attributes are set
 
68
  raise ValueError('ElevenLabs Model ID is not set.')
69
  if not self.output_format:
70
  raise ValueError('ElevenLabs Output Format is not set.')
 
 
71
 
72
  @property
73
  def client(self) -> ElevenLabs:
 
80
  return ElevenLabs(api_key=self.api_key)
81
 
82
  @property
83
+ def random_voice(self) -> ElevenLabsVoice:
84
  """
85
+ Selects a random ElevenLabs voice.
86
+
87
+ Returns:
88
+ ElevenLabsVoice: A randomly selected voice enum member.
89
  """
90
+ return random.choice(list(ElevenLabsVoice))
91
 
92
 
93
  class ElevenLabsError(Exception):
 
108
  after=after_log(logger, logging.DEBUG),
109
  reraise=True
110
  )
111
+ def text_to_speech_with_elevenlabs(text: str) -> Tuple[ElevenlabsVoiceName, bytes]:
112
  """
113
  Synthesizes text to speech using the ElevenLabs TTS API.
114
 
115
  Args:
116
  text (str): The text to be synthesized to speech.
 
117
 
118
  Returns:
119
+ Tuple[ElevenlabsVoiceName, bytes]: A tuple containing the voice name used for speech synthesis
120
+ and the raw binary audio data for playback.
121
 
122
  Raises:
123
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
124
  """
125
  logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')
126
 
127
+ # Get a random voice as an enum member.
128
+ voice = elevenlabs_config.random_voice
129
+ logger.debug(f"Selected voice: {voice.voice_name}")
130
+
131
  try:
132
  # Synthesize speech using the ElevenLabs SDK
133
  audio_iterator = elevenlabs_config.client.text_to_speech.convert(
134
  text=text,
135
+ voice_id=voice.voice_id,
136
  model_id=elevenlabs_config.model_id,
137
  output_format=elevenlabs_config.output_format,
138
  )
139
 
140
+ # Attempt to combine chunks into a single bytes object.
141
+ # If audio_iterator is not iterable or invalid, an exception will be raised.
142
+ try:
143
+ audio = b''.join(chunk for chunk in audio_iterator)
144
+ except Exception as iter_error:
145
  logger.error('Invalid audio iterator response.')
146
+ raise ElevenLabsError('Invalid audio iterator received from ElevenLabs API.') from iter_error
 
 
 
147
 
148
  # Validate audio
149
  if not audio:
 
151
  raise ElevenLabsError('Empty audio data received from ElevenLabs API.')
152
 
153
  logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
154
+ return voice.voice_name, audio
155
 
156
  except Exception as e:
157
  logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
158
  raise ElevenLabsError(
159
  message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
160
  original_exception=e,
161
+ )
 
 
 
 
 
 
 
 
 
 
 
 
src/integrations/hume_api.py CHANGED
@@ -90,6 +90,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
90
  voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
91
 
92
  Returns:
 
93
  bytes: The raw binary audio data for playback.
94
 
95
  Raises:
@@ -121,7 +122,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
121
  if response.headers.get('Content-Type', '').startswith('audio/'):
122
  audio = response.content # Raw binary audio data
123
  logger.info(f'Received audio data from Hume ({len(audio)} bytes).')
124
- return audio
125
 
126
  raise HumeError(f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}')
127
 
@@ -132,7 +133,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
132
  original_exception=e,
133
  )
134
 
135
- def get_random_hume_voice_names() -> Tuple[str, str]:
136
  """
137
  Get two random Hume voice names.
138
 
 
90
  voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
91
 
92
  Returns:
93
+ voice_name: The name of the voice used for speech synthesis.
94
  bytes: The raw binary audio data for playback.
95
 
96
  Raises:
 
122
  if response.headers.get('Content-Type', '').startswith('audio/'):
123
  audio = response.content # Raw binary audio data
124
  logger.info(f'Received audio data from Hume ({len(audio)} bytes).')
125
+ return voice_name, audio
126
 
127
  raise HumeError(f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}')
128
 
 
133
  original_exception=e,
134
  )
135
 
136
+ def get_random_hume_voice_names() -> Tuple[HumeVoiceName, HumeVoiceName]:
137
  """
138
  Get two random Hume voice names.
139