Spaces:
Running
Running
zach
committed on
Commit
·
36b195f
1
Parent(s):
514de3d
Restore encapsulation for ElevenLabs integration, update TTS functions to return the voice name in addition to the audio
Browse files
- src/app.py +4 -6
- src/integrations/__init__.py +1 -1
- src/integrations/elevenlabs_api.py +46 -45
- src/integrations/hume_api.py +3 -2
src/app.py
CHANGED
@@ -35,7 +35,6 @@ from src.integrations import (
|
|
35 |
AnthropicError,
|
36 |
ElevenLabsError,
|
37 |
generate_text_with_claude,
|
38 |
-
get_random_elevenlabs_voice_id,
|
39 |
get_random_hume_voice_names,
|
40 |
HumeError,
|
41 |
text_to_speech_with_elevenlabs,
|
@@ -106,9 +105,7 @@ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[g
|
|
106 |
# If not using generated text, then only compare Hume to Hume
|
107 |
compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
|
108 |
|
109 |
-
|
110 |
-
# Get two Hume voices preemptively in case we compare Hume with Hume
|
111 |
-
# to remove chance synthesizing speech twice with the same voice
|
112 |
hume_voice_a, hume_voice_b = get_random_hume_voice_names()
|
113 |
|
114 |
try:
|
@@ -118,12 +115,13 @@ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[g
|
|
118 |
|
119 |
if compare_hume_with_elevenlabs:
|
120 |
provider_b = ELEVENLABS
|
121 |
-
future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text
|
122 |
else:
|
123 |
provider_b = HUME_AI
|
124 |
future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
|
125 |
|
126 |
-
|
|
|
127 |
|
128 |
logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
|
129 |
options = [(audio_a, provider_a), (audio_b, provider_b)]
|
|
|
35 |
AnthropicError,
|
36 |
ElevenLabsError,
|
37 |
generate_text_with_claude,
|
|
|
38 |
get_random_hume_voice_names,
|
39 |
HumeError,
|
40 |
text_to_speech_with_elevenlabs,
|
|
|
105 |
# If not using generated text, then only compare Hume to Hume
|
106 |
compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
|
107 |
|
108 |
+
# Pre-select two Hume voices in case we compare Hume to Hume, so that the same voice is never selected twice.
|
|
|
|
|
109 |
hume_voice_a, hume_voice_b = get_random_hume_voice_names()
|
110 |
|
111 |
try:
|
|
|
115 |
|
116 |
if compare_hume_with_elevenlabs:
|
117 |
provider_b = ELEVENLABS
|
118 |
+
future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text)
|
119 |
else:
|
120 |
provider_b = HUME_AI
|
121 |
future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
|
122 |
|
123 |
+
voice_a, audio_a = future_audio_a.result()
|
124 |
+
voice_b, audio_b = future_audio_b.result()
|
125 |
|
126 |
logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
|
127 |
options = [(audio_a, provider_a), (audio_b, provider_b)]
|
src/integrations/__init__.py
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
from .anthropic_api import generate_text_with_claude, AnthropicError
|
2 |
-
from .elevenlabs_api import text_to_speech_with_elevenlabs,
|
3 |
from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
|
|
|
1 |
from .anthropic_api import generate_text_with_claude, AnthropicError
|
2 |
+
from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
|
3 |
from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
|
src/integrations/elevenlabs_api.py
CHANGED
@@ -20,9 +20,10 @@ Functions:
|
|
20 |
|
21 |
# Standard Library Imports
|
22 |
from dataclasses import dataclass
|
|
|
23 |
import logging
|
24 |
import random
|
25 |
-
from typing import Literal, Optional
|
26 |
|
27 |
# Third-Party Library Imports
|
28 |
from elevenlabs import ElevenLabs
|
@@ -30,27 +31,34 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_lo
|
|
30 |
|
31 |
# Local Application Imports
|
32 |
from src.config import logger
|
33 |
-
from src.utils import validate_env_var
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
ElevenlabsVoiceId = Literal[
|
36 |
-
"pNInz6obpgDQGcFmaJgB",
|
37 |
-
"ErXwobaYiN019PkySvjV",
|
38 |
-
"21m00Tcm4TlvDq8ikWAM",
|
39 |
-
"XrExE9yKIg1WjnnlVkGX"
|
40 |
-
]
|
41 |
|
42 |
@dataclass(frozen=True)
|
43 |
class ElevenLabsConfig:
|
44 |
"""Immutable configuration for interacting with the ElevenLabs TTS API."""
|
45 |
api_key: str = validate_env_var('ELEVENLABS_API_KEY')
|
46 |
-
model_id: str = 'eleven_multilingual_v2'
|
47 |
-
output_format: str = 'mp3_44100_128'
|
48 |
-
voice_ids: list[ElevenlabsVoiceId] = (
|
49 |
-
'pNInz6obpgDQGcFmaJgB', # Adam
|
50 |
-
'ErXwobaYiN019PkySvjV', # Antoni
|
51 |
-
'21m00Tcm4TlvDq8ikWAM', # Rachel
|
52 |
-
'XrExE9yKIg1WjnnlVkGX', # Matilda
|
53 |
-
)
|
54 |
|
55 |
def __post_init__(self):
|
56 |
# Validate that required attributes are set
|
@@ -60,8 +68,6 @@ class ElevenLabsConfig:
|
|
60 |
raise ValueError('ElevenLabs Model ID is not set.')
|
61 |
if not self.output_format:
|
62 |
raise ValueError('ElevenLabs Output Format is not set.')
|
63 |
-
if not self.voice_ids:
|
64 |
-
raise ValueError('ElevenLabs Voice IDs are not set.')
|
65 |
|
66 |
@property
|
67 |
def client(self) -> ElevenLabs:
|
@@ -74,11 +80,14 @@ class ElevenLabsConfig:
|
|
74 |
return ElevenLabs(api_key=self.api_key)
|
75 |
|
76 |
@property
|
77 |
-
def
|
78 |
"""
|
79 |
-
|
|
|
|
|
|
|
80 |
"""
|
81 |
-
return random.choice(
|
82 |
|
83 |
|
84 |
class ElevenLabsError(Exception):
|
@@ -99,38 +108,42 @@ elevenlabs_config = ElevenLabsConfig()
|
|
99 |
after=after_log(logger, logging.DEBUG),
|
100 |
reraise=True
|
101 |
)
|
102 |
-
def text_to_speech_with_elevenlabs(text: str
|
103 |
"""
|
104 |
Synthesizes text to speech using the ElevenLabs TTS API.
|
105 |
|
106 |
Args:
|
107 |
text (str): The text to be synthesized to speech.
|
108 |
-
voice_id (str): The voice ID for Elevenlabs to use when synthesizing speech.
|
109 |
|
110 |
Returns:
|
111 |
-
bytes:
|
|
|
112 |
|
113 |
Raises:
|
114 |
ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
|
115 |
"""
|
116 |
logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')
|
117 |
|
|
|
|
|
|
|
|
|
118 |
try:
|
119 |
# Synthesize speech using the ElevenLabs SDK
|
120 |
audio_iterator = elevenlabs_config.client.text_to_speech.convert(
|
121 |
text=text,
|
122 |
-
voice_id=voice_id,
|
123 |
model_id=elevenlabs_config.model_id,
|
124 |
output_format=elevenlabs_config.output_format,
|
125 |
)
|
126 |
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
129 |
logger.error('Invalid audio iterator response.')
|
130 |
-
raise ElevenLabsError('Invalid audio iterator received from ElevenLabs API.')
|
131 |
-
|
132 |
-
# Combine chunks into a single bytes object
|
133 |
-
audio = b''.join(chunk for chunk in audio_iterator)
|
134 |
|
135 |
# Validate audio
|
136 |
if not audio:
|
@@ -138,23 +151,11 @@ def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> by
|
|
138 |
raise ElevenLabsError('Empty audio data received from ElevenLabs API.')
|
139 |
|
140 |
logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
|
141 |
-
return audio
|
142 |
|
143 |
except Exception as e:
|
144 |
logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
|
145 |
raise ElevenLabsError(
|
146 |
message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
|
147 |
original_exception=e,
|
148 |
-
)
|
149 |
-
|
150 |
-
def get_random_elevenlabs_voice_id() -> ElevenlabsVoiceId:
|
151 |
-
"""
|
152 |
-
Get a random Elevenlabs voice ID.
|
153 |
-
|
154 |
-
Voices:
|
155 |
-
- pNInz6obpgDQGcFmaJgB (Adam)
|
156 |
-
- ErXwobaYiN019PkySvjV (Antoni)
|
157 |
-
- 21m00Tcm4TlvDq8ikWAM (Rachel)
|
158 |
-
- XrExE9yKIg1WjnnlVkGX (Matilda)
|
159 |
-
"""
|
160 |
-
return elevenlabs_config.random_voice_id
|
|
|
20 |
|
21 |
# Standard Library Imports
|
22 |
from dataclasses import dataclass
|
23 |
+
from enum import Enum
|
24 |
import logging
|
25 |
import random
|
26 |
+
from typing import Literal, Optional, Tuple
|
27 |
|
28 |
# Third-Party Library Imports
|
29 |
from elevenlabs import ElevenLabs
|
|
|
31 |
|
32 |
# Local Application Imports
|
33 |
from src.config import logger
|
34 |
+
from src.utils import validate_env_var
|
35 |
+
|
36 |
+
|
37 |
+
ElevenlabsVoiceName = Literal['Adam', 'Antoni', 'Rachel', 'Matilda']
|
38 |
+
|
39 |
+
class ElevenLabsVoice(Enum):
|
40 |
+
ADAM = ('Adam', 'pNInz6obpgDQGcFmaJgB')
|
41 |
+
ANTONI = ('Antoni', 'ErXwobaYiN019PkySvjV')
|
42 |
+
RACHEL = ('Rachel', '21m00Tcm4TlvDq8ikWAM')
|
43 |
+
MATILDA = ('Matilda', 'XrExE9yKIg1WjnnlVkGX')
|
44 |
+
|
45 |
+
@property
|
46 |
+
def voice_name(self) -> ElevenlabsVoiceName:
|
47 |
+
"""Returns the display name of the voice."""
|
48 |
+
return self.value[0]
|
49 |
+
|
50 |
+
@property
|
51 |
+
def voice_id(self) -> str:
|
52 |
+
"""Returns the ElevenLabs voice ID."""
|
53 |
+
return self.value[1]
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
@dataclass(frozen=True)
|
57 |
class ElevenLabsConfig:
|
58 |
"""Immutable configuration for interacting with the ElevenLabs TTS API."""
|
59 |
api_key: str = validate_env_var('ELEVENLABS_API_KEY')
|
60 |
+
model_id: str = 'eleven_multilingual_v2' # ElevenLab's most emotionally expressive model
|
61 |
+
output_format: str = 'mp3_44100_128' # Output format of the generated audio
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
def __post_init__(self):
|
64 |
# Validate that required attributes are set
|
|
|
68 |
raise ValueError('ElevenLabs Model ID is not set.')
|
69 |
if not self.output_format:
|
70 |
raise ValueError('ElevenLabs Output Format is not set.')
|
|
|
|
|
71 |
|
72 |
@property
|
73 |
def client(self) -> ElevenLabs:
|
|
|
80 |
return ElevenLabs(api_key=self.api_key)
|
81 |
|
82 |
@property
|
83 |
+
def random_voice(self) -> ElevenLabsVoice:
|
84 |
"""
|
85 |
+
Selects a random ElevenLabs voice.
|
86 |
+
|
87 |
+
Returns:
|
88 |
+
ElevenLabsVoice: A randomly selected voice enum member.
|
89 |
"""
|
90 |
+
return random.choice(list(ElevenLabsVoice))
|
91 |
|
92 |
|
93 |
class ElevenLabsError(Exception):
|
|
|
108 |
after=after_log(logger, logging.DEBUG),
|
109 |
reraise=True
|
110 |
)
|
111 |
+
def text_to_speech_with_elevenlabs(text: str) -> Tuple[ElevenlabsVoiceName, bytes]:
|
112 |
"""
|
113 |
Synthesizes text to speech using the ElevenLabs TTS API.
|
114 |
|
115 |
Args:
|
116 |
text (str): The text to be synthesized to speech.
|
|
|
117 |
|
118 |
Returns:
|
119 |
+
Tuple[ElevenlabsVoiceName, bytes]: A tuple containing the voice name used for speech synthesis
|
120 |
+
and the raw binary audio data for playback.
|
121 |
|
122 |
Raises:
|
123 |
ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
|
124 |
"""
|
125 |
logger.debug(f'Synthesizing speech from text with ElevenLabs. Text length: {len(text)} characters.')
|
126 |
|
127 |
+
# Get a random voice as an enum member.
|
128 |
+
voice = elevenlabs_config.random_voice
|
129 |
+
logger.debug(f"Selected voice: {voice.voice_name}")
|
130 |
+
|
131 |
try:
|
132 |
# Synthesize speech using the ElevenLabs SDK
|
133 |
audio_iterator = elevenlabs_config.client.text_to_speech.convert(
|
134 |
text=text,
|
135 |
+
voice_id=voice.voice_id,
|
136 |
model_id=elevenlabs_config.model_id,
|
137 |
output_format=elevenlabs_config.output_format,
|
138 |
)
|
139 |
|
140 |
+
# Attempt to combine chunks into a single bytes object.
|
141 |
+
# If audio_iterator is not iterable or invalid, an exception will be raised.
|
142 |
+
try:
|
143 |
+
audio = b''.join(chunk for chunk in audio_iterator)
|
144 |
+
except Exception as iter_error:
|
145 |
logger.error('Invalid audio iterator response.')
|
146 |
+
raise ElevenLabsError('Invalid audio iterator received from ElevenLabs API.') from iter_error
|
|
|
|
|
|
|
147 |
|
148 |
# Validate audio
|
149 |
if not audio:
|
|
|
151 |
raise ElevenLabsError('Empty audio data received from ElevenLabs API.')
|
152 |
|
153 |
logger.info(f'Received ElevenLabs audio ({len(audio)} bytes).')
|
154 |
+
return voice.voice_name, audio
|
155 |
|
156 |
except Exception as e:
|
157 |
logger.exception(f'Error synthesizing speech from text with Elevenlabs: {e}')
|
158 |
raise ElevenLabsError(
|
159 |
message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
|
160 |
original_exception=e,
|
161 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/integrations/hume_api.py
CHANGED
@@ -90,6 +90,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
|
|
90 |
voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
|
91 |
|
92 |
Returns:
|
|
|
93 |
bytes: The raw binary audio data for playback.
|
94 |
|
95 |
Raises:
|
@@ -121,7 +122,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
|
|
121 |
if response.headers.get('Content-Type', '').startswith('audio/'):
|
122 |
audio = response.content # Raw binary audio data
|
123 |
logger.info(f'Received audio data from Hume ({len(audio)} bytes).')
|
124 |
-
return audio
|
125 |
|
126 |
raise HumeError(f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}')
|
127 |
|
@@ -132,7 +133,7 @@ def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName)
|
|
132 |
original_exception=e,
|
133 |
)
|
134 |
|
135 |
-
def get_random_hume_voice_names() -> Tuple[
|
136 |
"""
|
137 |
Get two random Hume voice names.
|
138 |
|
|
|
90 |
voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
|
91 |
|
92 |
Returns:
|
93 |
+
voice_name: The name of the voice used for speech synthesis.
|
94 |
bytes: The raw binary audio data for playback.
|
95 |
|
96 |
Raises:
|
|
|
122 |
if response.headers.get('Content-Type', '').startswith('audio/'):
|
123 |
audio = response.content # Raw binary audio data
|
124 |
logger.info(f'Received audio data from Hume ({len(audio)} bytes).')
|
125 |
+
return voice_name, audio
|
126 |
|
127 |
raise HumeError(f'Unexpected Content-Type: {response.headers.get("Content-Type", "Unknown")}')
|
128 |
|
|
|
133 |
original_exception=e,
|
134 |
)
|
135 |
|
136 |
+
def get_random_hume_voice_names() -> Tuple[HumeVoiceName, HumeVoiceName]:
|
137 |
"""
|
138 |
Get two random Hume voice names.
|
139 |
|