Commit 8047063 · zach committed
Parent(s): bc5091e

Update UI to compare Hume vs Elevenlabs 50% of the time, and Hume vs Hume 50% of the time
Files changed:
- src/app.py +71 -53
- src/constants.py +11 -6
- src/integrations/__init__.py +2 -2
- src/integrations/elevenlabs_api.py +29 -10
- src/integrations/hume_api.py +26 -19
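In outline, the comparison flow this commit introduces looks like the following condensed, standalone sketch (synth_hume and synth_elevenlabs are hypothetical stand-ins for the real integration calls in src/integrations; the voice names are illustrative):

import random

# Stand-ins for text_to_speech_with_hume / text_to_speech_with_elevenlabs.
def synth_hume(text: str, voice: str) -> bytes: return f'<hume:{voice}>'.encode()
def synth_elevenlabs(text: str, voice: str) -> bytes: return f'<elevenlabs:{voice}>'.encode()

HUME_AI, ELEVENLABS = 'Hume', 'ElevenLabs'
OPTION_A, OPTION_B = 'Option A', 'Option B'
text = 'Once upon a time...'

# The first synthesis is always Hume; the second is a coin flip between
# ElevenLabs and a second Hume voice (the flip is hard-coded to False in this commit).
compare_hume_with_elevenlabs = random.random() < 0.5

audio_a, provider_a = synth_hume(text, 'ITO'), HUME_AI
if compare_hume_with_elevenlabs:
    audio_b, provider_b = synth_elevenlabs(text, '21m00Tcm4TlvDq8ikWAM'), ELEVENLABS
else:
    audio_b, provider_b = synth_hume(text, 'KORA'), HUME_AI

# Shuffle so the UI's Option A / Option B ordering doesn't reveal the provider;
# the mapping is kept in state so the vote handler can attribute the result.
options = [(audio_a, provider_a), (audio_b, provider_b)]
random.shuffle(options)
options_map = {OPTION_A: options[0][1], OPTION_B: options[1][1]}
print(options_map)  # e.g. {'Option A': 'ElevenLabs', 'Option B': 'Hume'}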
src/app.py CHANGED
@@ -19,23 +19,27 @@ import gradio as gr
 # Local Application Imports
 from src.config import logger
 from src.constants import (
+    ELEVENLABS,
+    HUME_AI,
+    OPTION_A,
+    OPTION_B,
     PROMPT_MAX_LENGTH,
     PROMPT_MIN_LENGTH,
     SAMPLE_PROMPTS,
     TROPHY_EMOJI,
     UNKNOWN_PROVIDER,
+    VOTE_FOR_OPTION_A,
+    VOTE_FOR_OPTION_B,
 )
 from src.integrations import (
     AnthropicError,
     ElevenLabsError,
     generate_text_with_claude,
+    get_random_elevenlabs_voice_id,
+    get_random_hume_voice_names,
     HumeError,
     text_to_speech_with_elevenlabs,
-    text_to_speech_with_hume
+    text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
 from src.utils import truncate_text, validate_prompt_length
@@ -76,7 +80,9 @@ def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
 
 def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
     """
+    Synthesizes two text to speech outputs and loads the two audio players in the UI with the output audio.
+    - 50% of the time one Hume tts output and one Elevenlabs output will be synthesized.
+    - 50% of the time two Hume tts outputs will be synthesized.
 
     Args:
         prompt (str): The original prompt.
@@ -96,25 +102,37 @@ def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.upda
         logger.warning('Skipping text-to-speech due to empty text.')
         return gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
+    # compare_hume_with_elevenlabs = random.random() < 0.5
+    compare_hume_with_elevenlabs = False
+
+    elevenlabs_voice = get_random_elevenlabs_voice_id()
+    hume_voice_a, hume_voice_b = get_random_hume_voice_names()  # We get two Hume voices preemptively in case we compare Hume with Hume
+
     try:
         with ThreadPoolExecutor(max_workers=2) as executor:
+            provider_a = HUME_AI
+            future_audio_a = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_a)
+
+            if compare_hume_with_elevenlabs:
+                provider_b = ELEVENLABS
+                future_audio_b = executor.submit(text_to_speech_with_elevenlabs, generated_text, elevenlabs_voice)
+            else:
+                provider_b = HUME_AI
+                future_audio_b = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_b)
 
-            elevenlabs_audio = future_elevenlabs.result()
+        audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
 
+        logger.info(f'TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes')
+        options = [(audio_a, provider_a), (audio_b, provider_b)]
         random.shuffle(options)
+        option_a_audio, option_b_audio = options[0][0], options[1][0]
+        options_map = { OPTION_A: options[0][1], OPTION_B: options[1][1] }
 
         return (
+            gr.update(value=option_a_audio, autoplay=True),
+            gr.update(value=option_b_audio),
             options_map,
+            option_b_audio,
        )
     except ElevenLabsError as ee:
         logger.error(f'ElevenLabsError while synthesizing speech from text: {str(ee)}')
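Both synthesis calls are network-bound, so submitting them to a two-worker executor means overall latency tracks the slower provider rather than the sum of both round-trips. A standalone sketch, with a hypothetical fake_tts standing in for the real API calls:

import time
from concurrent.futures import ThreadPoolExecutor

def fake_tts(delay: float) -> bytes:
    """Hypothetical stand-in for a TTS HTTP round-trip."""
    time.sleep(delay)
    return b'audio'

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=2) as executor:
    future_a = executor.submit(fake_tts, 1.0)
    future_b = executor.submit(fake_tts, 1.0)
    audio_a, audio_b = future_a.result(), future_b.result()  # blocks until both finish
print(f'elapsed: {time.perf_counter() - start:.1f}s')  # ~1.0s, not ~2.0s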
@@ -145,16 +163,16 @@ def vote(vote_submitted: bool, option_mapping: dict, selected_button: str) -> Tu
     if not option_mapping or vote_submitted:
         return gr.skip(), gr.skip(), gr.skip()
 
+    is_option_a = selected_button == VOTE_FOR_OPTION_A
+    selected_option, other_option = (OPTION_A, OPTION_B) if is_option_a else (OPTION_B, OPTION_A)
     selected_provider = option_mapping.get(selected_option, UNKNOWN_PROVIDER)
     other_provider = option_mapping.get(other_option, UNKNOWN_PROVIDER)
 
     return (
         True,
+        gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary') if is_option_a
         else gr.update(value=other_provider, variant='secondary'),
+        gr.update(value=other_provider, variant='secondary') if is_option_a
         else gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary'),
     )
 
@@ -164,10 +182,10 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
 
     Returns:
         A tuple of updates for:
+        - option_a_audio_player (clear audio)
+        - option_b_audio_player (clear audio)
+        - vote_button_a (disable and reset button text)
+        - vote_button_b (disable and reset button text)
         - option_mapping_state (reset option map state)
         - option2_audio_state (reset option 2 audio state)
         - vote_submitted_state (reset submitted vote state)
@@ -175,8 +193,8 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
     return (
         gr.update(value=None),
         gr.update(value=None),
+        gr.update(interactive=False, value=VOTE_FOR_OPTION_A, variant='secondary'),
+        gr.update(interactive=False, value=VOTE_FOR_OPTION_B, variant='secondary'),
         None,
         None,
         False,
@@ -204,7 +222,7 @@ def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Butt
         max_length=PROMPT_MAX_LENGTH,
         show_copy_button=True,
     )
-    generate_button = gr.Button('Generate', variant='primary')
+    generate_button = gr.Button('Generate text', variant='primary')
     return instructions, sample_prompt_dropdown, prompt_input, generate_button
 
 
@@ -212,7 +230,7 @@ def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, g
     """Builds the output section including generated text, audio players, and vote buttons."""
     with gr.Column(variant='compact'):
         generated_text = gr.Textbox(
+            label='Text',
             interactive=False,
             autoscroll=False,
             lines=5,
@@ -221,12 +239,12 @@ def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, g
             show_copy_button=True,
         )
     with gr.Row(equal_height=True):
+        option_a_audio_player = gr.Audio(label=OPTION_A, type='filepath', interactive=False)
+        option_b_audio_player = gr.Audio(label=OPTION_B, type='filepath', interactive=False)
     with gr.Row():
+        vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
+        vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
+    return generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b
 
 
 def build_gradio_interface() -> gr.Blocks:
@@ -250,7 +268,7 @@ def build_gradio_interface() -> gr.Blocks:
         instructions, sample_prompt_dropdown, prompt_input, generate_button = build_input_section()
 
         # Build output section
+        generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b = build_output_section()
 
         # UI state components
         option_mapping_state = gr.State()  # Track option map (option 1 and option 2 are randomized)
@@ -280,10 +298,10 @@ def build_gradio_interface() -> gr.Blocks:
             fn=reset_ui,
             inputs=[],
             outputs=[
+                option_a_audio_player,
+                option_b_audio_player,
+                vote_button_a,
+                vote_button_b,
                 option_mapping_state,
                 option2_audio_state,
                 vote_submitted_state,
@@ -296,10 +314,10 @@ def build_gradio_interface() -> gr.Blocks:
             fn=text_to_speech,
             inputs=[prompt_input, generated_text],
             outputs=[
+                option_a_audio_player,
+                option_b_audio_player,
                 option_mapping_state,
+                option2_audio_state,
             ],
         ).then(
             fn=lambda: gr.update(interactive=True),  # Re-enable the button
@@ -308,33 +326,33 @@ def build_gradio_interface() -> gr.Blocks:
         )
 
         # Vote button click handlers
+        vote_button_a.click(
             fn=vote,
+            inputs=[vote_submitted_state, option_mapping_state, vote_button_a],
+            outputs=[vote_submitted_state, vote_button_a, vote_button_b],
         )
+        vote_button_b.click(
             fn=vote,
+            inputs=[vote_submitted_state, option_mapping_state, vote_button_b],
+            outputs=[vote_submitted_state, vote_button_a, vote_button_b],
         )
 
         # Auto-play second audio after first finishes (workaround for playing audio back-to-back)
+        option_a_audio_player.stop(
             fn=lambda _: gr.update(value=None),
             inputs=[],
+            outputs=[option_b_audio_player],
         ).then(
             fn=lambda audio: gr.update(value=audio, autoplay=True),
             inputs=[option2_audio_state],
+            outputs=[option_b_audio_player],
         )
 
         # Enable voting after second audio option playback finishes
+        option_b_audio_player.stop(
             fn=lambda _: (gr.update(interactive=True), gr.update(interactive=True), gr.update(autoplay=False)),
             inputs=[],
+            outputs=[vote_button_a, vote_button_b, option_b_audio_player],
        )
 
         logger.debug('Gradio interface built successfully')
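One detail worth noting in the vote wiring above: the buttons themselves are passed as inputs, and a gr.Button used as an input supplies its label string as its value, which is how vote can compare selected_button == VOTE_FOR_OPTION_A. A minimal standalone sketch of that behavior (hypothetical labels, assuming standard Gradio Blocks semantics):

import gradio as gr

with gr.Blocks() as demo:
    button = gr.Button('Vote for option A')
    result = gr.Textbox()
    # The button is both the trigger and an input: the handler receives its label.
    button.click(fn=lambda label: f'clicked: {label}', inputs=[button], outputs=[result])

# demo.launch()  # clicking the button would display 'clicked: Vote for option A'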
src/constants.py CHANGED
@@ -5,14 +5,19 @@ This module defines global constants used throughout the project.
 """
 
 # UI constants
+HUME_AI: str = 'Hume'
+ELEVENLABS: str = 'ElevenLabs'
+UNKNOWN_PROVIDER: str = 'Unknown'
+
 PROMPT_MIN_LENGTH: int = 10
 PROMPT_MAX_LENGTH: int = 400
+
+OPTION_A: str = 'Option A'
+OPTION_B: str = 'Option B'
+TROPHY_EMOJI: str = '🏆'
+VOTE_FOR_OPTION_A: str = 'Vote for option A'
+VOTE_FOR_OPTION_B: str = 'Vote for option B'
+
 
 # A collection of pre-defined prompts categorized by theme, used to provide users with
 # inspiration for generating creative text for expressive TTS.
src/integrations/__init__.py CHANGED
@@ -1,3 +1,3 @@
 from .anthropic_api import generate_text_with_claude, AnthropicError
-from .elevenlabs_api import text_to_speech_with_elevenlabs, ElevenLabsError
-from .hume_api import text_to_speech_with_hume, HumeError
+from .elevenlabs_api import text_to_speech_with_elevenlabs, get_random_elevenlabs_voice_id, ElevenLabsError
+from .hume_api import text_to_speech_with_hume, get_random_hume_voice_names, HumeError
src/integrations/elevenlabs_api.py CHANGED
@@ -22,7 +22,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import Optional
+from typing import Literal, Optional
 
 # Third-Party Library Imports
 from elevenlabs import ElevenLabs
@@ -32,6 +32,12 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_lo
 from src.config import logger
 from src.utils import validate_env_var, truncate_text
 
+ElevenlabsVoiceId = Literal[
+    "pNInz6obpgDQGcFmaJgB",
+    "ErXwobaYiN019PkySvjV",
+    "21m00Tcm4TlvDq8ikWAM",
+    "XrExE9yKIg1WjnnlVkGX"
+]
 
 @dataclass(frozen=True)
 class ElevenLabsConfig:
@@ -39,7 +45,7 @@ class ElevenLabsConfig:
     api_key: str = validate_env_var('ELEVENLABS_API_KEY')
     model_id: str = 'eleven_multilingual_v2'  # ElevenLab's most emotionally expressive model
     output_format: str = 'mp3_44100_128'  # Output format of the generated audio
+    voice_ids: list[ElevenlabsVoiceId] = (
         'pNInz6obpgDQGcFmaJgB',  # Adam
         'ErXwobaYiN019PkySvjV',  # Antoni
         '21m00Tcm4TlvDq8ikWAM',  # Rachel
@@ -54,8 +60,8 @@ class ElevenLabsConfig:
             raise ValueError('ElevenLabs Model ID is not set.')
         if not self.output_format:
             raise ValueError('ElevenLabs Output Format is not set.')
+        if not self.voice_ids:
+            raise ValueError('ElevenLabs Voice IDs are not set.')
 
     @property
     def client(self) -> ElevenLabs:
@@ -72,7 +78,7 @@ class ElevenLabsConfig:
         """
         Randomly selects a voice ID from the top default voices, ensuring different voices across calls.
         """
+        return random.choice(self.voice_ids)
 
 
 class ElevenLabsError(Exception):
@@ -93,12 +99,13 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_elevenlabs(text: str) -> bytes:
+def text_to_speech_with_elevenlabs(text: str, voice_id: ElevenlabsVoiceId) -> bytes:
     """
+    Synthesizes text to speech using the ElevenLabs TTS API.
 
     Args:
+        text (str): The text to be synthesized to speech.
+        voice_id (str): The voice ID for Elevenlabs to use when synthesizing speech.
 
     Returns:
         bytes: The raw binary audio data for playback.
@@ -112,7 +119,7 @@ def text_to_speech_with_elevenlabs(text: str) -> bytes:
     # Synthesize speech using the ElevenLabs SDK
     audio_iterator = elevenlabs_config.client.text_to_speech.convert(
         text=text,
+        voice_id=voice_id,
         model_id=elevenlabs_config.model_id,
         output_format=elevenlabs_config.output_format,
     )
@@ -138,4 +145,16 @@ def text_to_speech_with_elevenlabs(text: str) -> bytes:
     raise ElevenLabsError(
         message=f'Failed to synthesize speech from text with ElevenLabs: {e}',
         original_exception=e,
-    )
+    )
+
+def get_random_elevenlabs_voice_id() -> ElevenlabsVoiceId:
+    """
+    Get a random Elevenlabs voice ID.
+
+    Voices:
+        - pNInz6obpgDQGcFmaJgB (Adam)
+        - ErXwobaYiN019PkySvjV (Antoni)
+        - 21m00Tcm4TlvDq8ikWAM (Rachel)
+        - XrExE9yKIg1WjnnlVkGX (Matilda)
+    """
+    return elevenlabs_config.random_voice_id
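For reference, a hypothetical usage of the two new ElevenLabs helpers (assumes ELEVENLABS_API_KEY is set in the environment; the voice ID in the comment is illustrative):

from src.integrations import get_random_elevenlabs_voice_id, text_to_speech_with_elevenlabs

voice_id = get_random_elevenlabs_voice_id()  # e.g. '21m00Tcm4TlvDq8ikWAM' (Rachel)
audio = text_to_speech_with_elevenlabs('Hello there!', voice_id)

# output_format is mp3_44100_128, so the raw bytes are a playable MP3
with open('sample.mp3', 'wb') as f:
    f.write(audio)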
src/integrations/hume_api.py CHANGED
@@ -22,7 +22,7 @@ Functions:
 from dataclasses import dataclass
 import logging
 import random
-from typing import List, Optional
+from typing import List, Literal, Optional, Tuple
 
 # Third-Party Library Imports
 import requests
@@ -33,12 +33,14 @@ from src.config import logger
 from src.utils import validate_env_var, truncate_text
 
 
+HumeVoiceName = Literal['ITO', 'KORA', 'STELLA', 'DACHER']
+
 @dataclass(frozen=True)
 class HumeConfig:
     """Immutable configuration for interacting with the Hume TTS API."""
-    tts_endpoint_url: str = 'https://api.hume.ai/v0/tts'
     api_key: str = validate_env_var('HUME_API_KEY')
+    tts_endpoint_url: str = 'https://api.hume.ai/v0/tts'
+    voice_names: List[HumeVoiceName] = ('ITO', 'KORA', 'STELLA', 'DACHER')
     audio_format: str = 'wav'
     headers: dict = None
@@ -46,8 +48,10 @@ class HumeConfig:
         # Validate required attributes
         if not self.api_key:
             raise ValueError('Hume API key is not set.')
+        if not self.tts_endpoint_url:
+            raise ValueError('Hume TTS endpoint URL is not set.')
+        if not self.voice_names:
+            raise ValueError('Hume voice names list is not set.')
         if not self.audio_format:
             raise ValueError('Hume audio format is not set.')
@@ -57,16 +61,6 @@ class HumeConfig:
             'Content-Type': 'application/json',
         })
 
-    @property
-    def random_voice(self) -> str:
-        """
-        Randomly selects a voice from the available voices.
-
-        Returns:
-            str: A randomly chosen voice name.
-        """
-        return random.choice(self.voices)
-
 
 class HumeError(Exception):
     """Custom exception for errors related to the Hume TTS API."""
@@ -86,13 +80,14 @@ hume_config = HumeConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True
 )
-def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
+def text_to_speech_with_hume(prompt: str, text: str, voice_name: HumeVoiceName) -> bytes:
     """
+    Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
 
     Args:
         prompt (str): The original user prompt (for debugging).
         text (str): The generated text to be converted to speech.
+        voice_name (HumeVoiceName): Name of the voice Hume will use when synthesizing speech.
 
     Returns:
         bytes: The raw binary audio data for playback.
@@ -105,7 +100,7 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
     request_body = {
         'text': text,
         'voice': {
+            'name': voice_name
         },
     }
@@ -135,4 +130,16 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
     raise HumeError(
         message=f'Failed to synthesize speech from text with Hume: {e}',
         original_exception=e,
-    )
+    )
+
+def get_random_hume_voice_names() -> Tuple[str, str]:
+    """
+    Get two random Hume voice names.
+
+    Voices:
+        - ITO
+        - KORA
+        - STELLA
+        - DACHER
+    """
+    return tuple(random.sample(hume_config.voice_names, 2))
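And a hypothetical usage of the Hume helpers (assumes HUME_API_KEY is set; random.sample picks without replacement, so the two returned names are always distinct voices):

from src.integrations import get_random_hume_voice_names, text_to_speech_with_hume

prompt = 'Write a cheerful greeting.'      # illustrative values
text = 'Hello there, wonderful world!'

voice_a, voice_b = get_random_hume_voice_names()  # e.g. ('STELLA', 'ITO')
audio_a = text_to_speech_with_hume(prompt, text, voice_a)
audio_b = text_to_speech_with_hume(prompt, text, voice_b)  # same text, different voice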