Spaces:
Running
Running
zach
committed on
Commit
·
ba3994f
1
Parent(s):
2f050a8
Move business logic out of app.py; refactor code to call Hume once, specifying 2 generations, instead of calling Hume twice
Browse files
- src/app.py +70 -54
- src/integrations/elevenlabs_api.py +10 -7
- src/integrations/hume_api.py +79 -21
- src/types.py +20 -1
- src/utils.py +107 -8
src/app.py
CHANGED
@@ -30,8 +30,12 @@ from src.integrations import (
|
|
30 |
text_to_speech_with_hume,
|
31 |
)
|
32 |
from src.theme import CustomTheme
|
33 |
-
from src.types import ComparisonType, OptionMap, VotingResults
|
34 |
-
from src.utils import
|
|
|
|
|
|
|
|
|
35 |
|
36 |
|
37 |
def generate_text(
|
@@ -73,73 +77,85 @@ def generate_text(
|
|
73 |
|
74 |
def text_to_speech(
|
75 |
character_description: str, text: str, generated_text_state: str
|
76 |
-
) -> Tuple[gr.update, gr.update, dict,
|
77 |
"""
|
78 |
-
Synthesizes two text
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
Args:
|
84 |
-
character_description (str): The
|
85 |
-
text (str): The text to
|
|
|
86 |
|
87 |
Returns:
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
Raises:
|
95 |
-
gr.Error:
|
96 |
"""
|
97 |
if not text:
|
98 |
logger.warning("Skipping text-to-speech due to empty text.")
|
99 |
raise gr.Error("Please generate or enter text to synthesize.")
|
100 |
|
101 |
-
#
|
102 |
-
provider_a = constants.HUME_AI
|
103 |
-
# If not using generated text, then only compare Hume to Hume
|
104 |
text_modified = text != generated_text_state
|
105 |
-
provider_b
|
106 |
-
constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
|
107 |
-
)
|
108 |
|
109 |
try:
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
|
|
143 |
|
144 |
return (
|
145 |
gr.update(value=option_a_audio, visible=True, autoplay=True),
|
|
|
30 |
text_to_speech_with_hume,
|
31 |
)
|
32 |
from src.theme import CustomTheme
|
33 |
+
from src.types import ComparisonType, Option, OptionMap, VotingResults
|
34 |
+
from src.utils import (
|
35 |
+
choose_providers,
|
36 |
+
create_shuffled_tts_options,
|
37 |
+
validate_character_description_length,
|
38 |
+
)
|
39 |
|
40 |
|
41 |
def generate_text(
|
|
|
77 |
|
78 |
def text_to_speech(
|
79 |
character_description: str, text: str, generated_text_state: str
|
80 |
+
) -> Tuple[gr.update, gr.update, dict, str, ComparisonType, str, str, bool, str, str]:
|
81 |
"""
|
82 |
+
Synthesizes two text-to-speech outputs, updates UI state components, and returns additional TTS metadata.
|
83 |
+
|
84 |
+
This function generates TTS outputs using different providers based on the input text and its modification
|
85 |
+
state. Depending on the selected providers, it may:
|
86 |
+
- Synthesize one Hume and one ElevenLabs output (50% chance), or
|
87 |
+
- Synthesize two Hume outputs (50% chance).
|
88 |
+
|
89 |
+
The outputs are processed and shuffled, and the corresponding UI components for two audio players are updated.
|
90 |
+
Additional metadata such as the generation IDs, comparison type, and state information are also returned.
|
91 |
|
92 |
Args:
|
93 |
+
character_description (str): The description of the character used for generating the voice.
|
94 |
+
text (str): The text content to be synthesized into speech.
|
95 |
+
generated_text_state (str): The previously generated text state, used to determine if the text has been modified.
|
96 |
|
97 |
Returns:
|
98 |
+
Tuple containing:
|
99 |
+
- gr.update: Update for the first audio player (with autoplay enabled).
|
100 |
+
- gr.update: Update for the second audio player.
|
101 |
+
- dict: A mapping of option constants to their corresponding TTS providers.
|
102 |
+
- str: The raw audio value (relative file path) for option B.
|
103 |
+
- ComparisonType: The comparison type between the selected TTS providers.
|
104 |
+
- str: Generation ID for option A.
|
105 |
+
- str: Generation ID for option B.
|
106 |
+
- bool: Flag indicating whether the text was modified.
|
107 |
+
- str: The original text that was synthesized.
|
108 |
+
- str: The original character description.
|
109 |
|
110 |
Raises:
|
111 |
+
gr.Error: If any API or unexpected errors occur during the TTS synthesis process.
|
112 |
"""
|
113 |
if not text:
|
114 |
logger.warning("Skipping text-to-speech due to empty text.")
|
115 |
raise gr.Error("Please generate or enter text to synthesize.")
|
116 |
|
117 |
+
# Select 2 TTS providers based on whether the text has been modified.
|
|
|
|
|
118 |
text_modified = text != generated_text_state
|
119 |
+
comparison_type, provider_a, provider_b = choose_providers(text_modified)
|
|
|
|
|
120 |
|
121 |
try:
|
122 |
+
if provider_b == constants.HUME_AI:
|
123 |
+
# If generating 2 Hume outputs, do so in a single API call
|
124 |
+
(
|
125 |
+
generation_id_a,
|
126 |
+
audio_a,
|
127 |
+
generation_id_b,
|
128 |
+
audio_b,
|
129 |
+
) = text_to_speech_with_hume(character_description, text, 2)
|
130 |
+
else:
|
131 |
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
132 |
+
# Generate a single Hume output
|
133 |
+
future_audio_a = executor.submit(
|
134 |
+
text_to_speech_with_hume, character_description, text
|
135 |
+
)
|
136 |
+
# Generate a second TTS output from the second provider
|
137 |
+
match provider_b:
|
138 |
+
case constants.ELEVENLABS:
|
139 |
+
future_audio_b = executor.submit(
|
140 |
+
text_to_speech_with_elevenlabs, character_description, text
|
141 |
+
)
|
142 |
+
case _:
|
143 |
+
# Additional TTS Providers can be added here
|
144 |
+
raise ValueError(f"Unsupported provider: {provider_b}")
|
145 |
+
|
146 |
+
generation_id_a, audio_a = future_audio_a.result()
|
147 |
+
generation_id_b, audio_b = future_audio_b.result()
|
148 |
+
|
149 |
+
# Shuffle options so that placement of options in the UI will always be random
|
150 |
+
(
|
151 |
+
option_a_audio,
|
152 |
+
option_b_audio,
|
153 |
+
option_a_generation_id,
|
154 |
+
option_b_generation_id,
|
155 |
+
options_map,
|
156 |
+
) = create_shuffled_tts_options(
|
157 |
+
provider_a, audio_a, generation_id_a, provider_b, audio_b, generation_id_b
|
158 |
+
)
|
159 |
|
160 |
return (
|
161 |
gr.update(value=option_a_audio, visible=True, autoplay=True),
|
src/integrations/elevenlabs_api.py
CHANGED
@@ -23,7 +23,7 @@ Functions:
|
|
23 |
from dataclasses import dataclass
|
24 |
import logging
|
25 |
import random
|
26 |
-
from typing import Optional
|
27 |
|
28 |
# Third-Party Library Imports
|
29 |
from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
|
@@ -85,18 +85,20 @@ elevenlabs_config = ElevenLabsConfig()
|
|
85 |
after=after_log(logger, logging.DEBUG),
|
86 |
reraise=True,
|
87 |
)
|
88 |
-
def text_to_speech_with_elevenlabs(
|
|
|
|
|
89 |
"""
|
90 |
-
Synthesizes text to speech using the ElevenLabs TTS API, processes audio data, and writes
|
91 |
|
92 |
Args:
|
93 |
-
character_description (str): The
|
94 |
-
text (str): The text to be synthesized
|
95 |
|
96 |
Returns:
|
97 |
Tuple[None, str]: A tuple containing:
|
98 |
- generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
|
99 |
-
- file_path (str): The relative path to the file where the synthesized
|
100 |
|
101 |
Raises:
|
102 |
ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
|
@@ -124,9 +126,10 @@ def text_to_speech_with_elevenlabs(character_description: str, text: str) -> byt
|
|
124 |
generated_voice_id = preview.generated_voice_id
|
125 |
base64_audio = preview.audio_base_64
|
126 |
filename = f"{generated_voice_id}.mp3"
|
|
|
127 |
|
128 |
# Write audio to file and return the relative path
|
129 |
-
return None,
|
130 |
|
131 |
except Exception as e:
|
132 |
if isinstance(e, ApiError):
|
|
|
23 |
from dataclasses import dataclass
|
24 |
import logging
|
25 |
import random
|
26 |
+
from typing import Optional, Union
|
27 |
|
28 |
# Third-Party Library Imports
|
29 |
from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
|
|
|
85 |
after=after_log(logger, logging.DEBUG),
|
86 |
reraise=True,
|
87 |
)
|
88 |
+
def text_to_speech_with_elevenlabs(
|
89 |
+
character_description: str, text: str
|
90 |
+
) -> Tuple[None, str]:
|
91 |
"""
|
92 |
+
Synthesizes text to speech using the ElevenLabs TTS API, processes the audio data, and writes it to a file.
|
93 |
|
94 |
Args:
|
95 |
+
character_description (str): The character description used as the voice description.
|
96 |
+
text (str): The text to be synthesized into speech.
|
97 |
|
98 |
Returns:
|
99 |
Tuple[None, str]: A tuple containing:
|
100 |
- generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
|
101 |
+
- file_path (str): The relative file path to the audio file where the synthesized speech was saved.
|
102 |
|
103 |
Raises:
|
104 |
ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
|
|
|
126 |
generated_voice_id = preview.generated_voice_id
|
127 |
base64_audio = preview.audio_base_64
|
128 |
filename = f"{generated_voice_id}.mp3"
|
129 |
+
audio_file_path = save_base64_audio_to_file(base64_audio, filename)
|
130 |
|
131 |
# Write audio to file and return the relative path
|
132 |
+
return None, audio_file_path
|
133 |
|
134 |
except Exception as e:
|
135 |
if isinstance(e, ApiError):
|
src/integrations/hume_api.py
CHANGED
@@ -23,7 +23,7 @@ from dataclasses import dataclass
|
|
23 |
import logging
|
24 |
import os
|
25 |
import random
|
26 |
-
from typing import Literal, Optional
|
27 |
|
28 |
# Third-Party Library Imports
|
29 |
import requests
|
@@ -96,28 +96,50 @@ hume_config = HumeConfig()
|
|
96 |
after=after_log(logger, logging.DEBUG),
|
97 |
reraise=True,
|
98 |
)
|
99 |
-
def text_to_speech_with_hume(
|
|
|
|
|
100 |
"""
|
101 |
Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
Args:
|
104 |
-
character_description (str):
|
105 |
-
|
|
|
|
|
|
|
|
|
106 |
|
107 |
Returns:
|
108 |
-
Tuple[str, str]
|
109 |
-
-
|
110 |
-
-
|
111 |
|
112 |
Raises:
|
113 |
-
|
|
|
|
|
|
|
114 |
"""
|
115 |
logger.debug(
|
116 |
f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. Text length: {len(text)} characters."
|
117 |
)
|
118 |
|
|
|
|
|
|
|
119 |
request_body = {
|
120 |
-
"utterances": [{"text": text, "description": character_description}]
|
|
|
|
|
|
|
|
|
121 |
}
|
122 |
|
123 |
try:
|
@@ -137,22 +159,58 @@ def text_to_speech_with_hume(character_description: str, text: str) -> bytes:
|
|
137 |
raise HumeError(msg)
|
138 |
|
139 |
# Extract the base64 encoded audio and generation ID from the generation
|
140 |
-
|
141 |
-
|
142 |
-
base64_audio = generation.get("audio")
|
143 |
-
filename = f"{generation_id}.mp3"
|
144 |
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
147 |
|
148 |
except Exception as e:
|
149 |
if isinstance(e, HTTPError):
|
150 |
if e.response.status_code >= 400 and e.response.status_code < 500:
|
151 |
raise UnretryableHumeError(
|
152 |
-
message=f'"{e.response.text}"',
|
153 |
-
original_exception=e,
|
154 |
) from e
|
155 |
-
raise HumeError(
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
import logging
|
24 |
import os
|
25 |
import random
|
26 |
+
from typing import Any, Dict, Literal, Optional, Tuple, Union
|
27 |
|
28 |
# Third-Party Library Imports
|
29 |
import requests
|
|
|
96 |
after=after_log(logger, logging.DEBUG),
|
97 |
reraise=True,
|
98 |
)
|
99 |
+
def text_to_speech_with_hume(
|
100 |
+
character_description: str, text: str, num_generations: int = 1
|
101 |
+
) -> Union[Tuple[str, str], Tuple[str, str, str, str]]:
|
102 |
"""
|
103 |
Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
|
104 |
|
105 |
+
This function sends a POST request to the Hume TTS API with a character description and text
|
106 |
+
to be converted to speech. Depending on the specified number of generations (allowed values: 1 or 2),
|
107 |
+
the API returns one or two generations. For each generation, the function extracts the base64-encoded
|
108 |
+
audio and the generation ID, saves the audio as an MP3 file via the `save_base64_audio_to_file` helper,
|
109 |
+
and returns the relevant details.
|
110 |
+
|
111 |
Args:
|
112 |
+
character_description (str): A description of the character, which is used as contextual input
|
113 |
+
for generating the voice.
|
114 |
+
text (str): The text to be converted to speech.
|
115 |
+
num_generations (int, optional): The number of audio generations to request from the API.
|
116 |
+
Allowed values are 1 or 2. If 1, only a single generation is processed; if 2, a second
|
117 |
+
generation is expected in the API response. Defaults to 1.
|
118 |
|
119 |
Returns:
|
120 |
+
Union[Tuple[str, str], Tuple[str, str, str, str]]:
|
121 |
+
- If num_generations == 1: A tuple in the form (generation_a_id, audio_a_path).
|
122 |
+
- If num_generations == 2: A tuple in the form (generation_a_id, audio_a_path, generation_b_id, audio_b_path).
|
123 |
|
124 |
Raises:
|
125 |
+
ValueError: If num_generations is not 1 or 2.
|
126 |
+
HumeError: If there is an error communicating with the Hume TTS API or parsing its response.
|
127 |
+
UnretryableHumeError: If a client-side HTTP error (status code in the 4xx range) is encountered.
|
128 |
+
Exception: Any other exceptions raised during the request or processing will be wrapped and re-raised as HumeError.
|
129 |
"""
|
130 |
logger.debug(
|
131 |
f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. Text length: {len(text)} characters."
|
132 |
)
|
133 |
|
134 |
+
if num_generations < 1 or num_generations > 2:
|
135 |
+
raise ValueError("Invalid number of generations specified. Must be 1 or 2.")
|
136 |
+
|
137 |
request_body = {
|
138 |
+
"utterances": [{"text": text, "description": character_description}],
|
139 |
+
"format": {
|
140 |
+
"type": hume_config.file_format,
|
141 |
+
},
|
142 |
+
"num_generations": num_generations,
|
143 |
}
|
144 |
|
145 |
try:
|
|
|
159 |
raise HumeError(msg)
|
160 |
|
161 |
# Extract the base64 encoded audio and generation ID from the generation
|
162 |
+
generation_a = generations[0]
|
163 |
+
generation_a_id, audio_a_path = parse_hume_tts_generation(generation_a)
|
|
|
|
|
164 |
|
165 |
+
if num_generations == 1:
|
166 |
+
return (generation_a_id, audio_a_path)
|
167 |
+
|
168 |
+
generation_b = generations[1]
|
169 |
+
generation_b_id, audio_b_path = parse_hume_tts_generation(generation_b)
|
170 |
+
return (generation_a_id, audio_a_path, generation_b_id, audio_b_path)
|
171 |
|
172 |
except Exception as e:
|
173 |
if isinstance(e, HTTPError):
|
174 |
if e.response.status_code >= 400 and e.response.status_code < 500:
|
175 |
raise UnretryableHumeError(
|
176 |
+
message=f'"{e.response.text}"', original_exception=e
|
|
|
177 |
) from e
|
178 |
+
raise HumeError(message=f"{e}", original_exception=e) from e
|
179 |
+
|
180 |
+
|
181 |
+
def parse_hume_tts_generation(generation: Dict[str, Any]) -> Tuple[str, str]:
|
182 |
+
"""
|
183 |
+
Parse a Hume TTS generation response and save the decoded audio as an MP3 file.
|
184 |
+
|
185 |
+
This function extracts the generation ID and the base64-encoded audio from the provided
|
186 |
+
dictionary. It then decodes and saves the audio data to an MP3 file, naming the file using
|
187 |
+
the generation ID. Finally, it returns a tuple containing the generation ID and the file path
|
188 |
+
of the saved audio.
|
189 |
+
|
190 |
+
Args:
|
191 |
+
generation (Dict[str, Any]): A dictionary representing the TTS generation response from Hume.
|
192 |
+
Expected keys are:
|
193 |
+
- "generation_id" (str): A unique identifier for the generated audio.
|
194 |
+
- "audio" (str): A base64 encoded string of the audio data.
|
195 |
+
|
196 |
+
Returns:
|
197 |
+
Tuple[str, str]: A tuple containing:
|
198 |
+
- generation_id (str): The unique identifier for the audio generation.
|
199 |
+
- audio_path (str): The filesystem path where the audio file was saved.
|
200 |
+
|
201 |
+
Raises:
|
202 |
+
KeyError: If the "generation_id" or "audio" key is missing from the generation dictionary.
|
203 |
+
Exception: Propagates any exceptions raised by save_base64_audio_to_file, such as errors during
|
204 |
+
the decoding or file saving process.
|
205 |
+
"""
|
206 |
+
generation_id = generation.get("generation_id")
|
207 |
+
if generation_id is None:
|
208 |
+
raise KeyError("The generation dictionary is missing the 'generation_id' key.")
|
209 |
+
|
210 |
+
base64_audio = generation.get("audio")
|
211 |
+
if base64_audio is None:
|
212 |
+
raise KeyError("The generation dictionary is missing the 'audio' key.")
|
213 |
+
|
214 |
+
filename = f"{generation_id}.mp3"
|
215 |
+
audio_file_path = save_base64_audio_to_file(base64_audio, filename)
|
216 |
+
return generation_id, audio_file_path
|
src/types.py
CHANGED
@@ -5,7 +5,7 @@ This module defines custom types for the application.
|
|
5 |
"""
|
6 |
|
7 |
# Standard Library Imports
|
8 |
-
from typing import Dict, Literal, TypedDict
|
9 |
|
10 |
|
11 |
TTSProviderName = Literal["Hume AI", "ElevenLabs"]
|
@@ -24,6 +24,25 @@ OptionMap = Dict[OptionKey, TTSProviderName]
|
|
24 |
"""OptionMap defines the structure of the options mapping, where each key is an OptionKey and the value is a TTS provider."""
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
class VotingResults(TypedDict):
|
28 |
"""Voting results data structure representing values we want to persist to the votes DB"""
|
29 |
|
|
|
5 |
"""
|
6 |
|
7 |
# Standard Library Imports
|
8 |
+
from typing import Dict, Literal, NamedTuple, TypedDict
|
9 |
|
10 |
|
11 |
TTSProviderName = Literal["Hume AI", "ElevenLabs"]
|
|
|
24 |
"""OptionMap defines the structure of the options mapping, where each key is an OptionKey and the value is a TTS provider."""
|
25 |
|
26 |
|
27 |
+
class Option(NamedTuple):
|
28 |
+
"""
|
29 |
+
Represents a text-to-speech generation option.
|
30 |
+
|
31 |
+
This type encapsulates the details for a generated text-to-speech (TTS) option,
|
32 |
+
including the provider that produced the audio, the relative file path to the generated
|
33 |
+
audio file, and the unique generation identifier associated with the TTS output.
|
34 |
+
|
35 |
+
Attributes:
|
36 |
+
provider (TTSProviderName): The TTS provider that generated the audio.
|
37 |
+
audio (str): The relative file path to the audio file produced by the TTS provider.
|
38 |
+
generation_id (str): The unique identifier for this TTS generation.
|
39 |
+
"""
|
40 |
+
|
41 |
+
provider: TTSProviderName
|
42 |
+
audio: str
|
43 |
+
generation_id: str
|
44 |
+
|
45 |
+
|
46 |
class VotingResults(TypedDict):
|
47 |
"""Voting results data structure representing values we want to persist to the votes DB"""
|
48 |
|
src/utils.py
CHANGED
@@ -13,13 +13,13 @@ Functions:
|
|
13 |
# Standard Library Imports
|
14 |
import base64
|
15 |
import os
|
|
|
|
|
16 |
|
17 |
# Local Application Imports
|
|
|
18 |
from src.config import AUDIO_DIR, logger
|
19 |
-
from src.
|
20 |
-
CHARACTER_DESCRIPTION_MIN_LENGTH,
|
21 |
-
CHARACTER_DESCRIPTION_MAX_LENGTH,
|
22 |
-
)
|
23 |
|
24 |
|
25 |
def truncate_text(text: str, max_length: int = 50) -> str:
|
@@ -108,14 +108,14 @@ def validate_character_description_length(character_description: str) -> None:
|
|
108 |
f"Voice description length being validated: {character_description_length} characters"
|
109 |
)
|
110 |
|
111 |
-
if character_description_length < CHARACTER_DESCRIPTION_MIN_LENGTH:
|
112 |
raise ValueError(
|
113 |
-
f"Your character description is too short. Please enter at least {CHARACTER_DESCRIPTION_MIN_LENGTH} characters. "
|
114 |
f"(Current length: {character_description_length})"
|
115 |
)
|
116 |
-
if character_description_length > CHARACTER_DESCRIPTION_MAX_LENGTH:
|
117 |
raise ValueError(
|
118 |
-
f"Your character description is too long. Please limit it to {CHARACTER_DESCRIPTION_MAX_LENGTH} characters. "
|
119 |
f"(Current length: {character_description_length})"
|
120 |
)
|
121 |
logger.debug(
|
@@ -162,3 +162,102 @@ def save_base64_audio_to_file(base64_audio: str, filename: str) -> str:
|
|
162 |
logger.debug(f"Audio file relative path: {relative_path}")
|
163 |
|
164 |
return relative_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# Standard Library Imports
|
14 |
import base64
|
15 |
import os
|
16 |
+
import random
|
17 |
+
from typing import Tuple
|
18 |
|
19 |
# Local Application Imports
|
20 |
+
from src import constants
|
21 |
from src.config import AUDIO_DIR, logger
|
22 |
+
from src.types import ComparisonType, Option, OptionMap, TTSProviderName
|
|
|
|
|
|
|
23 |
|
24 |
|
25 |
def truncate_text(text: str, max_length: int = 50) -> str:
|
|
|
108 |
f"Voice description length being validated: {character_description_length} characters"
|
109 |
)
|
110 |
|
111 |
+
if character_description_length < constants.CHARACTER_DESCRIPTION_MIN_LENGTH:
|
112 |
raise ValueError(
|
113 |
+
f"Your character description is too short. Please enter at least {constants.CHARACTER_DESCRIPTION_MIN_LENGTH} characters. "
|
114 |
f"(Current length: {character_description_length})"
|
115 |
)
|
116 |
+
if character_description_length > constants.CHARACTER_DESCRIPTION_MAX_LENGTH:
|
117 |
raise ValueError(
|
118 |
+
f"Your character description is too long. Please limit it to {constants.CHARACTER_DESCRIPTION_MAX_LENGTH} characters. "
|
119 |
f"(Current length: {character_description_length})"
|
120 |
)
|
121 |
logger.debug(
|
|
|
162 |
logger.debug(f"Audio file relative path: {relative_path}")
|
163 |
|
164 |
return relative_path
|
165 |
+
|
166 |
+
|
167 |
+
def choose_providers(
|
168 |
+
text_modified: bool,
|
169 |
+
) -> Tuple[ComparisonType, TTSProviderName, TTSProviderName]:
|
170 |
+
"""
|
171 |
+
Select two TTS providers based on whether the text has been modified.
|
172 |
+
|
173 |
+
The first provider is always set to "Hume AI". For the second provider, the function
|
174 |
+
selects "Hume AI" if the text has been modified; otherwise, it randomly chooses one from
|
175 |
+
the TTS_PROVIDERS list.
|
176 |
+
|
177 |
+
Args:
|
178 |
+
text_modified (bool): A flag indicating whether the text has been modified.
|
179 |
+
- If True, both providers will be "Hume AI".
|
180 |
+
- If False, the second provider is randomly selected from TTS_PROVIDERS.
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
Tuple[ComparisonType, TTSProviderName, TTSProviderName]: The comparison type, followed by two TTS provider names,
|
184 |
+
where the first is always "Hume AI" and the second is determined by the text_modified
|
185 |
+
flag and random selection.
|
186 |
+
"""
|
187 |
+
provider_a = constants.HUME_AI
|
188 |
+
provider_b = (
|
189 |
+
constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
|
190 |
+
)
|
191 |
+
|
192 |
+
match provider_b:
|
193 |
+
case constants.HUME_AI:
|
194 |
+
comparison_type = constants.HUME_TO_HUME
|
195 |
+
case constants.ELEVENLABS:
|
196 |
+
comparison_type = constants.HUME_TO_ELEVENLABS
|
197 |
+
|
198 |
+
return comparison_type, provider_a, provider_b
|
199 |
+
|
200 |
+
|
201 |
+
def create_shuffled_tts_options(
|
202 |
+
provider_a: TTSProviderName,
|
203 |
+
audio_a: str,
|
204 |
+
generation_id_a: str,
|
205 |
+
provider_b: TTSProviderName,
|
206 |
+
audio_b: str,
|
207 |
+
generation_id_b: str,
|
208 |
+
) -> Tuple[str, str, str, str, OptionMap]:
|
209 |
+
"""
|
210 |
+
Create and shuffle TTS generation options.
|
211 |
+
|
212 |
+
This function creates two Option instances from the provided TTS details, shuffles them,
|
213 |
+
and then extracts the audio file paths and generation IDs from the shuffled options.
|
214 |
+
It also returns a mapping from option constants to the corresponding TTS providers.
|
215 |
+
|
216 |
+
Args:
|
217 |
+
provider_a (TTSProviderName): The TTS provider for the first generation.
|
218 |
+
audio_a (str): The relative file path to the audio file for the first generation.
|
219 |
+
generation_id_a (str): The generation ID for the first generation.
|
220 |
+
provider_b (TTSProviderName): The TTS provider for the second generation.
|
221 |
+
audio_b (str): The relative file path to the audio file for the second generation.
|
222 |
+
generation_id_b (str): The generation ID for the second generation.
|
223 |
+
|
224 |
+
Returns:
|
225 |
+
Tuple[str, str, str, str, OptionMap]:
|
226 |
+
A tuple containing:
|
227 |
+
- option_a_audio (str): Audio file path for the first shuffled option.
|
228 |
+
- option_b_audio (str): Audio file path for the second shuffled option.
|
229 |
+
- option_a_generation_id (str): Generation ID for the first shuffled option.
|
230 |
+
- option_b_generation_id (str): Generation ID for the second shuffled option.
|
231 |
+
- options_map (OptionMap): Mapping from option constants to their TTS providers.
|
232 |
+
"""
|
233 |
+
# Create a list of Option instances for the available providers.
|
234 |
+
options = [
|
235 |
+
Option(provider=provider_a, audio=audio_a, generation_id=generation_id_a),
|
236 |
+
Option(provider=provider_b, audio=audio_b, generation_id=generation_id_b),
|
237 |
+
]
|
238 |
+
|
239 |
+
# Randomly shuffle the list of options.
|
240 |
+
random.shuffle(options)
|
241 |
+
|
242 |
+
# Unpack the two options.
|
243 |
+
option_a, option_b = options
|
244 |
+
|
245 |
+
# Extract audio file paths and generation IDs.
|
246 |
+
option_a_audio = option_a.audio
|
247 |
+
option_b_audio = option_b.audio
|
248 |
+
option_a_generation_id = option_a.generation_id
|
249 |
+
option_b_generation_id = option_b.generation_id
|
250 |
+
|
251 |
+
# Build a mapping from option constants to the corresponding providers.
|
252 |
+
options_map: OptionMap = {
|
253 |
+
constants.OPTION_A: option_a.provider,
|
254 |
+
constants.OPTION_B: option_b.provider,
|
255 |
+
}
|
256 |
+
|
257 |
+
return (
|
258 |
+
option_a_audio,
|
259 |
+
option_b_audio,
|
260 |
+
option_a_generation_id,
|
261 |
+
option_b_generation_id,
|
262 |
+
options_map,
|
263 |
+
)
|