"""
app.py
Gradio UI for interacting with the Anthropic API, Hume TTS API, and ElevenLabs TTS API.
Users enter a prompt, which is processed using Claude by Anthropic to generate text.
The text is then converted into speech using both Hume and ElevenLabs TTS APIs.
Users can compare the outputs in an interactive UI.
"""

# Standard Library Imports
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import random

# Third-Party Library Imports
import gradio as gr

# Local Application Imports
from src.config import logger
from src.constants import PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH, SAMPLE_PROMPTS
from src.integrations import (
    generate_text_with_claude,
    text_to_speech_with_hume,
    text_to_speech_with_elevenlabs,
)
from src.utils import truncate_text, validate_prompt_length


def process_prompt(prompt: str):
    """
    Generates text with the Claude API and converts it to speech using both Hume and ElevenLabs.

    Args:
        prompt (str): User-provided text prompt.

    Returns:
        tuple: Generated text, two audio file paths (provider order randomized), and
            a dictionary mapping option labels ('Option 1', 'Option 2') to provider names.
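
    Example (illustrative only; assumes the Claude, Hume, and ElevenLabs
    integrations are configured with valid API credentials):
        text, option1_audio, option2_audio, mapping = process_prompt('Write a short poem.')
        # mapping might be {'Option 1': 'Hume TTS', 'Option 2': 'ElevenLabs TTS'}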
"""
logger.info(f'Processing prompt: {truncate_text(prompt, max_length=100)}')
try:
# Validate prompt length
validate_prompt_length(prompt, PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH)
# Generate text
generated_text = generate_text_with_claude(prompt)
logger.info(f'Generated text ({len(generated_text)} characters).')
# Generate TTS output in parallel
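        # NOTE: executor.map yields results in the order of the input iterable,
        # so the first result is the Hume audio and the second the ElevenLabs audio.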
        with ThreadPoolExecutor(max_workers=2) as executor:
            hume_audio, elevenlabs_audio = executor.map(
                lambda func: func(),
                [
                    partial(text_to_speech_with_hume, prompt, generated_text),
                    partial(text_to_speech_with_elevenlabs, generated_text),
                ],
            )

        logger.info(
            f'TTS generated: Hume={len(hume_audio)} bytes, '
            f'ElevenLabs={len(elevenlabs_audio)} bytes'
        )

        # Randomize audio order
        options = [(hume_audio, 'Hume TTS'), (elevenlabs_audio, 'ElevenLabs TTS')]
        random.shuffle(options)

        return (
            generated_text,
            options[0][0],  # Option 1 audio
            options[1][0],  # Option 2 audio
            {'Option 1': options[0][1], 'Option 2': options[1][1]},  # Mapping
        )
    except ValueError as ve:
        logger.warning(f'Validation error: {ve}')
        return str(ve), None, None, {}
    except Exception as e:
        logger.error(f'Unexpected error: {e}')
        return 'An error occurred. Please try again.', None, None, {}


def run_process_prompt(prompt: str):
    """
    Manages UI state while processing a prompt.

    Args:
        prompt (str): User input prompt.

    Yields:
        tuple: UI state updates in two stages:
            1. Disables the Generate button and clears previous outputs.
            2. Displays the generated text and audio and re-enables the button.
    """
    # Disable UI, clear previous outputs
    yield (
        gr.update(interactive=False),
        gr.update(value=None),
        gr.update(value=None),
        gr.update(value=None),
        gr.update(value=None),
        None,
    )

    # Process the prompt
    generated_text, option1_audio, option2_audio, option_mapping = process_prompt(prompt)

    # Display generated text and audio
    yield (
        gr.update(interactive=True),
        gr.update(value=generated_text),
        gr.update(value=option1_audio, autoplay=True),
        gr.update(value=option2_audio),
        gr.update(value=option_mapping),
        option2_audio,
    )


def build_gradio_interface() -> gr.Blocks:
    """
    Constructs the Gradio user interface.

    Returns:
        gr.Blocks: The Gradio UI layout.
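
    Example:
        demo = build_gradio_interface()
        demo.launch()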
"""
with gr.Blocks() as demo:
# Title and instructions
gr.Markdown('# TTS Arena')
gr.Markdown(
'Generate text using **Claude by Anthropic**, then compare text-to-speech outputs '
'from **Hume TTS API** and **ElevenLabs TTS API**.'
)
# Input: Sample prompt selection & textbox
with gr.Row():
sample_prompt_dropdown = gr.Dropdown(
choices=list(SAMPLE_PROMPTS.keys()),
label='Choose a sample prompt (or enter your own)',
value=None,
interactive=True,
)
with gr.Row():
prompt_input = gr.Textbox(
label='Enter your prompt',
placeholder='Or type your own...',
lines=2,
max_lines=2,
)
# Generate Button
generate_button = gr.Button('Generate')
# Output: Text & audio
with gr.Column():
output_text = gr.Textbox(
label='Generated Text',
interactive=False,
lines=8,
max_lines=12,
)
with gr.Row():
option1_audio_player = gr.Audio(label='Option 1', type='filepath', interactive=False)
option2_audio_player = gr.Audio(label='Option 2', type='filepath', interactive=False)
# UI state components
option_mapping_state = gr.State()
option2_audio_state = gr.State()
# Event handlers
sample_prompt_dropdown.change(
fn=lambda choice: SAMPLE_PROMPTS.get(choice, ""),
inputs=[sample_prompt_dropdown],
outputs=[prompt_input],
)
generate_button.click(
fn=run_process_prompt,
inputs=[prompt_input],
outputs=[
generate_button,
output_text,
option1_audio_player,
option2_audio_player,
option_mapping_state,
option2_audio_state,
],
)

        # Auto-play the second audio clip after the first finishes
        option1_audio_player.stop(
            fn=lambda: gr.update(value=None),  # Clear the second player so the next update re-triggers autoplay
            inputs=[],
            outputs=[option2_audio_player],
        ).then(
            fn=lambda audio: gr.update(value=audio, autoplay=True),
            inputs=[option2_audio_state],
            outputs=[option2_audio_player],
        )

    logger.debug('Gradio interface built successfully')
    return demo


if __name__ == '__main__':
    logger.info('Launching TTS Arena Gradio app...')
    demo = build_gradio_interface()
    demo.launch()
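    # To expose a temporary public URL during development, Gradio's built-in
    # share option can be used instead, e.g. `demo.launch(share=True)`.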