zach committed on
Commit
b41805f
·
1 Parent(s): d1e1e15

Decouple text generation and speech synthesis and enable editing of text

Browse files
Files changed (1) hide show
  1. src/app.py +119 -69
src/app.py CHANGED
@@ -45,7 +45,7 @@ from src.theme import CustomTheme
45
  from src.utils import truncate_text, validate_prompt_length
46
 
47
 
48
- def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
49
  """
50
  Validates the prompt and generates text using Anthropic API.
51
 
@@ -54,8 +54,8 @@ def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
54
 
55
  Returns:
56
  Tuple containing:
57
- - The generated text (as a gr.update) if successful,
58
- - An update for the "Generate" button.
59
 
60
  Raises:
61
  gr.Error: On validation or API errors.
@@ -69,7 +69,7 @@ def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
69
  try:
70
  generated_text = generate_text_with_claude(prompt)
71
  logger.info(f'Generated text ({len(generated_text)} characters).')
72
- return gr.update(value=generated_text)
73
  except AnthropicError as ae:
74
  logger.error(f'AnthropicError while generating text: {str(ae)}')
75
  raise gr.Error('There was an issue communicating with the Anthropic API. Please try again later.')
@@ -78,7 +78,7 @@ def generate_text(prompt: str) -> Tuple[Union[str, gr.update], gr.update]:
78
  raise gr.Error('Failed to generate text. Please try again.')
79
 
80
 
81
- def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
82
  """
83
  Synthesizes two text to speech outputs, loads the two audio players with the
84
  output audio, and updates related UI state components.
@@ -87,39 +87,42 @@ def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.upda
87
 
88
  Args:
89
  prompt (str): The original prompt.
90
- generated_text (str): The generated text.
91
 
92
  Returns:
93
  A tuple of:
94
  - Update for first audio player (with autoplay)
95
  - Update for second audio player
96
  - A dictionary mapping options to providers
97
- - The raw audio value for option 2 (if needed)
98
 
99
  Raises:
100
  gr.Error: On API or unexpected errors.
101
  """
102
- if not generated_text:
103
  logger.warning('Skipping text-to-speech due to empty text.')
104
- return gr.skip(), gr.skip(), gr.skip(), gr.skip()
105
 
106
- # compare_hume_with_elevenlabs = random.random() < 0.5
 
107
  compare_hume_with_elevenlabs = False
108
 
109
  elevenlabs_voice = get_random_elevenlabs_voice_id()
110
- hume_voice_a, hume_voice_b = get_random_hume_voice_names() # We get two Hume voices preemptively in case we compare Hume with Hume
 
 
111
 
112
  try:
113
  with ThreadPoolExecutor(max_workers=2) as executor:
114
  provider_a = HUME_AI
115
- future_audio_a = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_a)
116
 
117
  if compare_hume_with_elevenlabs:
118
  provider_b = ELEVENLABS
119
- future_audio_b = executor.submit(text_to_speech_with_elevenlabs, generated_text, elevenlabs_voice)
120
  else:
121
  provider_b = HUME_AI
122
- future_audio_b = executor.submit(text_to_speech_with_hume, prompt, generated_text, hume_voice_b)
123
 
124
  audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
125
 
@@ -130,8 +133,8 @@ def text_to_speech(prompt: str, generated_text: str) -> Tuple[gr.update, gr.upda
130
  options_map = { OPTION_A: options[0][1], OPTION_B: options[1][1] }
131
 
132
  return (
133
- gr.update(value=option_a_audio, autoplay=True),
134
- gr.update(value=option_b_audio),
135
  options_map,
136
  option_b_audio,
137
  )
@@ -171,10 +174,11 @@ def vote(vote_submitted: bool, option_mapping: dict, selected_button: str) -> Tu
171
 
172
  return (
173
  True,
174
- gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary') if is_option_a
175
- else gr.update(value=other_provider, variant='secondary'),
176
- gr.update(value=other_provider, variant='secondary') if is_option_a
177
- else gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary'),
 
178
  )
179
 
180
  def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None, bool]:
@@ -188,14 +192,14 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
188
  - vote_button_a (disable and reset button text)
189
  - vote_button_b (disable and reset button text)
190
  - option_mapping_state (reset option map state)
191
- - option2_audio_state (reset option 2 audio state)
192
  - vote_submitted_state (reset submitted vote state)
193
  """
194
  return (
195
  gr.update(value=None),
196
  gr.update(value=None),
197
- gr.update(interactive=False, value=VOTE_FOR_OPTION_A, variant='secondary'),
198
- gr.update(interactive=False, value=VOTE_FOR_OPTION_B, variant='secondary'),
199
  None,
200
  None,
201
  False,
@@ -203,12 +207,15 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
203
 
204
 
205
  def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
206
- """Builds the input section including instructions, sample prompt dropdown, prompt input, and generate button"""
207
- with gr.Column(variant='compact'):
208
- instructions = gr.Markdown(
209
- 'Generate text with **Claude by Anthropic**, listen to text-to-speech outputs '
210
- 'from **Hume AI** and **ElevenLabs**, and vote for your favorite!'
211
- )
 
 
 
212
  sample_prompt_dropdown = gr.Dropdown(
213
  choices=list(SAMPLE_PROMPTS.keys()),
214
  label='Choose a sample prompt (or enter your own)',
@@ -223,29 +230,43 @@ def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Butt
223
  max_length=PROMPT_MAX_LENGTH,
224
  show_copy_button=True,
225
  )
226
- generate_button = gr.Button('Generate text', variant='primary')
227
- return instructions, sample_prompt_dropdown, prompt_input, generate_button
 
 
 
 
 
228
 
229
 
230
- def build_output_section() -> Tuple[gr.Textbox, gr.Audio, gr.Audio, gr.Button, gr.Button]:
231
- """Builds the output section including generated text, audio players, and vote buttons."""
232
- with gr.Column(variant='compact'):
233
- generated_text = gr.Textbox(
234
  label='Text',
235
- interactive=False,
 
236
  autoscroll=False,
237
  lines=5,
238
  max_lines=5,
239
  max_length=PROMPT_MAX_LENGTH,
240
  show_copy_button=True,
241
  )
 
242
  with gr.Row(equal_height=True):
243
  option_a_audio_player = gr.Audio(label=OPTION_A, type='filepath', interactive=False)
244
  option_b_audio_player = gr.Audio(label=OPTION_B, type='filepath', interactive=False)
245
- with gr.Row():
246
- vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
247
- vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
248
- return generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b
 
 
 
 
 
 
 
249
 
250
 
251
  def build_gradio_interface() -> gr.Blocks:
@@ -260,20 +281,29 @@ def build_gradio_interface() -> gr.Blocks:
260
  title='Expressive TTS Arena',
261
  theme=custom_theme,
262
  fill_width=True,
263
- css_paths='src/assets/styles.css'
264
  ) as demo:
265
  # Title
266
  gr.Markdown('# Expressive TTS Arena')
267
 
268
- # Build input section
269
- instructions, sample_prompt_dropdown, prompt_input, generate_button = build_input_section()
 
 
 
270
 
271
- # Build output section
272
- generated_text, option_a_audio_player, option_b_audio_player, vote_button_a, vote_button_b = build_output_section()
 
 
 
 
 
273
 
274
  # UI state components
275
- option_mapping_state = gr.State() # Track option map (option 1 and option 2 are randomized)
276
- option2_audio_state = gr.State() # Track generated audio for option 2 for playing automatically after option 1 audio finishes
 
277
  vote_submitted_state = gr.State(False) # Track whether the user has voted on an option
278
 
279
  # --- Register event handlers ---
@@ -285,16 +315,33 @@ def build_gradio_interface() -> gr.Blocks:
285
  outputs=[prompt_input],
286
  )
287
 
288
- # Generate Button Click Handler Chain:
289
- # 1. Disable the Generate button
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  # 2. Reset UI state
291
- # 3. Generate text
292
- # 4. Synthesize TTS
293
- # 5. Re-enable the Generate button
294
- generate_button.click(
295
- fn=lambda: gr.update(interactive=False), # Disable the button immediately
296
  inputs=[],
297
- outputs=[generate_button]
298
  ).then(
299
  fn=reset_ui,
300
  inputs=[],
@@ -304,48 +351,51 @@ def build_gradio_interface() -> gr.Blocks:
304
  vote_button_a,
305
  vote_button_b,
306
  option_mapping_state,
307
- option2_audio_state,
308
  vote_submitted_state,
309
  ],
310
- ).then(
311
- fn=generate_text,
312
- inputs=[prompt_input],
313
- outputs=[generated_text],
314
  ).then(
315
  fn=text_to_speech,
316
- inputs=[prompt_input, generated_text],
317
  outputs=[
318
  option_a_audio_player,
319
  option_b_audio_player,
320
  option_mapping_state,
321
- option2_audio_state,
322
  ],
323
  ).then(
324
- fn=lambda: gr.update(interactive=True), # Re-enable the button
 
 
 
 
325
  inputs=[],
326
- outputs=[generate_button]
327
  )
328
 
329
- # Vote button click handlers
330
  vote_button_a.click(
331
  fn=vote,
332
  inputs=[vote_submitted_state, option_mapping_state, vote_button_a],
333
- outputs=[vote_submitted_state, vote_button_a, vote_button_b],
334
  )
335
  vote_button_b.click(
336
  fn=vote,
337
  inputs=[vote_submitted_state, option_mapping_state, vote_button_b],
338
- outputs=[vote_submitted_state, vote_button_a, vote_button_b],
339
  )
340
 
341
- # Auto-play second audio after first finishes (workaround for playing audio back-to-back)
 
 
 
342
  option_a_audio_player.stop(
343
  fn=lambda _: gr.update(value=None),
344
  inputs=[],
345
  outputs=[option_b_audio_player],
346
  ).then(
347
  fn=lambda audio: gr.update(value=audio, autoplay=True),
348
- inputs=[option2_audio_state],
349
  outputs=[option_b_audio_player],
350
  )
351
 
 
45
  from src.utils import truncate_text, validate_prompt_length
46
 
47
 
48
+ def generate_text(prompt: str,) -> Tuple[Union[str, gr.update], gr.update]:
49
  """
50
  Validates the prompt and generates text using Anthropic API.
51
 
 
54
 
55
  Returns:
56
  Tuple containing:
57
+ - The generated text (as a gr.update).
58
+ - An update for the generated text state.
59
 
60
  Raises:
61
  gr.Error: On validation or API errors.
 
69
  try:
70
  generated_text = generate_text_with_claude(prompt)
71
  logger.info(f'Generated text ({len(generated_text)} characters).')
72
+ return gr.update(value=generated_text), generated_text
73
  except AnthropicError as ae:
74
  logger.error(f'AnthropicError while generating text: {str(ae)}')
75
  raise gr.Error('There was an issue communicating with the Anthropic API. Please try again later.')
 
78
  raise gr.Error('Failed to generate text. Please try again.')
79
 
80
 
81
+ def text_to_speech(prompt: str, text: str, generated_text_state: str) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
82
  """
83
  Synthesizes two text to speech outputs, loads the two audio players with the
84
  output audio, and updates related UI state components.
 
87
 
88
  Args:
89
  prompt (str): The original prompt.
90
+ text (str): The text to synthesize to speech.
91
 
92
  Returns:
93
  A tuple of:
94
  - Update for first audio player (with autoplay)
95
  - Update for second audio player
96
  - A dictionary mapping options to providers
97
+ - The raw audio value for option B
98
 
99
  Raises:
100
  gr.Error: On API or unexpected errors.
101
  """
102
+ if not text:
103
  logger.warning('Skipping text-to-speech due to empty text.')
104
+ raise gr.Error('Please generate or enter text to synthesize.')
105
 
106
+ # If not using generated text, then only compare Hume to Hume
107
+ # compare_hume_with_elevenlabs = (text == generated_text_state) and (random.random() < 0.5)
108
  compare_hume_with_elevenlabs = False
109
 
110
  elevenlabs_voice = get_random_elevenlabs_voice_id()
111
+ # Get two Hume voices preemptively in case we compare Hume with Hume
112
+ # to avoid the chance of synthesizing speech twice with the same voice
113
+ hume_voice_a, hume_voice_b = get_random_hume_voice_names()
114
 
115
  try:
116
  with ThreadPoolExecutor(max_workers=2) as executor:
117
  provider_a = HUME_AI
118
+ future_audio_a = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_a)
119
 
120
  if compare_hume_with_elevenlabs:
121
  provider_b = ELEVENLABS
122
+ future_audio_b = executor.submit(text_to_speech_with_elevenlabs, text, elevenlabs_voice)
123
  else:
124
  provider_b = HUME_AI
125
+ future_audio_b = executor.submit(text_to_speech_with_hume, prompt, text, hume_voice_b)
126
 
127
  audio_a, audio_b = future_audio_a.result(), future_audio_b.result()
128
 
 
133
  options_map = { OPTION_A: options[0][1], OPTION_B: options[1][1] }
134
 
135
  return (
136
+ gr.update(value=option_a_audio, visible=True, autoplay=True),
137
+ gr.update(value=option_b_audio, visible=True),
138
  options_map,
139
  option_b_audio,
140
  )
 
174
 
175
  return (
176
  True,
177
+ gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary', interactive=False) if is_option_a
178
+ else gr.update(value=other_provider, variant='secondary', interactive=False),
179
+ gr.update(value=other_provider, variant='secondary', interactive=False) if is_option_a
180
+ else gr.update(value=f'{selected_provider} {TROPHY_EMOJI}', variant='primary', interactive=False),
181
+ gr.update(interactive=True)
182
  )
183
 
184
  def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None, bool]:
 
192
  - vote_button_a (reset button text and variant)
193
  - vote_button_b (reset button text and variant)
194
  - option_mapping_state (reset option map state)
195
+ - option_b_audio_state (reset option B audio state)
196
  - vote_submitted_state (reset submitted vote state)
197
  """
198
  return (
199
  gr.update(value=None),
200
  gr.update(value=None),
201
+ gr.update(value=VOTE_FOR_OPTION_A, variant='secondary'),
202
+ gr.update(value=VOTE_FOR_OPTION_B, variant='secondary'),
203
  None,
204
  None,
205
  False,
 
207
 
208
 
209
  def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
210
+ """ Builds the input section including instructions, sample prompt dropdown, prompt input, and generate button """
211
+ with gr.Column(variant='panel'):
212
+ instructions = gr.Markdown("""
213
+ **Instructions**
214
+ 1. **Enter or Generate Text:** Type directly in the Text box, or optionally enter a Prompt, click "Generate text", and edit if needed.
215
+ 2. **Synthesize Speech:** Click "Synthesize speech" to generate two audio outputs.
216
+ 3. **Listen & Compare:** Playback both options (A & B) to hear the differences.
217
+ 4. **Vote for Your Favorite:** Click "Vote for option A" or "Vote for option B" to choose the best one.
218
+ """)
219
  sample_prompt_dropdown = gr.Dropdown(
220
  choices=list(SAMPLE_PROMPTS.keys()),
221
  label='Choose a sample prompt (or enter your own)',
 
230
  max_length=PROMPT_MAX_LENGTH,
231
  show_copy_button=True,
232
  )
233
+ generate_text_button = gr.Button('Generate text', variant='secondary')
234
+ return (
235
+ instructions,
236
+ sample_prompt_dropdown,
237
+ prompt_input,
238
+ generate_text_button,
239
+ )
240
 
241
 
242
+ def build_output_section() -> Tuple[gr.Textbox, gr.Button, gr.Audio, gr.Audio, gr.Button, gr.Button]:
243
+ """ Builds the output section including generated text, audio players, and vote buttons. """
244
+ with gr.Column(variant='panel'):
245
+ text_input = gr.Textbox(
246
  label='Text',
247
+ placeholder='Enter text to synthesize speech...',
248
+ interactive=True,
249
  autoscroll=False,
250
  lines=5,
251
  max_lines=5,
252
  max_length=PROMPT_MAX_LENGTH,
253
  show_copy_button=True,
254
  )
255
+ synthesize_speech_button = gr.Button('Synthesize speech', variant='primary')
256
  with gr.Row(equal_height=True):
257
  option_a_audio_player = gr.Audio(label=OPTION_A, type='filepath', interactive=False)
258
  option_b_audio_player = gr.Audio(label=OPTION_B, type='filepath', interactive=False)
259
+ with gr.Row(equal_height=True):
260
+ vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
261
+ vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
262
+ return (
263
+ text_input,
264
+ synthesize_speech_button,
265
+ option_a_audio_player,
266
+ option_b_audio_player,
267
+ vote_button_a,
268
+ vote_button_b,
269
+ )
270
 
271
 
272
  def build_gradio_interface() -> gr.Blocks:
 
281
  title='Expressive TTS Arena',
282
  theme=custom_theme,
283
  fill_width=True,
284
+ css_paths='src/assets/styles.css',
285
  ) as demo:
286
  # Title
287
  gr.Markdown('# Expressive TTS Arena')
288
 
289
+ # Build generate text section
290
+ (instructions,
291
+ sample_prompt_dropdown,
292
+ prompt_input,
293
+ generate_text_button) = build_input_section()
294
 
295
+ # Build synthesize speech section
296
+ (text_input,
297
+ synthesize_speech_button,
298
+ option_a_audio_player,
299
+ option_b_audio_player,
300
+ vote_button_a,
301
+ vote_button_b) = build_output_section()
302
 
303
  # UI state components
304
+ generated_text_state = gr.State('') # Track generated text state
305
+ option_b_audio_state = gr.State() # Track generated audio for option B for playing automatically after option 1 audio finishes
306
+ option_mapping_state = gr.State() # Track option map (option A and option B are randomized)
307
  vote_submitted_state = gr.State(False) # Track whether the user has voted on an option
308
 
309
  # --- Register event handlers ---
 
315
  outputs=[prompt_input],
316
  )
317
 
318
+ # Generate text button click handler chain:
319
+ # 1. Disable the "Generate text" button
320
+ # 2. Generate text
321
+ # 3. Enable the "Generate text" button
322
+ generate_text_button.click(
323
+ fn=lambda: gr.update(interactive=False),
324
+ inputs=[],
325
+ outputs=[generate_text_button]
326
+ ).then(
327
+ fn=generate_text,
328
+ inputs=[prompt_input],
329
+ outputs=[text_input, generated_text_state],
330
+ ).then(
331
+ fn=lambda: gr.update(interactive=True),
332
+ inputs=[],
333
+ outputs=[generate_text_button]
334
+ )
335
+
336
+ # Synthesize speech button click event handler chain:
337
+ # 1. Disable the "Synthesize speech" button
338
  # 2. Reset UI state
339
+ # 3. Synthesize speech, load audio players, and display vote button
340
+ # 4. Enable the "Synthesize speech" button and the vote buttons
341
+ synthesize_speech_button.click(
342
+ fn=lambda: gr.update(interactive=False),
 
343
  inputs=[],
344
+ outputs=[synthesize_speech_button]
345
  ).then(
346
  fn=reset_ui,
347
  inputs=[],
 
351
  vote_button_a,
352
  vote_button_b,
353
  option_mapping_state,
354
+ option_b_audio_state,
355
  vote_submitted_state,
356
  ],
 
 
 
 
357
  ).then(
358
  fn=text_to_speech,
359
+ inputs=[prompt_input, text_input, generated_text_state],
360
  outputs=[
361
  option_a_audio_player,
362
  option_b_audio_player,
363
  option_mapping_state,
364
+ option_b_audio_state,
365
  ],
366
  ).then(
367
+ fn=lambda: (
368
+ gr.update(interactive=True),
369
+ gr.update(interactive=True),
370
+ gr.update(interactive=True)
371
+ ),
372
  inputs=[],
373
+ outputs=[synthesize_speech_button, vote_button_a, vote_button_b]
374
  )
375
 
376
+ # Vote button click event handlers
377
  vote_button_a.click(
378
  fn=vote,
379
  inputs=[vote_submitted_state, option_mapping_state, vote_button_a],
380
+ outputs=[vote_submitted_state, vote_button_a, vote_button_b, synthesize_speech_button],
381
  )
382
  vote_button_b.click(
383
  fn=vote,
384
  inputs=[vote_submitted_state, option_mapping_state, vote_button_b],
385
+ outputs=[vote_submitted_state, vote_button_a, vote_button_b, synthesize_speech_button],
386
  )
387
 
388
+ # Auto-play second audio after first finishes (Workaround to play audio back-to-back)
389
+ # Audio player A stop event handler chain:
390
+ # 1. Clear audio player B
391
+ # 2. Load audio player B with the stored option B audio and set autoplay to True
392
  option_a_audio_player.stop(
393
  fn=lambda _: gr.update(value=None),
394
  inputs=[],
395
  outputs=[option_b_audio_player],
396
  ).then(
397
  fn=lambda audio: gr.update(value=audio, autoplay=True),
398
+ inputs=[option_b_audio_state],
399
  outputs=[option_b_audio_player],
400
  )
401