Spaces:

abocha
/

esl-dialogue-tts

Running

App Files Files Community

esl-dialogue-tts / app.py

abocha

Update app.py

2e02a22 verified 3 months ago

raw

history blame contribute delete

11.2 kB

	# FILE: app.py
	import gradio as gr
	import os
	import asyncio
	from openai import AsyncOpenAI
	from functools import partial
	import datetime

	# Remove create_examples_ui from ui_layout imports if it's not used elsewhere
	from ui_layout import (
	create_main_input_components, create_speaker_config_components,
	create_action_and_output_components, # Removed create_examples_ui
	TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES,
	DEFAULT_GLOBAL_VOICE, VIBE_CHOICES, DEFAULT_VIBE, PREDEFINED_VIBES
	)

	from event_handlers import (
	handle_script_processing, handle_calculate_cost,
	handle_speaker_config_method_visibility_change,
	handle_tts_model_change,
	handle_load_refresh_per_speaker_ui_trigger,
	handle_dynamic_accordion_input_change,
	get_speakers_from_script
	)

	# --- Secrets and Client Setup ---
	# All configuration is read from environment variables.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
	NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")

	# MODEL_DEFAULT may override the UI layout's default model, but only when it
	# names one of the models the UI actually offers; otherwise fall back.
	MODEL_DEFAULT_FROM_ENV = os.getenv("MODEL_DEFAULT", MODEL_DEFAULT_ENV)
	if MODEL_DEFAULT_FROM_ENV in TTS_MODELS_AVAILABLE:
	    EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_ENV
	else:
	    EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_ENV

	# The async client is created only when a key is present; it stays None
	# otherwise so the UI can show a warning instead of failing at import time.
	# (Any additional secret-loading logic would go ahead of this check.)
	async_openai_client = None
	if OPENAI_API_KEY:
	    async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
	else:
	    print("CRITICAL ERROR: OPENAI_API_KEY secret is not set.")


	# --- Main Blocks UI Definition ---
	# Builds the whole Gradio interface: static inputs, the dynamically rendered
	# per-speaker settings, the event wiring, and the example scripts.
	with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
	    gr.Markdown("# Dialogue Script to Speech (OpenAI TTS) 💪💪💪 TTS = Teachers Together Strong 💪💪💪")
	    if not OPENAI_API_KEY or not async_openai_client:
	        gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail.</h3>")

	    # Per-speaker settings, keyed by speaker name. Each value is a dict that
	    # may hold "voice", "speed", "vibe" and "custom_instructions".
	    speaker_configs_state = gr.State({})

	    # --- Create Main UI Components ---
	    (script_input, tts_model_dropdown, pause_input,
	     global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)

	    (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
	     detailed_per_speaker_ui_group_container,
	     load_per_speaker_ui_button) = create_speaker_config_components()

	    (calculate_cost_button, generate_button, cost_output,
	     individual_lines_zip_output, merged_dialogue_mp3_output,
	     status_output) = create_action_and_output_components()

	    # --- Dynamic UI (@gr.render) Definition ---
	    with detailed_per_speaker_ui_group_container:
	        @gr.render(
	            inputs=[script_input, speaker_configs_state, tts_model_dropdown],
	            triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change],
	        )
	        def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
	            """Render one settings accordion per speaker found in the script.

	            Re-runs when the "load/refresh" button is clicked or the TTS model
	            changes. Every control writes its value back into
	            ``speaker_configs_state`` through
	            ``handle_dynamic_accordion_input_change``.

	            Args:
	                current_script_text: Current contents of the script input.
	                current_speaker_configs: Current per-speaker settings; anything
	                    that is not a dict is treated as empty.
	                current_tts_model: Selected TTS model name; decides whether a
	                    speed slider or the vibe/custom-instruction controls are shown.
	            """
	            # Guard the preview slice so a missing script cannot crash the debug line.
	            script_preview = (current_script_text or "")[:30]
	            state_keys = list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'
	            print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{script_preview}...'. State Keys: {state_keys}")

	            unique_speakers = get_speakers_from_script(current_script_text)
	            if not unique_speakers:
	                gr.Markdown("<p style='color: #888; margin-top:10px;'>Enter script & click 'Load/Refresh' for per-speaker settings.</p>")
	                return

	            # Normalise the state once, not once per speaker.
	            speaker_configs = current_speaker_configs if isinstance(current_speaker_configs, dict) else {}

	            def make_state_updater(speaker_name, config_key):
	                """Return the change handler that stores one setting for one speaker."""
	                return partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key=config_key)

	            for speaker_idx, speaker_name in enumerate(unique_speakers):
	                speaker_specific_config = speaker_configs.get(speaker_name, {})
	                accordion_elem_id = f"accordion_spk_{speaker_idx}_{speaker_name.replace(' ','_').lower()}"
	                with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False, elem_id=accordion_elem_id):
	                    gr.Markdown(f"Configure voice for {speaker_name} using {current_tts_model} model.")

	                    # Voice selection is available for every model.
	                    voice_dropdown = gr.Dropdown(
	                        APP_AVAILABLE_VOICES,
	                        value=speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE),
	                        label="Voice",
	                        elem_id=f"voice_dd_spk_{speaker_idx}",
	                    )
	                    voice_dropdown.change(
	                        fn=make_state_updater(speaker_name, "voice"),
	                        inputs=[voice_dropdown, speaker_configs_state],
	                        outputs=[speaker_configs_state],
	                    )

	                    if current_tts_model in ["tts-1", "tts-1-hd"]:
	                        # These models get a per-speaker speed slider.
	                        speed_slider = gr.Slider(
	                            minimum=0.25,
	                            maximum=4.0,
	                            value=float(speaker_specific_config.get("speed", 1.0)),
	                            step=0.05,
	                            label="Speed",
	                            elem_id=f"speed_slider_spk_{speaker_idx}",
	                        )
	                        speed_slider.change(
	                            fn=make_state_updater(speaker_name, "speed"),
	                            inputs=[speed_slider, speaker_configs_state],
	                            outputs=[speaker_configs_state],
	                        )
	                    elif current_tts_model == "gpt-4o-mini-tts":
	                        # This model gets a vibe selector plus a free-text box
	                        # that is only visible while the vibe is "Custom...".
	                        default_vibe = speaker_specific_config.get("vibe", DEFAULT_VIBE)
	                        vibe_dropdown = gr.Dropdown(
	                            VIBE_CHOICES,
	                            value=default_vibe,
	                            label="Vibe/Emotion",
	                            elem_id=f"vibe_dd_spk_{speaker_idx}",
	                        )
	                        custom_instructions_textbox = gr.Textbox(
	                            label="Custom Instructions",
	                            value=speaker_specific_config.get("custom_instructions", ""),
	                            placeholder="e.g., Speak slightly hesitant.",
	                            lines=2,
	                            visible=(default_vibe == "Custom..."),
	                            elem_id=f"custom_instr_tb_spk_{speaker_idx}",
	                        )
	                        vibe_dropdown.change(
	                            fn=make_state_updater(speaker_name, "vibe"),
	                            inputs=[vibe_dropdown, speaker_configs_state],
	                            outputs=[speaker_configs_state],
	                        ).then(
	                            fn=lambda vibe_val: gr.update(visible=(vibe_val == "Custom...")),
	                            inputs=[vibe_dropdown],
	                            outputs=[custom_instructions_textbox],
	                        )
	                        custom_instructions_textbox.change(
	                            fn=make_state_updater(speaker_name, "custom_instructions"),
	                            inputs=[custom_instructions_textbox, speaker_configs_state],
	                            outputs=[speaker_configs_state],
	                        )

	    # --- Event Listeners ---
	    tts_model_dropdown.change(
	        fn=handle_tts_model_change,
	        inputs=[tts_model_dropdown, speaker_configs_state],
	        outputs=[global_speed_input, global_instructions_input, speaker_configs_state],
	    )
	    speaker_config_method_dropdown.change(
	        fn=handle_speaker_config_method_visibility_change,
	        inputs=[speaker_config_method_dropdown],
	        outputs=[single_voice_group, detailed_per_speaker_ui_group_container],
	    )
	    load_per_speaker_ui_button.click(
	        fn=handle_load_refresh_per_speaker_ui_trigger,
	        inputs=[script_input, speaker_configs_state, tts_model_dropdown],
	        outputs=[speaker_configs_state],
	    )
	    calculate_cost_button.click(
	        fn=handle_calculate_cost,
	        inputs=[script_input, tts_model_dropdown],
	        outputs=[cost_output],
	    )

	    # Bind the API key, client and NSFW URL template once; the same callable
	    # serves the Generate button and (when a key exists) the examples.
	    generate_button_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
	    generate_button.click(
	        fn=generate_button_fn,
	        inputs=[
	            script_input, tts_model_dropdown, pause_input,
	            speaker_config_method_dropdown, global_voice_dropdown,
	            speaker_configs_state, global_speed_input, global_instructions_input,
	        ],
	        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
	    )

	    # --- Examples Section Definition ---
	    gr.Markdown("## Example Scripts")  # Keep the header if desired

	    # Components filled in when an example row is clicked, in column order.
	    example_inputs_list_comps = [
	        script_input, tts_model_dropdown, pause_input,
	        speaker_config_method_dropdown, global_voice_dropdown,
	        global_speed_input, global_instructions_input,
	    ]
	    example_outputs_list_comps = [individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
	    # NOTE(review): the Generate button passes eight inputs (including
	    # speaker_configs_state) while the examples supply seven. With
	    # cache_examples=False and run_on_click=False this function is presumably
	    # never invoked from the examples; confirm the argument order before
	    # enabling either option.
	    example_process_fn_actual = generate_button_fn if OPENAI_API_KEY else None

	    # Example rows: script, model, pause, speaker config method, global voice,
	    # global speed, global instructions.
	    examples_data = [
	        [
	            """[Alice] Hello Bob, this is a test using the detailed configuration method.
	[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.
	[Alice] Let's see how this sounds.""",
	            "tts-1-hd",
	            300,
	            "Random per Speaker",
	            DEFAULT_GLOBAL_VOICE,
	            1.0,
	            "",
	        ],
	        [
	            """[Narrator] Once upon a time, there was a gentle breeze over the hills.
	[Narrator] The village below prepared for the annual festival as the sun set.""",
	            "gpt-4o-mini-tts",
	            200,
	            "Detailed Configuration (Per Speaker UI)",
	            DEFAULT_GLOBAL_VOICE,
	            1.0,
	            "Speak with a gentle, storytelling tone.",
	        ],
	        [
	            """[Solo] This is a quick single‑voice demo for testing purposes.""",
	            "tts-1",
	            0,
	            "Single Voice (Global)",
	            "fable",
	            1.2,
	            "",
	        ],
	    ]

	    # Keep only rows whose column count matches the input components; a
	    # mismatched row is reported and skipped.
	    num_inputs_expected = len(example_inputs_list_comps)
	    valid_examples_data_inline = []
	    for ex_data in examples_data:
	        if len(ex_data) == num_inputs_expected:
	            valid_examples_data_inline.append(ex_data)
	        else:
	            print(f"Warning (Inline Examples): Example data mismatch. Expected {num_inputs_expected}, got {len(ex_data)}. Skipping.")

	    if valid_examples_data_inline:
	        # Arguments shared by both variants of the examples widget.
	        examples_kwargs = {
	            "examples": valid_examples_data_inline,
	            "inputs": example_inputs_list_comps,
	            "examples_per_page": 5,
	        }
	        if example_process_fn_actual:
	            # With an API key: also wire outputs and the processing function,
	            # but neither cache results nor run on click.
	            examples_kwargs.update(
	                outputs=example_outputs_list_comps,
	                fn=example_process_fn_actual,
	                cache_examples=False,
	                label="Example Scripts (Click to Load)",
	                run_on_click=False,
	            )
	        else:
	            # Without an API key: examples only populate the inputs.
	            examples_kwargs["label"] = "Example Scripts (Click to Load Inputs)"
	        gr.Examples(**examples_kwargs)
	    else:
	        gr.Markdown("<p style='color: orange;'>No valid examples could be loaded due to configuration mismatch.</p>")


	# --- Launch ---
	# Start the app only when this file is executed directly.
	if __name__ == "__main__":
	    running_on_windows = os.name == 'nt'
	    if running_on_windows:
	        # Switch asyncio to the selector event loop policy on Windows.
	        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
	        asyncio.set_event_loop_policy(selector_policy)
	    # Enable request queuing, then serve without a public share link.
	    queued_app = demo.queue()
	    queued_app.launch(debug=True, share=False)