# FILE: app.py
import gradio as gr
import os
import asyncio
from openai import AsyncOpenAI
from functools import partial 
import datetime 

# create_examples_ui is no longer imported from ui_layout; the examples are defined inline further below.
from ui_layout import (
    create_main_input_components, create_speaker_config_components,
    create_action_and_output_components, # Removed create_examples_ui
    TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES, 
    DEFAULT_GLOBAL_VOICE, VIBE_CHOICES, DEFAULT_VIBE, PREDEFINED_VIBES
)

from event_handlers import (
    handle_script_processing, handle_calculate_cost,
    handle_speaker_config_method_visibility_change, 
    handle_tts_model_change, 
    handle_load_refresh_per_speaker_ui_trigger,
    handle_dynamic_accordion_input_change, 
    get_speakers_from_script
)

# --- Secrets and Client Setup ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
MODEL_DEFAULT_FROM_ENV = os.getenv("MODEL_DEFAULT", MODEL_DEFAULT_ENV)
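# Use the env-provided default only if it is one of the available TTS models; otherwise fall back to MODEL_DEFAULT_ENV.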
EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_ENV if MODEL_DEFAULT_FROM_ENV in TTS_MODELS_AVAILABLE else MODEL_DEFAULT_ENV
async_openai_client = None
if not OPENAI_API_KEY:
    # ... (secret loading logic) ...
    pass 
if OPENAI_API_KEY:
    async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
else:
    print("CRITICAL ERROR: OPENAI_API_KEY secret is not set.")


# --- Main Blocks UI Definition ---
with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
    gr.Markdown("# Dialogue Script to Speech (OpenAI TTS) 💪💪💪 TTS = Teachers Together Strong 💪💪💪")
    if not OPENAI_API_KEY or not async_openai_client:
        gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail.</h3>")

    speaker_configs_state = gr.State({}) 
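    # speaker_configs_state maps each speaker name to its per-speaker settings (voice, speed, vibe, custom_instructions).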

    # --- Create Main UI Components ---
    (script_input, tts_model_dropdown, pause_input,
     global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)

    (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
     detailed_per_speaker_ui_group_container,
     load_per_speaker_ui_button) = create_speaker_config_components()

    (calculate_cost_button, generate_button, cost_output,
     individual_lines_zip_output, merged_dialogue_mp3_output,
     status_output) = create_action_and_output_components()

    # --- Dynamic UI (@gr.render) Definition ---
    with detailed_per_speaker_ui_group_container: 
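        # @gr.render re-runs this block whenever the Load/Refresh button is clicked or the TTS model changes,
        # rebuilding one settings accordion per speaker found in the script.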
        @gr.render(
            inputs=[script_input, speaker_configs_state, tts_model_dropdown],
            triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
        )
        def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
            # Rebuild the per-speaker configuration accordions from the current script, state, and selected model.
            print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs,dict) else 'Not a dict'}")
            unique_speakers = get_speakers_from_script(current_script_text)
            if not unique_speakers:
                gr.Markdown("<p style='color: #888; margin-top:10px;'>Enter script & click 'Load/Refresh' for per-speaker settings.</p>")
                return
            for speaker_idx, speaker_name in enumerate(unique_speakers):
                if not isinstance(current_speaker_configs, dict): current_speaker_configs = {} 
                speaker_specific_config = current_speaker_configs.get(speaker_name, {})
                accordion_elem_id = f"accordion_spk_{speaker_idx}_{speaker_name.replace(' ','_').lower()}"
                with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False, elem_id=accordion_elem_id):
                    gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")
                    default_voice = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
                    voice_dd_elem_id = f"voice_dd_spk_{speaker_idx}"
                    voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, value=default_voice, label="Voice", elem_id=voice_dd_elem_id)
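                    # functools.partial pins speaker_name/config_key so a single handler can update the shared state for any control.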
                    voice_dropdown.change(fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="voice"), inputs=[voice_dropdown, speaker_configs_state], outputs=[speaker_configs_state])
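                    # Model-specific controls: tts-1 / tts-1-hd expose a speed slider; gpt-4o-mini-tts exposes vibe + custom instructions.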
                    if current_tts_model in ["tts-1", "tts-1-hd"]:
                        default_speed = float(speaker_specific_config.get("speed", 1.0))
                        speed_slider_elem_id = f"speed_slider_spk_{speaker_idx}"
                        speed_slider = gr.Slider(minimum=0.25, maximum=4.0, value=default_speed, step=0.05, label="Speed", elem_id=speed_slider_elem_id)
                        speed_slider.change(fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="speed"), inputs=[speed_slider, speaker_configs_state], outputs=[speaker_configs_state])
                    elif current_tts_model == "gpt-4o-mini-tts":
                        default_vibe = speaker_specific_config.get("vibe", DEFAULT_VIBE)
                        vibe_dd_elem_id = f"vibe_dd_spk_{speaker_idx}"
                        vibe_dropdown = gr.Dropdown(VIBE_CHOICES, value=default_vibe, label="Vibe/Emotion", elem_id=vibe_dd_elem_id)
                        default_custom_instructions = speaker_specific_config.get("custom_instructions", "")
                        custom_instr_tb_elem_id = f"custom_instr_tb_spk_{speaker_idx}"
                        custom_instructions_textbox = gr.Textbox(label="Custom Instructions", value=default_custom_instructions, placeholder="e.g., Speak slightly hesitant.", lines=2, visible=(default_vibe == "Custom..."), elem_id=custom_instr_tb_elem_id)
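                        # The chained .then() keeps the custom-instructions box visible only while "Custom..." is selected.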
                        vibe_dropdown.change(fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="vibe"), inputs=[vibe_dropdown, speaker_configs_state], outputs=[speaker_configs_state]).then(fn=lambda vibe_val: gr.update(visible=(vibe_val == "Custom...")), inputs=[vibe_dropdown], outputs=[custom_instructions_textbox])
                        custom_instructions_textbox.change(fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="custom_instructions"), inputs=[custom_instructions_textbox, speaker_configs_state], outputs=[speaker_configs_state])


    # --- Event Listeners ---
    tts_model_dropdown.change(fn=handle_tts_model_change, inputs=[tts_model_dropdown, speaker_configs_state], outputs=[global_speed_input, global_instructions_input, speaker_configs_state])
    speaker_config_method_dropdown.change(fn=handle_speaker_config_method_visibility_change, inputs=[speaker_config_method_dropdown], outputs=[single_voice_group, detailed_per_speaker_ui_group_container])
    load_per_speaker_ui_button.click(fn=handle_load_refresh_per_speaker_ui_trigger, inputs=[script_input, speaker_configs_state, tts_model_dropdown], outputs=[speaker_configs_state])
    calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
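    # Pre-bind the API key, async client, and NSFW endpoint template so the click handler only receives values from UI components.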
    generate_button_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
    generate_button.click(fn=generate_button_fn, inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown, speaker_configs_state, global_speed_input, global_instructions_input], outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])

    # --- Examples Section ---
    gr.Markdown("## Example Scripts")

    # Define the lists needed for Examples right here
    example_inputs_list_comps = [
        script_input, tts_model_dropdown, pause_input,
        speaker_config_method_dropdown, global_voice_dropdown,
        global_speed_input, global_instructions_input
    ]
    example_outputs_list_comps = [individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
    example_process_fn_actual = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE) if OPENAI_API_KEY else None
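    # Used below both as the gr.Examples fn and to decide whether outputs are wired at all when no API key is available.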

    # Define the example data directly
    examples_data = [
        [
            """[Alice] Hello Bob, this is a test using the detailed configuration method.
[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.
[Alice] Let's see how this sounds.""",
            "tts-1-hd",
            300,
            "Random per Speaker",
            DEFAULT_GLOBAL_VOICE,
            1.0,
            ""
        ],
        [
            """[Narrator] Once upon a time, there was a gentle breeze over the hills.
[Narrator] The village below prepared for the annual festival as the sun set.""",
            "gpt-4o-mini-tts",
            200,
            "Detailed Configuration (Per Speaker UI)",
            DEFAULT_GLOBAL_VOICE,
            1.0,
            "Speak with a gentle, storytelling tone."
        ],
        [
            """[Solo] This is a quick single‑voice demo for testing purposes.""",
            "tts-1",
            0,
            "Single Voice (Global)",
            "fable",
            1.2,
            ""
        ],
    ]

    
    # Validate example data length against input components length
    num_inputs_expected = len(example_inputs_list_comps)
    valid_examples_data_inline = []
    for ex_data in examples_data:
        if len(ex_data) == num_inputs_expected:
            valid_examples_data_inline.append(ex_data)
        else:
             print(f"Warning (Inline Examples): Example data mismatch. Expected {num_inputs_expected}, got {len(ex_data)}. Skipping.")

    # Directly instantiate gr.Examples if valid data exists
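    # With cache_examples=False and run_on_click=False, clicking an example only loads the inputs; fn is not invoked.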
    if valid_examples_data_inline:
        if example_process_fn_actual:
            gr.Examples(
                examples=valid_examples_data_inline,
                inputs=example_inputs_list_comps,
                outputs=example_outputs_list_comps,
                fn=example_process_fn_actual,
                cache_examples=False,
                examples_per_page=5,
                label="Example Scripts (Click to Load)", # Label is optional if header exists
                run_on_click=False
            )
        else:
            gr.Examples(
                examples=valid_examples_data_inline,
                inputs=example_inputs_list_comps,
                examples_per_page=5,
                label="Example Scripts (Click to Load Inputs)", # Label is optional if header exists
            )
    else:
         gr.Markdown("<p style='color: orange;'>No valid examples could be loaded due to configuration mismatch.</p>")


# --- Launch ---
if __name__ == "__main__":
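    # On Windows, the default Proactor event loop can be incompatible with some async client libraries; use the selector loop instead.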
    if os.name == 'nt':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    demo.queue().launch(debug=True, share=False)