abocha committed
Commit 8468afb · 1 Parent(s): d48101f

refactor + ui fix

Files changed (3)
  1. app.py +104 -388
  2. event_handlers.py +332 -0
  3. ui_layout.py +144 -0
app.py CHANGED
@@ -1,439 +1,155 @@
1
  import gradio as gr
2
  import os
3
  import asyncio
4
- import tempfile
5
- import shutil
6
- import zipfile
7
- import random
8
- import json # Keep for other potential uses, though not primary for this config
9
- import pandas as pd # Keep for now, in case other features might use it
10
  from openai import AsyncOpenAI
11
- from functools import partial # For dynamic event handlers
12
-
13
- from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
14
- from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
15
- from utils.merge_audio import merge_mp3_files
16
-
17
- # --- Configuration ---
18
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
19
  NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
20
- MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
21
22
  if not OPENAI_API_KEY:
23
  try:
 
24
  from huggingface_hub import HfApi
25
  api = HfApi()
26
- space_id = os.getenv("SPACE_ID")
27
  if space_id:
28
  secrets = api.get_space_secrets(repo_id=space_id)
29
  OPENAI_API_KEY = secrets.get("OPENAI_API_KEY")
30
  NSFW_API_URL_TEMPLATE = secrets.get("NSFW_API_URL_TEMPLATE", NSFW_API_URL_TEMPLATE)
31
- MODEL_DEFAULT = secrets.get("MODEL_DEFAULT", MODEL_DEFAULT)
 
 
32
  except Exception as e:
33
- print(f"Could not retrieve secrets from Hugging Face Hub: {e}")
34
 
35
- async_openai_client = None
36
  if OPENAI_API_KEY:
37
  async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
38
  else:
39
- print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
40
-
41
- TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
42
- if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE:
43
- MODEL_DEFAULT = "tts-1-hd"
44
-
45
- SPEAKER_CONFIG_METHODS = [
46
- "Single Voice (Global)",
47
- "Random per Speaker",
48
- "A/B Round Robin",
49
- "Detailed Configuration (Per Speaker UI)"
50
- ]
51
- DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
52
- APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy() # Uses the extended list from openai_tts.py
53
-
54
- PREDEFINED_VIBES = {
55
- "None": "",
56
- "Calm": "Speak in a calm, composed, and relaxed manner.",
57
- "Excited": "Speak with an energetic, enthusiastic, and lively tone.",
58
- "Happy": "Speak with a cheerful, bright, and joyful voice.",
59
- "Sad": "Speak with a sorrowful, melancholic, and dejected tone.",
60
- "Whisper": "Speak softly, as if whispering.",
61
- "Angry": "Speak with a strong, firm, and possibly agitated voice.",
62
- "Fearful": "Speak with a trembling, hesitant, and scared voice.",
63
- "Formal": "Speak in a clear, precise, and professional tone, suitable for a formal address.",
64
- "Authoritative": "Speak with a commanding, confident, and firm voice.",
65
- "Friendly": "Speak in a warm, approachable, and amiable manner.",
66
- "Custom...": "CUSTOM"
67
- }
68
- VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
69
- DEFAULT_VIBE = "None"
70
-
71
- def get_speakers_from_script(script_text):
72
- if not script_text.strip(): return []
73
- try:
74
- parsed_lines, _ = parse_dialogue_script(script_text)
75
- # Return unique speakers in order of appearance (though order doesn't strictly matter for this use)
76
- seen_speakers = set()
77
- ordered_unique_speakers = []
78
- for p in parsed_lines:
79
- if p["speaker"] not in seen_speakers:
80
- ordered_unique_speakers.append(p["speaker"])
81
- seen_speakers.add(p["speaker"])
82
- return ordered_unique_speakers
83
- except ValueError: return []
84
-
85
-
86
- def handle_dynamic_input_change(new_value, current_configs_state_dict, speaker_name, config_key, tts_model):
87
- if speaker_name not in current_configs_state_dict:
88
- current_configs_state_dict[speaker_name] = {}
89
-
90
- current_configs_state_dict[speaker_name][config_key] = new_value
91
- return current_configs_state_dict
92
-
93
-
94
- def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_model):
95
- unique_speakers = get_speakers_from_script(script_text)
96
- new_ui_components = []
97
-
98
- if current_configs_state_dict is None:
99
- current_configs_state_dict = {}
100
-
101
- for speaker_name in unique_speakers:
102
- if speaker_name not in current_configs_state_dict:
103
- current_configs_state_dict[speaker_name] = {
104
- "voice": APP_AVAILABLE_VOICES[0],
105
- "speed": 1.0,
106
- "vibe": DEFAULT_VIBE,
107
- "custom_instructions": ""
108
- }
109
- current_configs_state_dict[speaker_name].setdefault("voice", APP_AVAILABLE_VOICES[0])
110
- current_configs_state_dict[speaker_name].setdefault("speed", 1.0)
111
- current_configs_state_dict[speaker_name].setdefault("vibe", DEFAULT_VIBE)
112
- current_configs_state_dict[speaker_name].setdefault("custom_instructions", "")
113
-
114
-
115
- if not unique_speakers:
116
- new_ui_components.append(gr.Markdown("No speakers detected in the script, or script is empty. Type a script and click 'Load/Refresh' again."))
117
- return new_ui_components, current_configs_state_dict
118
-
119
-
120
- for speaker_name in unique_speakers:
121
- speaker_cfg = current_configs_state_dict[speaker_name]
122
-
123
- speed_interactive = tts_model in ["tts-1", "tts-1-hd"]
124
- instructions_relevant = tts_model == "gpt-4o-mini-tts"
125
-
126
- with gr.Accordion(label=f"Settings for: {speaker_name}", open=False) as speaker_accordion:
127
- voice_dd = gr.Dropdown(
128
- label="Voice", choices=APP_AVAILABLE_VOICES, value=speaker_cfg["voice"], interactive=True
129
- )
130
- voice_dd.change(
131
- fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="voice", tts_model=tts_model),
132
- inputs=[voice_dd, speaker_configs_state],
133
- outputs=[speaker_configs_state]
134
- )
135
-
136
- speed_slider_label = "Speech Speed" + (" (Active for tts-1/hd)" if speed_interactive else " (N/A for this model)")
137
- speed_slider = gr.Slider(
138
- label=speed_slider_label, minimum=0.25, maximum=4.0, value=speaker_cfg["speed"],
139
- step=0.05, interactive=speed_interactive
140
- )
141
- if speed_interactive:
142
- speed_slider.release(
143
- fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="speed", tts_model=tts_model),
144
- inputs=[speed_slider, speaker_configs_state],
145
- outputs=[speaker_configs_state]
146
- )
147
-
148
- vibe_label = "Vibe/Emotion Preset" + (" (For gpt-4o-mini-tts)" if instructions_relevant else " (Less impact on other models)")
149
- vibe_dd = gr.Dropdown(
150
- label=vibe_label, choices=VIBE_CHOICES, value=speaker_cfg["vibe"], interactive=True
151
- )
152
- vibe_dd.change(
153
- fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="vibe", tts_model=tts_model),
154
- inputs=[vibe_dd, speaker_configs_state],
155
- outputs=[speaker_configs_state]
156
- )
157
-
158
- custom_instr_label = "Custom Instructions"
159
- custom_instr_placeholder = "Only used if Vibe is 'Custom...'. Overrides Vibe."
160
- custom_instr_tb = gr.Textbox(
161
- label=custom_instr_label,
162
- value=speaker_cfg["custom_instructions"],
163
- placeholder=custom_instr_placeholder,
164
- lines=2, interactive=True
165
- )
166
- custom_instr_tb.input(
167
- fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="custom_instructions", tts_model=tts_model),
168
- inputs=[custom_instr_tb, speaker_configs_state],
169
- outputs=[speaker_configs_state]
170
- )
171
- new_ui_components.append(speaker_accordion)
172
-
173
- return new_ui_components, current_configs_state_dict
174
-
175
-
176
- async def handle_script_processing(
177
- dialogue_script: str, tts_model: str, pause_ms: int,
178
- speaker_config_method: str, global_voice_selection: str,
179
- speaker_configs_state_dict: dict,
180
- global_speed: float,
181
- global_instructions: str, progress=gr.Progress(track_tqdm=True)):
182
-
183
- if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
184
- if not dialogue_script.strip(): return None, None, "Error: Script empty."
185
-
186
- # Create a job-specific temporary directory and ensure it's clean
187
- job_audio_path_prefix = os.path.join(tempfile.gettempdir(), f"dialogue_tts_job_{random.randint(10000, 99999)}")
188
- if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
189
- os.makedirs(job_audio_path_prefix, exist_ok=True)
190
-
191
- try:
192
- parsed_lines, _ = parse_dialogue_script(dialogue_script)
193
- if not parsed_lines:
194
- shutil.rmtree(job_audio_path_prefix)
195
- return None, None, "Error: No valid lines found in script."
196
- except ValueError as e:
197
- shutil.rmtree(job_audio_path_prefix)
198
- return None, None, f"Script parsing error: {str(e)}"
199
-
200
- if speaker_configs_state_dict is None: speaker_configs_state_dict = {}
201
-
202
- # --- Voice assignment map for Random and A/B per Speaker ---
203
- speaker_voice_map = {}
204
- if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
205
- unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
206
- if speaker_config_method == "Random per Speaker":
207
- for spk_name in unique_script_speakers_for_map:
208
- speaker_voice_map[spk_name] = random.choice(APP_AVAILABLE_VOICES)
209
- elif speaker_config_method == "A/B Round Robin":
210
- for i, spk_name in enumerate(unique_script_speakers_for_map):
211
- # Ensure APP_AVAILABLE_VOICES is not empty to prevent modulo by zero
212
- if APP_AVAILABLE_VOICES:
213
- speaker_voice_map[spk_name] = APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]
214
- else: # Fallback if voice list is somehow empty
215
- speaker_voice_map[spk_name] = "alloy" # Default OpenAI voice
216
- # --- End voice assignment map ---
217
-
218
- tasks, line_audio_files = [], [None] * len(parsed_lines)
219
- for i, line_data in enumerate(parsed_lines):
220
- speaker_name = line_data["speaker"]
221
-
222
- line_voice = global_voice_selection # Default for "Single Voice (Global)" or fallback
223
- line_speed = global_speed
224
- line_instructions = global_instructions if global_instructions and global_instructions.strip() else None
225
-
226
- if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
227
- spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
228
- line_voice = spk_cfg.get("voice", global_voice_selection)
229
- if tts_model in ["tts-1", "tts-1-hd"]:
230
- line_speed = spk_cfg.get("speed", global_speed)
231
- if tts_model == "gpt-4o-mini-tts":
232
- vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
233
- custom_instr = spk_cfg.get("custom_instructions", "").strip()
234
- if vibe == "Custom..." and custom_instr: line_instructions = custom_instr
235
- elif vibe != "None" and vibe != "Custom...": line_instructions = PREDEFINED_VIBES.get(vibe, "")
236
- if not line_instructions and global_instructions and global_instructions.strip(): line_instructions = global_instructions
237
- elif not line_instructions : line_instructions = None
238
- elif speaker_config_method == "Random per Speaker" or speaker_config_method == "A/B Round Robin":
239
- line_voice = speaker_voice_map.get(speaker_name, global_voice_selection) # Use mapped voice
240
-
241
- if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
242
-
243
- out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
244
- progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
245
- tasks.append(synthesize_speech_line(
246
- client=async_openai_client, text=line_data["text"], voice=line_voice,
247
- output_path=out_fn, model=tts_model, speed=line_speed,
248
- instructions=line_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
249
- line_index=line_data['id']))
250
-
251
- results = await asyncio.gather(*tasks, return_exceptions=True)
252
- for idx, res in enumerate(results):
253
- if isinstance(res, Exception): print(f"Error synthesizing line {parsed_lines[idx]['id']}: {res}")
254
- elif res is None: print(f"Skipped or failed synthesizing line {parsed_lines[idx]['id']}")
255
- else: line_audio_files[parsed_lines[idx]['id']] = res # Store by original line ID if non-sequential
256
-
257
- # Filter for valid, existing files, using the original parsed_lines order for merge
258
- files_for_merge = []
259
- for p_line in parsed_lines:
260
- file_path = line_audio_files[p_line['id']]
261
- if file_path and os.path.exists(file_path) and os.path.getsize(file_path) > 0:
262
- files_for_merge.append(file_path)
263
- else:
264
- files_for_merge.append(None) # Keep placeholder for correct ordering if a line failed
265
-
266
- valid_files_for_zip = [f for f in files_for_merge if f]
267
-
268
- if not valid_files_for_zip:
269
- shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio was successfully synthesized."
270
-
271
- zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
272
- with zipfile.ZipFile(zip_fn, 'w') as zf:
273
- for f_path in valid_files_for_zip:
274
- zf.write(f_path, os.path.basename(f_path))
275
-
276
- merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
277
- # Pass only existing files to merge_mp3_files, maintaining order
278
- ordered_files_to_merge = [f for f in files_for_merge if f]
279
- merged_path = merge_mp3_files(ordered_files_to_merge, merged_fn, pause_ms)
280
-
281
-
282
- status = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
283
- if len(valid_files_for_zip) < len(parsed_lines): status += "Some lines may have failed. "
284
- if not merged_path and len(valid_files_for_zip) > 0: status += "Merging audio failed. "
285
- elif not merged_path: status = "No audio to merge." # Overrides previous status if all failed before merge
286
- else: status += "Merged audio generated."
287
-
288
- # Note: job_audio_path_prefix (temp dir) is not explicitly deleted here.
289
- # Gradio File/Audio components copy the file, so the temp dir can be cleaned
290
- # by the OS or a cleanup routine if this Space were long-running.
291
- # For HF Spaces, /tmp is ephemeral anyway. For robustness, could add shutil.rmtree(job_audio_path_prefix)
292
- # after files are served, but need to ensure Gradio has finished with them.
293
- # For now, rely on new unique dir per run and ephemeral /tmp.
294
-
295
- return (zip_fn if os.path.exists(zip_fn) else None,
296
- merged_path if merged_path and os.path.exists(merged_path) else None,
297
- status)
298
-
299
-
300
- def handle_calculate_cost(dialogue_script: str, tts_model: str):
301
- if not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
302
- try:
303
- parsed, chars = parse_dialogue_script(dialogue_script)
304
- if not parsed: return "Cost: $0.00 (No valid lines in script)"
305
- cost = calculate_cost(chars, len(parsed), tts_model)
306
- # Using .6f for precision, especially for char-based cost
307
- return f"Estimated Cost for {len(parsed)} lines ({chars} chars): ${cost:.6f}"
308
- except ValueError as e: # Catch script length error from parser
309
- return f"Cost calculation error: {str(e)}"
310
- except Exception as e:
311
- return f"An unexpected error occurred during cost calculation: {str(e)}"
312
 
313
 
 
314
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
315
- gr.Markdown("# Dialogue Script to Speech (OpenAI TTS)")
316
  if not OPENAI_API_KEY or not async_openai_client:
317
- gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY secret is not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
318
 
319
- speaker_configs_state = gr.State({})
 
320
 
321
- with gr.Row():
322
- with gr.Column(scale=2):
323
- script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\n[Speaker2] How are you today?", lines=10)
324
- with gr.Column(scale=1):
325
- tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT)
326
- pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50)
327
- global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (for tts-1/hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
328
- global_instructions_input = gr.Textbox(label="Global Instructions (for gpt-4o-mini-tts)", placeholder="e.g., Speak with a calm tone.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
329
-
330
- gr.Markdown("### Speaker Voice & Style Configuration")
331
- speaker_config_method_dropdown = gr.Dropdown(
332
- SPEAKER_CONFIG_METHODS, label="Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
333
- )
334
-
335
- with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
336
- global_voice_dropdown = gr.Dropdown(
337
- APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy", interactive=True
338
- )
339
-
340
- with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)")) as detailed_per_speaker_ui_group:
341
- load_per_speaker_ui_button = gr.Button("Load/Refresh Per-Speaker Settings UI (from Script Above)")
342
- gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are applied per-speaker. If script changes, click again to refresh.</small>")
343
- dynamic_speaker_ui_area = gr.Column(elem_id="dynamic_ui_area_for_speakers")
344
 
 
 
 
345
 
346
- with gr.Row():
347
- calculate_cost_button = gr.Button("Calculate Estimated Cost")
348
- generate_button = gr.Button("Generate Audio", variant="primary")
349
-
350
- cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
351
- with gr.Row():
352
- individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)")
353
- merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath")
354
- status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5)
355
 
356
- def update_model_controls_visibility(selected_model, script_text_for_refresh, current_speaker_configs_for_refresh):
357
- new_dynamic_ui_components, updated_state = load_refresh_per_speaker_ui(script_text_for_refresh, current_speaker_configs_for_refresh, selected_model)
358
-
359
- is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
360
- is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
361
-
362
- # It's crucial that dynamic_speaker_ui_area receives the *list* of components.
363
- # If it's wrapped in a gr.update, it might not render correctly unless gr.update(children=...)
364
- # Direct assignment seems to be what Gradio expects when outputting to a Column/Row that acts as a container.
365
- return {
366
- global_speed_input: gr.update(visible=is_tts1_family, interactive=is_tts1_family),
367
- global_instructions_input: gr.update(visible=is_gpt_mini_tts, interactive=is_gpt_mini_tts),
368
- dynamic_speaker_ui_area: new_dynamic_ui_components,
369
- speaker_configs_state: updated_state
370
- }
371
 
 
372
  tts_model_dropdown.change(
373
- fn=update_model_controls_visibility,
374
- inputs=[tts_model_dropdown, script_input, speaker_configs_state],
 
 
375
  outputs=[global_speed_input, global_instructions_input, dynamic_speaker_ui_area, speaker_configs_state]
376
  )
377
 
378
- def update_speaker_config_method_visibility(method):
379
- is_single = (method == "Single Voice (Global)")
380
- is_detailed_per_speaker = (method == "Detailed Configuration (Per Speaker UI)")
381
- return {
382
- single_voice_group: gr.update(visible=is_single),
383
- detailed_per_speaker_ui_group: gr.update(visible=is_detailed_per_speaker),
384
- }
385
  speaker_config_method_dropdown.change(
386
- fn=update_speaker_config_method_visibility,
387
- inputs=[speaker_config_method_dropdown],
 
388
  outputs=[single_voice_group, detailed_per_speaker_ui_group]
389
  )
390
 
 
391
  load_per_speaker_ui_button.click(
392
  fn=load_refresh_per_speaker_ui,
393
- inputs=[script_input, speaker_configs_state, tts_model_dropdown],
394
- outputs=[dynamic_speaker_ui_area, speaker_configs_state]
395
  )
396
 
397
- calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
399
  generate_button.click(
400
- fn=handle_script_processing,
401
  inputs=[
402
- script_input, tts_model_dropdown, pause_input,
403
- speaker_config_method_dropdown, global_voice_dropdown,
404
- speaker_configs_state,
405
  global_speed_input, global_instructions_input
406
  ],
407
- outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
408
-
409
- gr.Markdown("## Example Scripts")
410
- example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\n[Alice] Let's see how this sounds."
411
- example_script_2 = "[Narrator] This is a short story.\n[CharacterA] Once upon a time...\n[Narrator] ...there was a Gradio app.\n[CharacterB] And it could talk!"
412
-
413
- gr.Examples(
414
- examples=[
415
- [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy", {}, 1.0, ""],
416
- [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy", {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
417
- ["[Solo] Just one line, using global voice and speed.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
418
- ],
419
- # speaker_configs_state is passed as an empty dict {} for examples.
420
- # For "Detailed Configuration", the user should click "Load/Refresh Per-Speaker UI" after an example loads to populate the UI.
421
- inputs=[
422
- script_input, tts_model_dropdown, pause_input,
423
- speaker_config_method_dropdown, global_voice_dropdown,
424
- speaker_configs_state,
425
- global_speed_input, global_instructions_input
426
- ],
427
- # Outputs for examples are not strictly necessary to pre-compute if cache_examples=False
428
- # but defining them can help Gradio understand the flow.
429
- # We can make the example click run the full processing.
430
- outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
431
- fn=handle_script_processing,
432
- cache_examples=False # Set to True if pre-computation is desired and feasible
433
  )
434
435
  if __name__ == "__main__":
436
- # Required for Windows if using asyncio with ProactorEventLoop which can be default
437
- if os.name == 'nt':
438
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
439
- demo.launch(debug=True) # Debug=True for development, remove for production/HF Space
 
1
  import gradio as gr
2
  import os
3
  import asyncio
 
 
 
 
 
 
4
  from openai import AsyncOpenAI
5
+ from functools import partial # For handle_script_processing
6
+
7
+ # Import UI creation functions and constants
8
+ from ui_layout import (
9
+ create_main_input_components, create_speaker_config_components,
10
+ create_action_and_output_components, create_examples_ui,
11
+ TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV
12
+ )
13
+
14
+ # Import event handler functions
15
+ from event_handlers import (
16
+ handle_script_processing, handle_calculate_cost,
17
+ update_model_controls_visibility, update_speaker_config_method_visibility,
18
+ load_refresh_per_speaker_ui
19
+ )
20
+
21
+ # --- Application Secrets and Global Client ---
22
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
23
  NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
24
+ MODEL_DEFAULT_FROM_ENV = os.getenv("MODEL_DEFAULT", MODEL_DEFAULT_ENV)
25
 
26
+ # Validate MODEL_DEFAULT_FROM_ENV or use hardcoded default
27
+ EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_ENV if MODEL_DEFAULT_FROM_ENV in TTS_MODELS_AVAILABLE else MODEL_DEFAULT_ENV
28
+
29
+ async_openai_client = None
30
  if not OPENAI_API_KEY:
31
  try:
32
+ # Attempt to load from Hugging Face Hub secrets if not in env
33
  from huggingface_hub import HfApi
34
  api = HfApi()
35
+ space_id = os.getenv("SPACE_ID") # Provided by HF Spaces
36
  if space_id:
37
  secrets = api.get_space_secrets(repo_id=space_id)
38
  OPENAI_API_KEY = secrets.get("OPENAI_API_KEY")
39
  NSFW_API_URL_TEMPLATE = secrets.get("NSFW_API_URL_TEMPLATE", NSFW_API_URL_TEMPLATE)
40
+ MODEL_DEFAULT_FROM_HUB = secrets.get("MODEL_DEFAULT", EFFECTIVE_MODEL_DEFAULT)
41
+ EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_HUB if MODEL_DEFAULT_FROM_HUB in TTS_MODELS_AVAILABLE else EFFECTIVE_MODEL_DEFAULT
42
+ print("Loaded secrets from Hugging Face Hub.")
43
  except Exception as e:
44
+ print(f"Could not retrieve secrets from Hugging Face Hub: {e}. OPENAI_API_KEY might be missing.")
45
 
 
46
  if OPENAI_API_KEY:
47
  async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
48
  else:
49
+ print("CRITICAL ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
50
 
51
 
52
+ # --- Gradio Application UI and Logic ---
53
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
54
+ gr.Markdown("# Dialogue Script to Speech (OpenAI TTS) - Refactored")
55
  if not OPENAI_API_KEY or not async_openai_client:
56
+ gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
57
 
58
+ # Central state for detailed speaker configurations
59
+ speaker_configs_state = gr.State({}) # This is crucial for dynamic UI
60
 
61
+ # --- Define UI Components by calling layout functions ---
62
+ (script_input, tts_model_dropdown, pause_input,
63
+ global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
64
 
65
+ (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
66
+ detailed_per_speaker_ui_group, load_per_speaker_ui_button,
67
+ dynamic_speaker_ui_area) = create_speaker_config_components()
68
 
69
+ (calculate_cost_button, generate_button, cost_output,
70
+ individual_lines_zip_output, merged_dialogue_mp3_output,
71
+ status_output) = create_action_and_output_components()
72
 
73
+ # --- Event Wiring ---
74
 
75
+ # When TTS model changes, update visibility of global speed/instructions & refresh dynamic UI
76
  tts_model_dropdown.change(
77
+ fn=update_model_controls_visibility,
78
+ inputs=[tts_model_dropdown, script_input, speaker_configs_state, speaker_configs_state], # Pass state component itself
79
+ # The handler returns its updates positionally, in the same order as the
80
+ # components listed in `outputs` below.
81
  outputs=[global_speed_input, global_instructions_input, dynamic_speaker_ui_area, speaker_configs_state]
82
  )
83
 
84
+ # When speaker config method changes, update visibility of relevant UI groups
85
  speaker_config_method_dropdown.change(
86
+ fn=update_speaker_config_method_visibility,
87
+ inputs=[speaker_config_method_dropdown],
88
+ # The handler returns updates positionally, matching the order of `outputs`.
89
  outputs=[single_voice_group, detailed_per_speaker_ui_group]
90
  )
91
 
92
+ # Button to load/refresh the detailed per-speaker UI configurations
93
  load_per_speaker_ui_button.click(
94
  fn=load_refresh_per_speaker_ui,
95
+ inputs=[script_input, speaker_configs_state, tts_model_dropdown, speaker_configs_state], # Pass state comp
96
+ outputs=[dynamic_speaker_ui_area, speaker_configs_state]
97
  )
98
 
99
+ # Calculate cost button
100
+ calculate_cost_button.click(
101
+ fn=handle_calculate_cost,
102
+ inputs=[script_input, tts_model_dropdown],
103
+ outputs=[cost_output]
104
+ )
105
 
106
+ # Generate audio button
107
+ # Use functools.partial to pass fixed arguments like API key and client to the handler
108
+ # Gradio inputs will be appended to these fixed arguments when the handler is called.
109
  generate_button.click(
110
+ fn=partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE),
111
  inputs=[
112
+ script_input, tts_model_dropdown, pause_input,
113
+ speaker_config_method_dropdown, global_voice_dropdown,
114
+ speaker_configs_state, # The gr.State object itself
115
  global_speed_input, global_instructions_input
116
  ],
117
+ outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
118
  )
119
 
120
+ # --- Examples UI ---
121
+ # Define inputs for examples (must match the structure in ui_layout.create_examples_ui)
122
+ # This list should contain the Gradio component instances
123
+ example_inputs_list = [
124
+ script_input, tts_model_dropdown, pause_input,
125
+ speaker_config_method_dropdown, global_voice_dropdown,
126
+ speaker_configs_state, # This is a gr.State component, gr.Examples handles it
127
+ global_speed_input, global_instructions_input
128
+ ]
129
+
130
+ # For examples to run the processing function, they need output components too.
131
+ example_outputs_list = [individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
132
+
133
+ # Create examples UI - for now, examples only pre-fill inputs (process_fn=None)
134
+ # We need a wrapper for the process_fn if it uses partial for fixed args like the main button.
135
+ # Or, examples can just load inputs, and user clicks "Generate".
136
+ # For now, let's make examples only load inputs to simplify.
137
+ # To make examples runnable:
138
+ # example_process_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
139
+ # Then use example_process_fn in create_examples_ui if it's set up to take `fn`.
140
+ # The current create_examples_ui doesn't use `fn` for simplicity of fixing display.
141
+
142
+ _ = create_examples_ui(inputs_for_examples=example_inputs_list, process_fn=None)
143
+ # If you want examples to be clickable to run the generation directly:
144
+ # 1. Modify `create_examples_ui` in `ui_layout.py` to accept and use `fn` and `outputs_for_examples`.
145
+ # Its gr.Examples call would be:
146
+ # `gr.Examples(..., fn=process_fn, outputs=outputs_for_examples, cache_examples=False)`
147
+ # 2. In this file, define `example_process_fn` and pass it:
148
+ # `example_process_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)`
149
+ # `_ = create_examples_ui(inputs_for_examples=example_inputs_list, outputs_for_examples=example_outputs_list, process_fn=example_process_fn)`
150
+
151
+ # --- Launch ---
152
  if __name__ == "__main__":
153
+ if os.name == 'nt':
 
154
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
155
+ demo.queue().launch(debug=True, share=False) # .queue() is good for async operations
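For reference, the runnable-examples wiring described in the comments above could look like the sketch below. This is not part of this commit: it assumes create_examples_ui in ui_layout.py is extended with outputs_for_examples and process_fn parameters that are forwarded to gr.Examples.

# In app.py (sketch, reusing example_inputs_list / example_outputs_list defined above):
example_process_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
_ = create_examples_ui(
    inputs_for_examples=example_inputs_list,
    outputs_for_examples=example_outputs_list,
    process_fn=example_process_fn,
)

# In ui_layout.py, the extended function would forward the new arguments (sketch):
def create_examples_ui(inputs_for_examples, outputs_for_examples=None, process_fn=None):
    # ... build valid_examples_data exactly as in the version further below ...
    return gr.Examples(
        examples=valid_examples_data,
        inputs=inputs_for_examples,
        outputs=outputs_for_examples,
        fn=process_fn,
        cache_examples=False,
        label="Example Scripts (Click to Run)",
    )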
event_handlers.py ADDED
@@ -0,0 +1,332 @@
1
+ import gradio as gr
2
+ import os
3
+ import asyncio
4
+ import tempfile
5
+ import shutil
6
+ import zipfile
7
+ import random
8
+ from functools import partial
9
+
10
+ from utils.script_parser import parse_dialogue_script, calculate_cost
11
+ from utils.openai_tts import synthesize_speech_line
12
+ from utils.merge_audio import merge_mp3_files
13
+
14
+ # Import constants from ui_layout (keeping them there avoids a circular import with app.py)
15
+ from ui_layout import APP_AVAILABLE_VOICES, DEFAULT_VIBE, VIBE_CHOICES, PREDEFINED_VIBES
16
+
17
+ # Ensure a default voice if APP_AVAILABLE_VOICES is empty (shouldn't happen with new ui_layout)
18
+ DEFAULT_FALLBACK_VOICE = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"
19
+
20
+
21
+ def get_speakers_from_script(script_text: str):
22
+ if not script_text.strip():
23
+ return []
24
+ try:
25
+ parsed_lines, _ = parse_dialogue_script(script_text)
26
+ seen_speakers = set()
27
+ ordered_unique_speakers = []
28
+ for p in parsed_lines:
29
+ if p["speaker"] not in seen_speakers:
30
+ ordered_unique_speakers.append(p["speaker"])
31
+ seen_speakers.add(p["speaker"])
32
+ return ordered_unique_speakers
33
+ except ValueError:
34
+ return []
35
+
36
+
37
+ def handle_dynamic_input_change(new_value, current_configs_state_dict: dict, speaker_name: str, config_key: str, tts_model: str):
38
+ """Handles changes from dynamically generated UI elements for per-speaker settings."""
39
+ # print(f"Dynamic change for {speaker_name}, key {config_key}: {new_value}. State: {current_configs_state_dict}")
40
+ if current_configs_state_dict is None: # Should ideally be initialized by Gradio's gr.State
41
+ current_configs_state_dict = {}
42
+ if speaker_name not in current_configs_state_dict:
43
+ current_configs_state_dict[speaker_name] = {}
44
+
45
+ current_configs_state_dict[speaker_name][config_key] = new_value
46
+ return current_configs_state_dict
47
+
48
+
49
+ def load_refresh_per_speaker_ui(script_text: str, current_configs_state_dict: dict, tts_model: str, speaker_configs_state_component: gr.State):
50
+ """
51
+ Generates or refreshes the dynamic UI components (accordions) for each speaker.
52
+ Returns a list of Gradio components to populate the dynamic UI area and the updated state.
53
+ """
54
+ print(f"Load/Refresh UI called. TTS Model: {tts_model}") # Debug
55
+ unique_speakers = get_speakers_from_script(script_text)
56
+ new_ui_components = []
57
+
58
+ if current_configs_state_dict is None:
59
+ current_configs_state_dict = {}
60
+
61
+ # Ensure a default voice for safety
62
+ safe_default_voice = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"
63
+
64
+ for speaker_name in unique_speakers:
65
+ if speaker_name not in current_configs_state_dict:
66
+ current_configs_state_dict[speaker_name] = {
67
+ "voice": safe_default_voice, "speed": 1.0,
68
+ "vibe": DEFAULT_VIBE, "custom_instructions": ""
69
+ }
70
+ # Ensure all keys exist with defaults
71
+ current_configs_state_dict[speaker_name].setdefault("voice", safe_default_voice)
72
+ current_configs_state_dict[speaker_name].setdefault("speed", 1.0)
73
+ current_configs_state_dict[speaker_name].setdefault("vibe", DEFAULT_VIBE)
74
+ current_configs_state_dict[speaker_name].setdefault("custom_instructions", "")
75
+
76
+ if not unique_speakers:
77
+ print("No unique speakers found, returning markdown.") # Debug
78
+ new_ui_components.append(gr.Markdown("No speakers detected in the script, or script is empty. Type a script and click 'Load/Refresh' again, or change the script content."))
79
+ return new_ui_components, current_configs_state_dict
80
+
81
+ print(f"Found speakers: {unique_speakers}. Building UI...") # Debug
82
+ for speaker_name in unique_speakers:
83
+ speaker_cfg = current_configs_state_dict[speaker_name]
84
+
85
+ speed_interactive = tts_model in ["tts-1", "tts-1-hd"]
86
+ instructions_relevant = tts_model == "gpt-4o-mini-tts"
87
+
88
+ # Use a unique elem_id for each accordion to help Gradio differentiate if needed
89
+ accordion_elem_id = f"accordion_speaker_{speaker_name.replace(' ', '_')}"
90
+
91
+ with gr.Accordion(label=f"Settings for: {speaker_name}", open=False, elem_id=accordion_elem_id) as speaker_accordion:
92
+ # Voice Dropdown
93
+ voice_dd = gr.Dropdown(
94
+ label="Voice", choices=APP_AVAILABLE_VOICES, value=speaker_cfg.get("voice", safe_default_voice), interactive=True
95
+ )
96
+ voice_dd.change(
97
+ fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="voice", tts_model=tts_model),
98
+ inputs=[voice_dd, speaker_configs_state_component],
99
+ outputs=[speaker_configs_state_component]
100
+ )
101
+
102
+ # Speed Slider
103
+ speed_slider_label = "Speech Speed" + (" (Active for tts-1/hd)" if speed_interactive else " (N/A for this model)")
104
+ speed_slider = gr.Slider(
105
+ label=speed_slider_label, minimum=0.25, maximum=4.0, value=float(speaker_cfg.get("speed", 1.0)),
106
+ step=0.05, interactive=speed_interactive
107
+ )
108
+ if speed_interactive:
109
+ speed_slider.release(
110
+ fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="speed", tts_model=tts_model),
111
+ inputs=[speed_slider, speaker_configs_state_component],
112
+ outputs=[speaker_configs_state_component]
113
+ )
114
+
115
+ # Vibe Dropdown
116
+ vibe_label = "Vibe/Emotion Preset" + (" (For gpt-4o-mini-tts)" if instructions_relevant else " (Less impact on other models)")
117
+ vibe_dd = gr.Dropdown(
118
+ label=vibe_label, choices=VIBE_CHOICES, value=speaker_cfg.get("vibe", DEFAULT_VIBE), interactive=True
119
+ )
120
+ vibe_dd.change(
121
+ fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="vibe", tts_model=tts_model),
122
+ inputs=[vibe_dd, speaker_configs_state_component],
123
+ outputs=[speaker_configs_state_component]
124
+ )
125
+
126
+ # Custom Instructions Textbox
127
+ custom_instr_label = "Custom Instructions"
128
+ custom_instr_placeholder = "Used if Vibe is 'Custom...'. Overrides Vibe preset."
129
+ custom_instr_tb = gr.Textbox(
130
+ label=custom_instr_label,
131
+ value=speaker_cfg.get("custom_instructions", ""),
132
+ placeholder=custom_instr_placeholder,
133
+ lines=2, interactive=True
134
+ )
135
+ custom_instr_tb.input(
136
+ fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="custom_instructions", tts_model=tts_model),
137
+ inputs=[custom_instr_tb, speaker_configs_state_component],
138
+ outputs=[speaker_configs_state_component]
139
+ )
140
+ new_ui_components.append(speaker_accordion)
141
+
142
+ print(f"Returning {len(new_ui_components)} UI components for dynamic area.") # Debug
143
+ return new_ui_components, current_configs_state_dict
144
+
145
+
146
+ async def handle_script_processing(
147
+ openai_api_key: str, async_openai_client, nsfw_api_url_template: str, # Passed from app.py
148
+ dialogue_script: str, tts_model: str, pause_ms: int,
149
+ speaker_config_method: str, global_voice_selection: str,
150
+ speaker_configs_state_dict: dict,
151
+ global_speed: float,
152
+ global_instructions: str,
153
+ progress=gr.Progress(track_tqdm=True)
154
+ ):
155
+ if not openai_api_key or not async_openai_client:
156
+ return None, None, "Error: OpenAI API Key or client is not configured."
157
+ if not dialogue_script.strip():
158
+ return None, None, "Error: Script is empty."
159
+
160
+ job_audio_path_prefix = os.path.join(tempfile.gettempdir(), f"dialogue_tts_job_{random.randint(10000, 99999)}")
161
+ if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
162
+ os.makedirs(job_audio_path_prefix, exist_ok=True)
163
+
164
+ try:
165
+ parsed_lines, _ = parse_dialogue_script(dialogue_script)
166
+ if not parsed_lines:
167
+ shutil.rmtree(job_audio_path_prefix)
168
+ return None, None, "Error: No valid lines found in script."
169
+ except ValueError as e:
170
+ shutil.rmtree(job_audio_path_prefix)
171
+ return None, None, f"Script parsing error: {str(e)}"
172
+
173
+ if speaker_configs_state_dict is None: speaker_configs_state_dict = {}
174
+
175
+ # Ensure a default voice for safety
176
+ safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else DEFAULT_FALLBACK_VOICE
177
+
178
+ speaker_voice_map = {}
179
+ if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
180
+ unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
181
+ temp_voices_pool = APP_AVAILABLE_VOICES.copy()
182
+ if not temp_voices_pool: temp_voices_pool = [DEFAULT_FALLBACK_VOICE] # Ensure pool isn't empty
183
+
184
+ if speaker_config_method == "Random per Speaker":
185
+ for spk_name in unique_script_speakers_for_map:
186
+ speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
187
+ elif speaker_config_method == "A/B Round Robin":
188
+ for i, spk_name in enumerate(unique_script_speakers_for_map):
189
+ speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
190
+
191
+ tasks = []
192
+ # Map of original line ID -> audio file path, so ordering is preserved
193
+ line_audio_files_map = {}
194
+
195
+ for i, line_data in enumerate(parsed_lines):
196
+ speaker_name = line_data["speaker"]
197
+ line_voice = safe_default_global_voice
198
+ line_speed = global_speed
199
+ line_instructions = global_instructions if global_instructions and global_instructions.strip() else None
200
+
201
+ if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
202
+ spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
203
+ line_voice = spk_cfg.get("voice", safe_default_global_voice)
204
+ if tts_model in ["tts-1", "tts-1-hd"]:
205
+ line_speed = float(spk_cfg.get("speed", global_speed))
206
+ if tts_model == "gpt-4o-mini-tts":
207
+ vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
208
+ custom_instr = spk_cfg.get("custom_instructions", "").strip()
209
+ if vibe == "Custom..." and custom_instr:
210
+ line_instructions = custom_instr
211
+ elif vibe != "None" and vibe != "Custom...":
212
+ line_instructions = PREDEFINED_VIBES.get(vibe, "")
213
+ if not line_instructions and global_instructions and global_instructions.strip():
214
+ line_instructions = global_instructions
215
+ elif not line_instructions:
216
+ line_instructions = None
217
+ elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
218
+ line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
219
+
220
+ if tts_model not in ["tts-1", "tts-1-hd"]:
221
+ line_speed = 1.0
222
+
223
+ out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}_{speaker_name.replace(' ','_')}.mp3")
224
+ progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
225
+
226
+ tasks.append(synthesize_speech_line(
227
+ client=async_openai_client, text=line_data["text"], voice=line_voice,
228
+ output_path=out_fn, model=tts_model, speed=line_speed,
229
+ instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
230
+ line_index=line_data['id']
231
+ ))
232
+
233
+ results = await asyncio.gather(*tasks, return_exceptions=True)
234
+
235
+ for idx, res_path_or_exc in enumerate(results):
236
+ original_line_id = parsed_lines[idx]['id'] # Get original ID from the parsed line
237
+ if isinstance(res_path_or_exc, Exception):
238
+ print(f"Error synthesizing line ID {original_line_id} ({parsed_lines[idx]['speaker']}): {res_path_or_exc}")
239
+ line_audio_files_map[original_line_id] = None
240
+ elif res_path_or_exc is None:
241
+ print(f"Skipped or failed synthesizing line ID {original_line_id} ({parsed_lines[idx]['speaker']})")
242
+ line_audio_files_map[original_line_id] = None
243
+ else:
244
+ line_audio_files_map[original_line_id] = res_path_or_exc
245
+
246
+ # Reconstruct ordered list of files for merging, using original line IDs
247
+ ordered_files_for_merge_and_zip = []
248
+ for p_line in parsed_lines:
249
+ file_path = line_audio_files_map.get(p_line['id'])
250
+ if file_path and os.path.exists(file_path) and os.path.getsize(file_path) > 0:
251
+ ordered_files_for_merge_and_zip.append(file_path)
252
+ else:
253
+ ordered_files_for_merge_and_zip.append(None) # Keep placeholder for failed lines for merge logic
254
+
255
+ valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
256
+
257
+ if not valid_files_for_zip:
258
+ shutil.rmtree(job_audio_path_prefix)
259
+ return None, None, "Error: No audio was successfully synthesized."
260
+
261
+ zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
262
+ with zipfile.ZipFile(zip_fn, 'w') as zf:
263
+ for f_path in valid_files_for_zip:
264
+ zf.write(f_path, os.path.basename(f_path))
265
+
266
+ merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
267
+ # For merge_mp3_files, pass only the list of existing files in order
268
+ files_to_actually_merge = [f for f in ordered_files_for_merge_and_zip if f]
269
+ merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
270
+
271
+ status = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
272
+ if len(valid_files_for_zip) < len(parsed_lines): status += "Some lines may have failed. "
273
+ if not merged_path and len(valid_files_for_zip) > 0: status += "Merging audio failed. "
274
+ elif not merged_path: status = "No audio to merge."
275
+ else: status += "Merged audio generated."
276
+
277
+ return (zip_fn if os.path.exists(zip_fn) else None,
278
+ merged_path if merged_path and os.path.exists(merged_path) else None,
279
+ status)
280
+
281
+
282
+ def handle_calculate_cost(dialogue_script: str, tts_model: str):
283
+ if not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
284
+ try:
285
+ parsed, chars = parse_dialogue_script(dialogue_script)
286
+ if not parsed: return "Cost: $0.00 (No valid lines in script)"
287
+ cost = calculate_cost(chars, len(parsed), tts_model)
288
+ return f"Estimated Cost for {len(parsed)} lines ({chars} chars): ${cost:.6f}"
289
+ except ValueError as e:
290
+ return f"Cost calculation error: {str(e)}"
291
+ except Exception as e:
292
+ return f"An unexpected error occurred during cost calculation: {str(e)}"
293
+
294
+ def update_model_controls_visibility(selected_model: str, script_text_for_refresh: str, current_speaker_configs_for_refresh: dict, speaker_configs_state_comp: gr.State):
295
+ """Updates visibility of global controls and refreshes per-speaker UI when TTS model changes."""
296
+ print(f"Model changed to: {selected_model}. Refreshing dynamic UI and controls.") # Debug
297
+ try:
298
+ # load_refresh_per_speaker_ui might return components or markdown
299
+ # It now takes speaker_configs_state_comp as an argument to wire up .change() correctly
300
+ dynamic_ui_output, updated_state = load_refresh_per_speaker_ui(
301
+ script_text_for_refresh, current_speaker_configs_for_refresh, selected_model, speaker_configs_state_comp
302
+ )
303
+ except Exception as e:
304
+ print(f"Error in load_refresh_per_speaker_ui called from model_controls_visibility: {e}")
305
+ # Fallback: clear dynamic UI and keep state as is, or return an error message component
306
+ dynamic_ui_output = [gr.Markdown(f"Error refreshing per-speaker UI: {e}")]
307
+ updated_state = current_speaker_configs_for_refresh # or {} to reset
308
+
309
+ is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
310
+ is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
311
+
312
+ # The values below are returned positionally, matching the order of the
313
+ # components in the `outputs` list of the .change() event.
314
+ updates = {
315
+ "global_speed_input": gr.update(visible=is_tts1_family, interactive=is_tts1_family),
316
+ "global_instructions_input": gr.update(visible=is_gpt_mini_tts, interactive=is_gpt_mini_tts),
317
+ "dynamic_speaker_ui_area": dynamic_ui_output, # This directly provides the new children for the Column
318
+ "speaker_configs_state": updated_state
319
+ }
320
+ return updates["global_speed_input"], updates["global_instructions_input"], updates["dynamic_speaker_ui_area"], updates["speaker_configs_state"]
321
+
322
+
323
+ def update_speaker_config_method_visibility(method: str):
324
+ """Updates visibility of UI groups based on selected speaker configuration method."""
325
+ is_single = (method == "Single Voice (Global)")
326
+ is_detailed_per_speaker = (method == "Detailed Configuration (Per Speaker UI)")
327
+
328
+ # The component objects live in app.py and are not importable here, so return
+ # the updates positionally, in the same order as the `outputs` list of the
+ # .change() event: (single_voice_group, detailed_per_speaker_ui_group).
+ return (
+ gr.update(visible=is_single),
+ gr.update(visible=is_detailed_per_speaker),
+ )
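As background for the positional returns used throughout these handlers: this module cannot import the component objects created in app.py, and dict-style returns in Gradio require the component instances as keys, so the handlers return gr.update(...) values in the same order as the outputs list of the corresponding event. A self-contained illustration with made-up component names (not from this repo):

import gradio as gr

with gr.Blocks() as demo:
    method = gr.Dropdown(
        ["Single Voice (Global)", "Detailed Configuration (Per Speaker UI)"],
        value="Single Voice (Global)", label="Configuration Method"
    )
    with gr.Group(visible=True) as single_group:
        gr.Markdown("Global voice settings go here.")
    with gr.Column(visible=False) as detailed_group:
        gr.Markdown("Per-speaker settings go here.")

    def toggle(method_value):
        # Returned positionally: first value updates single_group, second updates detailed_group.
        return (
            gr.update(visible=(method_value == "Single Voice (Global)")),
            gr.update(visible=(method_value == "Detailed Configuration (Per Speaker UI)")),
        )

    method.change(fn=toggle, inputs=[method], outputs=[single_group, detailed_group])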
ui_layout.py ADDED
@@ -0,0 +1,144 @@
1
+ import gradio as gr
2
+ from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES # Import directly for APP_AVAILABLE_VOICES
3
+
4
+ # --- UI Constants and Configuration ---
5
+ TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
6
+ MODEL_DEFAULT_ENV = "tts-1-hd" # Default if env var not set or invalid
7
+
8
+ SPEAKER_CONFIG_METHODS = [
9
+ "Single Voice (Global)",
10
+ "Random per Speaker",
11
+ "A/B Round Robin",
12
+ "Detailed Configuration (Per Speaker UI)"
13
+ ]
14
+ DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
15
+ APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
16
+ # Ensure APP_AVAILABLE_VOICES is never empty for safety, though ALL_TTS_VOICES has defaults
17
+ if not APP_AVAILABLE_VOICES:
18
+ APP_AVAILABLE_VOICES = ["alloy"] # Absolute fallback
19
+
20
+ PREDEFINED_VIBES = {
21
+ "None": "",
22
+ "Calm": "Speak in a calm, composed, and relaxed manner.",
23
+ "Excited": "Speak with an energetic, enthusiastic, and lively tone.",
24
+ "Happy": "Speak with a cheerful, bright, and joyful voice.",
25
+ "Sad": "Speak with a sorrowful, melancholic, and dejected tone.",
26
+ "Whisper": "Speak softly, as if whispering.",
27
+ "Angry": "Speak with a strong, firm, and possibly agitated voice.",
28
+ "Fearful": "Speak with a trembling, hesitant, and scared voice.",
29
+ "Formal": "Speak in a clear, precise, and professional tone, suitable for a formal address.",
30
+ "Authoritative": "Speak with a commanding, confident, and firm voice.",
31
+ "Friendly": "Speak in a warm, approachable, and amiable manner.",
32
+ "Custom...": "CUSTOM"
33
+ }
34
+ VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
35
+ DEFAULT_VIBE = "None"
36
+ DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"
37
+
38
+
39
+ # --- UI Element Creation Functions ---
40
+
41
+ def create_main_input_components(model_default_value):
42
+ """Creates the main input components for script, model, pause, and global settings."""
43
+ with gr.Row():
44
+ with gr.Column(scale=2):
45
+ script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\n[Speaker2] How are you today?", lines=10)
46
+ with gr.Column(scale=1):
47
+ tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value)
48
+ pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50)
49
+
50
+ is_tts1_family_default = model_default_value in ["tts-1", "tts-1-hd"]
51
+ is_gpt_mini_tts_default = model_default_value == "gpt-4o-mini-tts"
52
+
53
+ global_speed_input = gr.Slider(
54
+ minimum=0.25, maximum=4.0, value=1.0, step=0.05,
55
+ label="Global Speed (for tts-1/hd)",
56
+ visible=is_tts1_family_default,
57
+ interactive=True
58
+ )
59
+ global_instructions_input = gr.Textbox(
60
+ label="Global Instructions (for gpt-4o-mini-tts)",
61
+ placeholder="e.g., Speak with a calm tone.",
62
+ visible=is_gpt_mini_tts_default,
63
+ interactive=True, lines=2
64
+ )
65
+ return script_input, tts_model_dropdown, pause_input, global_speed_input, global_instructions_input
66
+
67
+ def create_speaker_config_components():
68
+ """Creates components for speaker configuration method and detailed settings UI."""
69
+ gr.Markdown("### Speaker Voice & Style Configuration")
70
+ speaker_config_method_dropdown = gr.Dropdown(
71
+ SPEAKER_CONFIG_METHODS, label="Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
72
+ )
73
+
74
+ # Visibility is controlled by event handlers based on speaker_config_method_dropdown
75
+ with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
76
+ global_voice_dropdown = gr.Dropdown(
77
+ APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True
78
+ )
79
+
80
+ with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)")) as detailed_per_speaker_ui_group:
81
+ load_per_speaker_ui_button = gr.Button("Load/Refresh Per-Speaker Settings UI (from Script Above)")
82
+ gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are applied per-speaker. If script changes, click again to refresh.</small>")
83
+ dynamic_speaker_ui_area = gr.Column(elem_id="dynamic_ui_area_for_speakers")
84
+
85
+ return speaker_config_method_dropdown, single_voice_group, global_voice_dropdown, detailed_per_speaker_ui_group, load_per_speaker_ui_button, dynamic_speaker_ui_area
86
+
87
+ def create_action_and_output_components():
88
+ """Creates buttons for actions (cost, generate) and output display areas."""
89
+ with gr.Row():
90
+ calculate_cost_button = gr.Button("Calculate Estimated Cost")
91
+ generate_button = gr.Button("Generate Audio", variant="primary")
92
+
93
+ cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
94
+ with gr.Row():
95
+ individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)")
96
+ merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath")
97
+ status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5)
98
+ return calculate_cost_button, generate_button, cost_output, individual_lines_zip_output, merged_dialogue_mp3_output, status_output
99
+
100
+ def create_examples_ui(inputs_for_examples, process_fn):
101
+ """Creates the examples section."""
102
+ gr.Markdown("## Example Scripts")
103
+ example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\n[Alice] Let's see how this sounds."
104
+ example_script_2 = "[Narrator] This is a short story.\n[CharacterA] Once upon a time...\n[Narrator] ...there was a Gradio app.\n[CharacterB] And it could talk!"
105
+
106
+ # Ensure a valid default voice for examples if APP_AVAILABLE_VOICES is somehow empty
107
+ # This is already handled by DEFAULT_GLOBAL_VOICE at the top of this file.
108
+
109
+ examples_data = [
110
+ [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
111
+ [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
112
+ ["[Solo] Just one line, using global voice and speed.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
113
+ ]
114
+
115
+ # Check if the number of items in each example matches the number of input components
116
+ num_inputs = len(inputs_for_examples)
117
+ valid_examples_data = []
118
+ for ex_data in examples_data:
119
+ if len(ex_data) == num_inputs:
120
+ valid_examples_data.append(ex_data)
121
+ else:
122
+ print(f"Warning: Example data mismatch. Expected {num_inputs} items, got {len(ex_data)}. Skipping example: {ex_data[0][:30]}...")
123
+
124
+ if not valid_examples_data:
125
+ gr.Markdown("<p style='color: orange;'>No valid examples could be loaded due to configuration mismatch.</p>")
126
+ return None # Or an empty Examples component if that's better
127
+
128
+ return gr.Examples(
129
+ examples=valid_examples_data,
130
+ inputs=inputs_for_examples, # This list must match the structure of example_data items
131
+ # Outputs for examples are not strictly necessary to pre-compute if cache_examples=False
132
+ # but defining them can help Gradio understand the flow.
133
+ # We can make the example click run the full processing.
134
+ # outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output], # these components are not passed here
135
+ # For examples to run the main function, the output components need to be part of the gr.Examples call.
136
+ # However, the prompt is to make the list appear. Let's simplify for now.
137
+ # If outputs are defined, they must be component instances.
138
+ # For now, let's remove fn and outputs from gr.Examples just to ensure the list shows.
139
+ # Later, we can wire them back if needed, ensuring the output components are correctly passed.
140
+ # fn=process_fn, # Removed for debugging example list display
141
+ # cache_examples=False
142
+ label="Example Scripts (Click to Load Inputs)",
143
+ samples_per_page=5 # Helps ensure it tries to render
144
+ )