abocha committed on
Commit
49a48a4
·
1 Parent(s): 92c9b3d
Files changed (3) hide show
  1. app.py +86 -65
  2. event_handlers.py +95 -133
  3. ui_layout.py +21 -20
app.py CHANGED
@@ -4,12 +4,13 @@ import os
4
  import asyncio
5
  from openai import AsyncOpenAI
6
  from functools import partial
7
- import datetime # For timestamp in @gr.render debug
8
 
9
  from ui_layout import (
10
  create_main_input_components, create_speaker_config_components,
11
  create_action_and_output_components, create_examples_ui,
12
- TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES, DEFAULT_GLOBAL_VOICE # Added for @gr.render context
 
13
  )
14
 
15
  from event_handlers import (
@@ -17,9 +18,8 @@ from event_handlers import (
17
  handle_speaker_config_method_visibility_change,
18
  handle_tts_model_change,
19
  handle_load_refresh_per_speaker_ui_trigger,
20
- # Placeholder for handling changes from controls inside @gr.render's dynamic UI
21
- # handle_dynamic_accordion_input_change,
22
- get_speakers_from_script # Utility function
23
  )
24
 
25
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -54,78 +54,104 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
54
  if not OPENAI_API_KEY or not async_openai_client:
55
  gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
56
 
57
- # This state holds the detailed configurations for each speaker.
58
- # It's an input to @gr.render and an output of event handlers that modify it.
59
  speaker_configs_state = gr.State({})
60
 
61
  (script_input, tts_model_dropdown, pause_input,
62
  global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
63
 
64
  (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
65
- detailed_per_speaker_ui_group_container, # This is the gr.Column container
66
  load_per_speaker_ui_button) = create_speaker_config_components()
67
 
68
  (calculate_cost_button, generate_button, cost_output,
69
  individual_lines_zip_output, merged_dialogue_mp3_output,
70
  status_output) = create_action_and_output_components()
71
 
72
- # --- Dynamic UI Area using @gr.render ---
73
- # This block is defined within the `detailed_per_speaker_ui_group_container` (a gr.Column)
74
- with detailed_per_speaker_ui_group_container:
75
- # The @gr.render decorator ensures this function re-runs and its UI
76
- # is re-drawn when its `inputs` change or its `triggers` are activated.
77
  @gr.render(
78
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
79
  triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
80
  )
81
  def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
82
- """
83
- Defines the dynamic UI for per-speaker configuration.
84
- This function's content is re-rendered by Gradio when inputs/triggers change.
85
- """
86
- print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State: {str(current_speaker_configs)[:30]}...")
87
 
88
- # --- For Phase 1 Debugging: Display simple Markdown ---
89
- timestamp = datetime.datetime.now().isoformat()
90
- gr.Markdown(
91
- f"#### Dynamic UI Area (@render)\n"
92
- f"- **TTS Model:** {current_tts_model}\n"
93
- f"- **Timestamp:** {timestamp}\n"
94
- # f"- Script: `{current_script_text[:60].replace('[', '(').replace(']', ')') + ('...' if len(current_script_text) > 60 else '')}`\n" # Be careful with Markdown in script
95
- # f"- State Keys: `{list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}`"
96
- )
97
- # --- End of Phase 1 Debugging ---
98
-
99
- # --- Future Full Implementation (Illustrative Structure) ---
100
- # This is where you would build the actual accordions for each speaker.
101
- # unique_speakers = get_speakers_from_script(current_script_text)
102
- # if not unique_speakers:
103
- # gr.Markdown("<p style='color: #888;'>Enter a script and click 'Load/Refresh' to see per-speaker settings.</p>")
104
- # else:
105
- # for speaker_name in unique_speakers:
106
- # # Ensure speaker_configs_state is a dict
107
- # speaker_specific_config = current_speaker_configs.get(speaker_name, {}) if isinstance(current_speaker_configs, dict) else {}
108
- # default_voice_for_speaker = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
109
- # # ... and so on for other settings like speed, vibe ...
110
- #
111
- # with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False): # Start closed
112
- # gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")
113
- # # Example: Voice Dropdown
114
- # # voice_dd = gr.Dropdown(APP_AVAILABLE_VOICES, value=default_voice_for_speaker, label="Voice")
115
- # # Attach listener to update speaker_configs_state
116
- # # voice_dd.change(
117
- # # fn=lambda val, spk=speaker_name, cfg=current_speaker_configs: \
118
- # # handle_dynamic_accordion_input_change(val, spk, "voice", cfg, speaker_configs_state),
119
- # # inputs=[voice_dd],
120
- # # outputs=[speaker_configs_state] # CRITICAL: This updates the state, which can re-trigger @gr.render
121
- # # ) # if speaker_configs_state is also an input to @gr.render.
122
- # # This creates a reactive loop.
123
- #
124
- # # Add other controls (speed slider, vibe dropdown, custom instructions textbox)
125
- # # based on current_tts_model, similar to the voice_dd example.
126
- # gr.Textbox(label="Placeholder for voice/speed/vibe controls")
127
- # --- End of Future Full Implementation ---
128
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  # --- Event Listeners for other UI elements ---
131
  tts_model_dropdown.change(
@@ -144,9 +170,6 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
144
  fn=handle_load_refresh_per_speaker_ui_trigger,
145
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
146
  outputs=[speaker_configs_state]
147
- # The button click itself is also a trigger for @gr.render,
148
- # and this handler updates speaker_configs_state, which is an input to @gr.render,
149
- # thus ensuring @gr.render re-runs with potentially new state.
150
  )
151
 
152
  calculate_cost_button.click(
@@ -183,8 +206,6 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
183
  )
184
 
185
  if __name__ == "__main__":
186
- if os.name == 'nt': # Required for Windows if using asyncio default event loop
187
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
188
- # Consider adding `prevent_thread_lock=True` if running in a script and want it to exit
189
- # or if issues with thread locking occur.
190
  demo.queue().launch(debug=True, share=False)
 
4
  import asyncio
5
  from openai import AsyncOpenAI
6
  from functools import partial
7
+ import datetime
8
 
9
  from ui_layout import (
10
  create_main_input_components, create_speaker_config_components,
11
  create_action_and_output_components, create_examples_ui,
12
+ TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES,
13
+ DEFAULT_GLOBAL_VOICE, VIBE_CHOICES, DEFAULT_VIBE, PREDEFINED_VIBES
14
  )
15
 
16
  from event_handlers import (
 
18
  handle_speaker_config_method_visibility_change,
19
  handle_tts_model_change,
20
  handle_load_refresh_per_speaker_ui_trigger,
21
+ handle_dynamic_accordion_input_change, # For controls inside @gr.render
22
+ get_speakers_from_script
 
23
  )
24
 
25
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
54
  if not OPENAI_API_KEY or not async_openai_client:
55
  gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
56
 
 
 
57
  speaker_configs_state = gr.State({})
58
 
59
  (script_input, tts_model_dropdown, pause_input,
60
  global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
61
 
62
  (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
63
+ detailed_per_speaker_ui_group_container,
64
  load_per_speaker_ui_button) = create_speaker_config_components()
65
 
66
  (calculate_cost_button, generate_button, cost_output,
67
  individual_lines_zip_output, merged_dialogue_mp3_output,
68
  status_output) = create_action_and_output_components()
69
 
70
+ with detailed_per_speaker_ui_group_container: # Define @gr.render inside this column
 
 
 
 
71
  @gr.render(
72
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
73
  triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
74
  )
75
  def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
76
+ """Defines and re-renders the dynamic UI for per-speaker configuration."""
77
+ print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs,dict) else 'Not a dict'}")
 
 
 
78
 
79
+ unique_speakers = get_speakers_from_script(current_script_text)
80
+
81
+ if not unique_speakers:
82
+ gr.Markdown("<p style='color: #888; margin-top:10px;'>Enter a script with speaker tags (e.g., `[SpeakerName] Text`) and click 'Load/Refresh' to see per-speaker settings.</p>")
83
+ return
84
+
85
+ for speaker_idx, speaker_name in enumerate(unique_speakers):
86
+ # Ensure current_speaker_configs is a valid dictionary
87
+ if not isinstance(current_speaker_configs, dict):
88
+ current_speaker_configs = {}
89
+
90
+ speaker_specific_config = current_speaker_configs.get(speaker_name, {})
91
+ accordion_elem_id = f"accordion_spk_{speaker_idx}_{speaker_name.replace(' ','_').lower()}"
92
+
93
+ with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False, elem_id=accordion_elem_id):
94
+ gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")
95
+
96
+ default_voice = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
97
+ voice_dd_elem_id = f"voice_dd_spk_{speaker_idx}"
98
+ voice_dropdown = gr.Dropdown(
99
+ APP_AVAILABLE_VOICES,
100
+ value=default_voice,
101
+ label="Voice",
102
+ elem_id=voice_dd_elem_id
103
+ )
104
+ voice_dropdown.change(
105
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="voice"),
106
+ inputs=[voice_dropdown, speaker_configs_state], # Pass current state to handler
107
+ outputs=[speaker_configs_state]
108
+ )
109
+
110
+ if current_tts_model in ["tts-1", "tts-1-hd"]:
111
+ default_speed = float(speaker_specific_config.get("speed", 1.0))
112
+ speed_slider_elem_id = f"speed_slider_spk_{speaker_idx}"
113
+ speed_slider = gr.Slider(
114
+ minimum=0.25, maximum=4.0, value=default_speed, step=0.05,
115
+ label="Speed", elem_id=speed_slider_elem_id
116
+ )
117
+ speed_slider.change(
118
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="speed"),
119
+ inputs=[speed_slider, speaker_configs_state],
120
+ outputs=[speaker_configs_state]
121
+ )
122
+ elif current_tts_model == "gpt-4o-mini-tts":
123
+ default_vibe = speaker_specific_config.get("vibe", DEFAULT_VIBE)
124
+ vibe_dd_elem_id = f"vibe_dd_spk_{speaker_idx}"
125
+ vibe_dropdown = gr.Dropdown(
126
+ VIBE_CHOICES, value=default_vibe, label="Vibe/Emotion", elem_id=vibe_dd_elem_id
127
+ )
128
+
129
+ default_custom_instructions = speaker_specific_config.get("custom_instructions", "")
130
+ custom_instr_tb_elem_id = f"custom_instr_tb_spk_{speaker_idx}"
131
+ custom_instructions_textbox = gr.Textbox(
132
+ label="Custom Instructions",
133
+ value=default_custom_instructions,
134
+ placeholder="e.g., Speak with a slightly hesitant tone.",
135
+ lines=2,
136
+ visible=(default_vibe == "Custom..."), # Initial visibility
137
+ elem_id=custom_instr_tb_elem_id
138
+ )
139
+
140
+ vibe_dropdown.change(
141
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="vibe"),
142
+ inputs=[vibe_dropdown, speaker_configs_state],
143
+ outputs=[speaker_configs_state]
144
+ ).then( # Chain another update for visibility
145
+ fn=lambda vibe_val: gr.update(visible=(vibe_val == "Custom...")),
146
+ inputs=[vibe_dropdown],
147
+ outputs=[custom_instructions_textbox]
148
+ )
149
+
150
+ custom_instructions_textbox.change(
151
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="custom_instructions"),
152
+ inputs=[custom_instructions_textbox, speaker_configs_state],
153
+ outputs=[speaker_configs_state]
154
+ )
155
 
156
  # --- Event Listeners for other UI elements ---
157
  tts_model_dropdown.change(
 
170
  fn=handle_load_refresh_per_speaker_ui_trigger,
171
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
172
  outputs=[speaker_configs_state]
 
 
 
173
  )
174
 
175
  calculate_cost_button.click(
 
206
  )
207
 
208
  if __name__ == "__main__":
209
+ if os.name == 'nt':
210
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
 
 
211
  demo.queue().launch(debug=True, share=False)
event_handlers.py CHANGED
@@ -9,7 +9,6 @@ import random
9
  from functools import partial
10
  import datetime
11
 
12
- # Assuming utils.script_parser and utils.openai_tts, utils.merge_audio are correctly defined
13
  from utils.script_parser import parse_dialogue_script, calculate_cost
14
  from utils.openai_tts import synthesize_speech_line
15
  from utils.merge_audio import merge_mp3_files
@@ -21,45 +20,48 @@ def get_speakers_from_script(script_text: str) -> list:
21
  if not script_text or not script_text.strip():
22
  return []
23
  try:
24
- parsed_lines, _ = parse_dialogue_script(script_text)
25
  if not parsed_lines:
26
  return []
27
  seen_speakers = set()
28
  ordered_unique_speakers = []
29
  for line_data in parsed_lines:
30
  speaker = line_data.get("speaker")
31
- if speaker and speaker not in seen_speakers:
32
  ordered_unique_speakers.append(speaker)
33
  seen_speakers.add(speaker)
34
  return ordered_unique_speakers
35
- except ValueError: # Handles potential errors from parse_dialogue_script
 
36
  return []
 
 
 
 
37
 
38
  def handle_dynamic_accordion_input_change(
39
- new_value,
40
- speaker_name: str,
41
- config_key: str,
42
- current_speaker_configs: dict
43
- # target_state_component is implicitly speaker_configs_state when this function is an output to it
44
  ):
45
  """
46
  Updates the speaker_configs_state when a dynamic UI element within an Accordion changes.
47
- This function is INTENDED to be used as the 'fn' for .change() listeners of
48
- components created inside the @gr.render block.
49
  """
50
  if not isinstance(current_speaker_configs, dict):
51
- print(f"Warning: current_speaker_configs was not a dict, re-initializing. Type: {type(current_speaker_configs)}")
52
- current_speaker_configs = {} # Defensive: re-initialize if it's not a dict
53
 
54
- updated_configs = current_speaker_configs.copy() # Work on a copy
55
 
56
  if speaker_name not in updated_configs:
57
  updated_configs[speaker_name] = {}
58
 
59
  updated_configs[speaker_name][config_key] = new_value
60
- updated_configs["_last_config_change"] = f"{speaker_name}_{config_key}_{datetime.datetime.now().isoformat()}" # To ensure state change
61
 
62
- print(f"DEBUG (accordion_change): Speaker '{speaker_name}' config '{config_key}' to '{new_value}'. New state hint: {updated_configs.get('_last_config_change')}")
 
 
63
  return updated_configs
64
 
65
 
@@ -67,7 +69,7 @@ async def handle_script_processing(
67
  openai_api_key: str, async_openai_client, nsfw_api_url_template: str,
68
  dialogue_script: str, tts_model: str, pause_ms: int,
69
  speaker_config_method: str, global_voice_selection: str,
70
- speaker_configs_state_dict: dict, # This is the value from gr.State
71
  global_speed: float,
72
  global_instructions: str,
73
  progress=gr.Progress(track_tqdm=True)
@@ -84,140 +86,120 @@ async def handle_script_processing(
84
  try:
85
  parsed_lines, _ = parse_dialogue_script(dialogue_script)
86
  if not parsed_lines:
87
- shutil.rmtree(job_audio_path_prefix)
88
- return None, None, "Error: No valid lines found in script."
89
  except ValueError as e:
90
- shutil.rmtree(job_audio_path_prefix)
91
- return None, None, f"Script parsing error: {str(e)}"
92
 
93
  if not isinstance(speaker_configs_state_dict, dict):
94
  print(f"Warning: speaker_configs_state_dict was not a dict in handle_script_processing. Re-initializing. Type: {type(speaker_configs_state_dict)}")
95
  speaker_configs_state_dict = {}
96
 
97
- safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
98
 
99
- speaker_voice_map = {} # For Random or A/B methods
100
  if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
101
  unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
102
  temp_voices_pool = APP_AVAILABLE_VOICES.copy()
103
- if not temp_voices_pool: temp_voices_pool = [DEFAULT_GLOBAL_VOICE]
104
 
105
  if speaker_config_method == "Random per Speaker":
106
  for spk_name in unique_script_speakers_for_map:
107
  speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
108
- elif speaker_config_method == "A/B Round Robin" and temp_voices_pool: # Check temp_voices_pool not empty
109
  for i, spk_name in enumerate(unique_script_speakers_for_map):
110
  speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
111
 
112
- tasks = []
113
- # line_audio_files_map = {} # To store results indexed by original line ID
114
-
115
- # Create a list to store (original_line_id, future_task)
116
  task_info_list = []
117
-
118
  for i, line_data in enumerate(parsed_lines):
119
  speaker_name = line_data["speaker"]
 
 
 
120
  line_voice = safe_default_global_voice
121
  line_speed = global_speed
122
  line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
123
 
124
  if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
125
  spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
126
- line_voice = spk_cfg.get("voice", safe_default_global_voice) # Fallback to safe global voice
 
127
  if tts_model in ["tts-1", "tts-1-hd"]:
128
  line_speed = float(spk_cfg.get("speed", global_speed))
129
- else: # Other models don't use speed
130
- line_speed = 1.0
131
 
132
  if tts_model == "gpt-4o-mini-tts":
133
  vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
134
  custom_instr_raw = spk_cfg.get("custom_instructions", "")
135
  custom_instr = custom_instr_raw.strip() if custom_instr_raw else ""
136
-
 
137
  if vibe == "Custom..." and custom_instr:
138
- line_instructions = custom_instr
139
  elif vibe != "None" and vibe != "Custom..." and PREDEFINED_VIBES.get(vibe):
140
- line_instructions = PREDEFINED_VIBES[vibe]
141
- elif not line_instructions and global_instructions and global_instructions.strip(): # Fallback to global if still no specific instructions
142
- line_instructions = global_instructions.strip()
143
- elif not line_instructions: # Explicitly None if all else fails
144
- line_instructions = None
145
- else: # tts-1 models don't use vibe/custom instructions from per-speaker UI
146
- line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
147
-
148
 
149
  elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
150
  line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
151
- if tts_model not in ["tts-1", "tts-1-hd"]: # Ensure speed is reset if not tts-1 family
152
- line_speed = 1.0
153
- # For Random/AB, use global instructions if any
154
- line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
155
 
156
- # Final check for speed parameter validity based on model
157
- if tts_model not in ["tts-1", "tts-1-hd"]:
158
- line_speed = 1.0
159
 
160
-
161
- out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}_{speaker_name.replace(' ','_')}.mp3")
162
  progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
163
 
164
  current_task = synthesize_speech_line(
165
- client=async_openai_client, text=line_data["text"], voice=line_voice,
166
  output_path=out_fn, model=tts_model, speed=line_speed,
167
  instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
168
- line_index=line_data['id']
169
  )
170
- task_info_list.append({"id": line_data['id'], "speaker": speaker_name, "task": current_task, "out_fn": out_fn})
171
-
172
- # Gather results
173
- # await asyncio.gather(*(info['task'] for info in task_info_list)) # This runs them, but doesn't handle exceptions well for individual ones
174
 
175
- processed_results = []
176
  for info in task_info_list:
177
  try:
178
  result_path = await info['task']
179
- processed_results.append({"id": info['id'], "path": result_path, "speaker": info['speaker']})
180
  except Exception as e:
181
  print(f"Error synthesizing line ID {info['id']} ({info['speaker']}): {e}")
182
- processed_results.append({"id": info['id'], "path": None, "error": e, "speaker": info['speaker']})
183
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- # Ensure files are processed in the original script order for merging
186
- ordered_files_for_merge_and_zip = [None] * len(parsed_lines)
187
- for res in processed_results:
188
- # Find original index of this line_id to maintain order
189
- original_line_index = -1
190
- for idx, p_line in enumerate(parsed_lines):
191
- if p_line['id'] == res['id']:
192
- original_line_index = idx
193
- break
194
-
195
- if original_line_index != -1:
196
- if res.get("path") and os.path.exists(res["path"]) and os.path.getsize(res["path"]) > 0:
197
- ordered_files_for_merge_and_zip[original_line_index] = res["path"]
198
- else:
199
- print(f"Skipped or failed synthesizing line ID {res['id']} ({res['speaker']})")
200
- # ordered_files_for_merge_and_zip remains None at this index
201
 
202
  valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
203
 
204
  if not valid_files_for_zip:
205
- shutil.rmtree(job_audio_path_prefix) # Cleanup
206
- return None, None, "Error: No audio was successfully synthesized for any line."
207
 
208
  zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
209
  with zipfile.ZipFile(zip_fn, 'w') as zf:
210
  for f_path in valid_files_for_zip:
211
  zf.write(f_path, os.path.basename(f_path))
212
 
213
- # Files to actually merge (only valid ones, already in order)
214
- files_to_actually_merge = valid_files_for_zip
215
  merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
216
  merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
217
 
218
  status_msg = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
219
  if len(valid_files_for_zip) < len(parsed_lines): status_msg += "Some lines may have failed. "
220
-
221
  if not merged_path and len(valid_files_for_zip) > 0 : status_msg += "Merging audio failed. "
222
  elif not merged_path: status_msg = "No audio to merge (all lines failed or were skipped)."
223
  else: status_msg += "Merged audio generated."
@@ -226,74 +208,55 @@ async def handle_script_processing(
226
  merged_path if merged_path and os.path.exists(merged_path) else None,
227
  status_msg)
228
 
229
-
230
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
231
- """Calculates the estimated cost for TTS processing."""
232
  if not dialogue_script or not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
233
  try:
234
  parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
235
  if not parsed_lines: return "Cost: $0.00 (No valid lines in script)"
236
  cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
237
  return f"Estimated Cost for {len(parsed_lines)} lines ({total_chars} chars): ${cost:.6f}"
238
- except ValueError as e: # Specific error from parsing or cost calculation
239
- return f"Cost calculation error: {str(e)}"
240
- except Exception as e: # Generic fallback for unexpected errors
241
- return f"An unexpected error occurred during cost calculation: {str(e)}"
242
-
243
 
244
  def handle_load_refresh_per_speaker_ui_trigger(script_text: str, current_speaker_configs: dict, tts_model: str):
245
- """
246
- Triggered by the 'Load/Refresh' button. Updates speaker_configs_state.
247
- The @gr.render function reacts to this state change (and the button click trigger).
248
- """
249
  print(f"DEBUG (Load/Refresh Trigger): Script: '{script_text[:30]}...', Model: {tts_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
 
 
250
 
251
- # Ensure current_speaker_configs is a dictionary
252
- if not isinstance(current_speaker_configs, dict):
253
- current_speaker_configs = {}
254
-
255
- updated_configs = current_speaker_configs.copy() # Work on a copy to ensure change detection
256
-
257
- # Example: Initialize new speakers found in the script if they aren't in the state
258
  # unique_speakers_in_script = get_speakers_from_script(script_text)
259
  # for speaker in unique_speakers_in_script:
260
  # if speaker not in updated_configs:
261
- # updated_configs[speaker] = {"voice": DEFAULT_GLOBAL_VOICE} # Or more detailed defaults based on tts_model
262
-
263
- # Add/update a key to ensure Gradio recognizes the state object has changed,
264
- # which helps in reliably triggering @gr.render if other inputs didn't change.
265
- updated_configs["_last_refresh_source"] = "button_click"
266
- updated_configs["_last_refresh_timestamp"] = datetime.datetime.now().isoformat()
267
-
268
  return updated_configs
269
 
270
-
271
  def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
272
- """
273
- Handles changes to the TTS model dropdown. Updates global controls and speaker_configs_state.
274
- """
275
  print(f"DEBUG (TTS Model Change): Model: {selected_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
276
-
277
- if not isinstance(current_speaker_configs, dict):
278
- current_speaker_configs = {}
279
-
280
- updated_configs = current_speaker_configs.copy() # Work on a copy
281
-
282
- # Potentially adjust or clear parts of the state that are model-specific
283
- # For example, if switching from tts-1 to gpt-4o-mini-tts, 'speed' settings might be irrelevant
284
- # and 'vibe' settings might need to be initialized.
285
- # for speaker_cfg in updated_configs.values():
286
- # if isinstance(speaker_cfg, dict):
287
- # if selected_model == "gpt-4o-mini-tts":
288
- # speaker_cfg.pop("speed", None) # Remove speed if it exists
289
- # if "vibe" not in speaker_cfg: speaker_cfg["vibe"] = DEFAULT_VIBE
290
- # elif selected_model in ["tts-1", "tts-1-hd"]:
291
- # speaker_cfg.pop("vibe", None)
292
- # speaker_cfg.pop("custom_instructions", None)
293
- # if "speed" not in speaker_cfg: speaker_cfg["speed"] = 1.0
294
-
295
- updated_configs["_last_refresh_source"] = "model_change"
296
- updated_configs["_last_refresh_timestamp"] = datetime.datetime.now().isoformat()
297
 
298
  is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
299
  is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
@@ -305,7 +268,6 @@ def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
305
  )
306
 
307
  def handle_speaker_config_method_visibility_change(method: str):
308
- """Handles changes to the speaker configuration method dropdown."""
309
  print(f"DEBUG (Config Method Change): Method: {method}")
310
  is_single_voice_visible = (method == "Single Voice (Global)")
311
  is_detailed_per_speaker_container_visible = (method == "Detailed Configuration (Per Speaker UI)")
 
9
  from functools import partial
10
  import datetime
11
 
 
12
  from utils.script_parser import parse_dialogue_script, calculate_cost
13
  from utils.openai_tts import synthesize_speech_line
14
  from utils.merge_audio import merge_mp3_files
 
20
  if not script_text or not script_text.strip():
21
  return []
22
  try:
23
+ parsed_lines, _ = parse_dialogue_script(script_text) # Assuming this returns (list_of_dicts, total_chars)
24
  if not parsed_lines:
25
  return []
26
  seen_speakers = set()
27
  ordered_unique_speakers = []
28
  for line_data in parsed_lines:
29
  speaker = line_data.get("speaker")
30
+ if speaker and speaker not in seen_speakers: # Ensure speaker is not None or empty
31
  ordered_unique_speakers.append(speaker)
32
  seen_speakers.add(speaker)
33
  return ordered_unique_speakers
34
+ except ValueError:
35
+ print("ValueError during script parsing in get_speakers_from_script.")
36
  return []
37
+ except Exception as e:
38
+ print(f"Unexpected error in get_speakers_from_script: {e}")
39
+ return []
40
+
41
 
42
  def handle_dynamic_accordion_input_change(
43
+ new_value, # Value from the changed component (e.g., voice_dropdown)
44
+ current_speaker_configs: dict, # Current value of speaker_configs_state
45
+ speaker_name: str, # Passed via partial from the event listener
46
+ config_key: str # Passed via partial from the event listener
 
47
  ):
48
  """
49
  Updates the speaker_configs_state when a dynamic UI element within an Accordion changes.
 
 
50
  """
51
  if not isinstance(current_speaker_configs, dict):
52
+ print(f"Warning: current_speaker_configs was not a dict in handle_dynamic_accordion_input_change. Type: {type(current_speaker_configs)}. Re-initializing.")
53
+ current_speaker_configs = {}
54
 
55
+ updated_configs = current_speaker_configs.copy()
56
 
57
  if speaker_name not in updated_configs:
58
  updated_configs[speaker_name] = {}
59
 
60
  updated_configs[speaker_name][config_key] = new_value
 
61
 
62
+ updated_configs["_last_dynamic_update_details"] = f"Speaker: {speaker_name}, Key: {config_key}, Val: {str(new_value)[:20]}, TS: {datetime.datetime.now().isoformat()}"
63
+
64
+ print(f"DEBUG (dynamic_input_change): Speaker '{speaker_name}' config '{config_key}' to '{str(new_value)[:50]}'. New state hint: {updated_configs.get('_last_dynamic_update_details')}")
65
  return updated_configs
66
 
67
 
 
69
  openai_api_key: str, async_openai_client, nsfw_api_url_template: str,
70
  dialogue_script: str, tts_model: str, pause_ms: int,
71
  speaker_config_method: str, global_voice_selection: str,
72
+ speaker_configs_state_dict: dict,
73
  global_speed: float,
74
  global_instructions: str,
75
  progress=gr.Progress(track_tqdm=True)
 
86
  try:
87
  parsed_lines, _ = parse_dialogue_script(dialogue_script)
88
  if not parsed_lines:
89
+ shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No valid lines found in script."
 
90
  except ValueError as e:
91
+ shutil.rmtree(job_audio_path_prefix); return None, None, f"Script parsing error: {str(e)}"
 
92
 
93
  if not isinstance(speaker_configs_state_dict, dict):
94
  print(f"Warning: speaker_configs_state_dict was not a dict in handle_script_processing. Re-initializing. Type: {type(speaker_configs_state_dict)}")
95
  speaker_configs_state_dict = {}
96
 
97
+ safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else (APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy")
98
 
99
+ speaker_voice_map = {}
100
  if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
101
  unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
102
  temp_voices_pool = APP_AVAILABLE_VOICES.copy()
103
+ if not temp_voices_pool: temp_voices_pool = [safe_default_global_voice]
104
 
105
  if speaker_config_method == "Random per Speaker":
106
  for spk_name in unique_script_speakers_for_map:
107
  speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
108
+ elif speaker_config_method == "A/B Round Robin" and temp_voices_pool:
109
  for i, spk_name in enumerate(unique_script_speakers_for_map):
110
  speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
111
 
 
 
 
 
112
  task_info_list = []
 
113
  for i, line_data in enumerate(parsed_lines):
114
  speaker_name = line_data["speaker"]
115
+ line_text = line_data["text"]
116
+ line_id = line_data["id"]
117
+
118
  line_voice = safe_default_global_voice
119
  line_speed = global_speed
120
  line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
121
 
122
  if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
123
  spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
124
+ line_voice = spk_cfg.get("voice", safe_default_global_voice)
125
+
126
  if tts_model in ["tts-1", "tts-1-hd"]:
127
  line_speed = float(spk_cfg.get("speed", global_speed))
128
+ else: line_speed = 1.0
 
129
 
130
  if tts_model == "gpt-4o-mini-tts":
131
  vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
132
  custom_instr_raw = spk_cfg.get("custom_instructions", "")
133
  custom_instr = custom_instr_raw.strip() if custom_instr_raw else ""
134
+
135
+ current_line_specific_instructions = None
136
  if vibe == "Custom..." and custom_instr:
137
+ current_line_specific_instructions = custom_instr
138
  elif vibe != "None" and vibe != "Custom..." and PREDEFINED_VIBES.get(vibe):
139
+ current_line_specific_instructions = PREDEFINED_VIBES[vibe]
140
+
141
+ # If per-speaker instructions are set, they take precedence. Otherwise, fall back to global instructions.
142
+ line_instructions = current_line_specific_instructions if current_line_specific_instructions is not None else line_instructions
143
+ else: # tts-1, tts-1-hd do not use vibe/custom_instructions from per-speaker UI
144
+ # They will use the global_instructions if set.
145
+ pass # line_instructions already set to global_instructions or None
 
146
 
147
  elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
148
  line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
149
+ # For these methods, speed and instructions remain global
150
+ if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
 
 
151
 
152
+ if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
 
 
153
 
154
+ out_fn = os.path.join(job_audio_path_prefix, f"line_{line_id}_{speaker_name.replace(' ','_')}.mp3")
 
155
  progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
156
 
157
  current_task = synthesize_speech_line(
158
+ client=async_openai_client, text=line_text, voice=line_voice,
159
  output_path=out_fn, model=tts_model, speed=line_speed,
160
  instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
161
+ line_index=line_id
162
  )
163
+ task_info_list.append({"id": line_id, "speaker": speaker_name, "task": current_task, "out_fn": out_fn})
 
 
 
164
 
165
+ processed_results_map = {} # Store by line_id for easier lookup
166
  for info in task_info_list:
167
  try:
168
  result_path = await info['task']
169
+ processed_results_map[info['id']] = {"path": result_path, "speaker": info['speaker']}
170
  except Exception as e:
171
  print(f"Error synthesizing line ID {info['id']} ({info['speaker']}): {e}")
172
+ processed_results_map[info['id']] = {"path": None, "error": e, "speaker": info['speaker']}
173
 
174
+ ordered_files_for_merge_and_zip = []
175
+ for p_line in parsed_lines: # Iterate through original parsed lines to maintain order
176
+ line_id = p_line['id']
177
+ res = processed_results_map.get(line_id)
178
+ if res and res.get("path") and os.path.exists(res["path"]) and os.path.getsize(res["path"]) > 0:
179
+ ordered_files_for_merge_and_zip.append(res["path"])
180
+ else:
181
+ # File was not successfully created or result not found, append None placeholder
182
+ ordered_files_for_merge_and_zip.append(None)
183
+ if res: print(f"Skipped or failed synthesizing line ID {line_id} ({res.get('speaker', 'Unknown')}) for merge/zip.")
184
+ else: print(f"Result for line ID {line_id} not found in processed_results_map.")
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
188
 
189
  if not valid_files_for_zip:
190
+ shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio was successfully synthesized for any line."
 
191
 
192
  zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
193
  with zipfile.ZipFile(zip_fn, 'w') as zf:
194
  for f_path in valid_files_for_zip:
195
  zf.write(f_path, os.path.basename(f_path))
196
 
197
+ files_to_actually_merge = valid_files_for_zip # Already ordered and filtered
 
198
  merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
199
  merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
200
 
201
  status_msg = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
202
  if len(valid_files_for_zip) < len(parsed_lines): status_msg += "Some lines may have failed. "
 
203
  if not merged_path and len(valid_files_for_zip) > 0 : status_msg += "Merging audio failed. "
204
  elif not merged_path: status_msg = "No audio to merge (all lines failed or were skipped)."
205
  else: status_msg += "Merged audio generated."
 
208
  merged_path if merged_path and os.path.exists(merged_path) else None,
209
  status_msg)
210
 
 
211
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
 
212
  if not dialogue_script or not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
213
  try:
214
  parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
215
  if not parsed_lines: return "Cost: $0.00 (No valid lines in script)"
216
  cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
217
  return f"Estimated Cost for {len(parsed_lines)} lines ({total_chars} chars): ${cost:.6f}"
218
+ except ValueError as e: return f"Cost calculation error: {str(e)}"
219
+ except Exception as e: return f"An unexpected error: {str(e)}"
 
 
 
220
 
221
  def handle_load_refresh_per_speaker_ui_trigger(script_text: str, current_speaker_configs: dict, tts_model: str):
 
 
 
 
222
  print(f"DEBUG (Load/Refresh Trigger): Script: '{script_text[:30]}...', Model: {tts_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
223
+ if not isinstance(current_speaker_configs, dict): current_speaker_configs = {}
224
+ updated_configs = current_speaker_configs.copy()
225
 
226
+ # Optionally, you might want to pre-populate settings for new speakers here
 
 
 
 
 
 
227
  # unique_speakers_in_script = get_speakers_from_script(script_text)
228
  # for speaker in unique_speakers_in_script:
229
  # if speaker not in updated_configs:
230
+ # updated_configs[speaker] = {"voice": DEFAULT_GLOBAL_VOICE} # Default init
231
+ # if tts_model in ["tts-1", "tts-1-hd"]: updated_configs[speaker]["speed"] = 1.0
232
+ # elif tts_model == "gpt-4o-mini-tts": updated_configs[speaker]["vibe"] = DEFAULT_VIBE
233
+
234
+ updated_configs["_last_action_source"] = "load_refresh_button"
235
+ updated_configs["_last_action_timestamp"] = datetime.datetime.now().isoformat()
 
236
  return updated_configs
237
 
 
238
  def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
 
 
 
239
  print(f"DEBUG (TTS Model Change): Model: {selected_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
240
+ if not isinstance(current_speaker_configs, dict): current_speaker_configs = {}
241
+ updated_configs = current_speaker_configs.copy()
242
+
243
+ # When model changes, you might want to reset or adjust model-specific settings for all speakers
244
+ # For example, 'speed' is for tts-1, 'vibe' for gpt-4o-mini-tts
245
+ for speaker_name_key in list(updated_configs.keys()): # Iterate over keys if modifying dict
246
+ if isinstance(updated_configs[speaker_name_key], dict): # Check if it's a speaker config dict
247
+ if selected_model == "gpt-4o-mini-tts":
248
+ updated_configs[speaker_name_key].pop("speed", None)
249
+ if "vibe" not in updated_configs[speaker_name_key]:
250
+ updated_configs[speaker_name_key]["vibe"] = DEFAULT_VIBE
251
+ elif selected_model in ["tts-1", "tts-1-hd"]:
252
+ updated_configs[speaker_name_key].pop("vibe", None)
253
+ updated_configs[speaker_name_key].pop("custom_instructions", None)
254
+ if "speed" not in updated_configs[speaker_name_key]:
255
+ updated_configs[speaker_name_key]["speed"] = 1.0
256
+ # Add other model-specific adjustments if needed
257
+
258
+ updated_configs["_last_action_source"] = "tts_model_change"
259
+ updated_configs["_last_action_timestamp"] = datetime.datetime.now().isoformat()
 
260
 
261
  is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
262
  is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
 
268
  )
269
 
270
  def handle_speaker_config_method_visibility_change(method: str):
 
271
  print(f"DEBUG (Config Method Change): Method: {method}")
272
  is_single_voice_visible = (method == "Single Voice (Global)")
273
  is_detailed_per_speaker_container_visible = (method == "Detailed Configuration (Per Speaker UI)")
ui_layout.py CHANGED
@@ -1,6 +1,5 @@
1
  # FILE: ui_layout.py
2
  import gradio as gr
3
- from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES # Assuming this import path is correct
4
 
5
  # --- UI Constants and Configuration ---
6
  TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
@@ -12,10 +11,17 @@ SPEAKER_CONFIG_METHODS = [
12
  "A/B Round Robin",
13
  "Detailed Configuration (Per Speaker UI)"
14
  ]
15
- DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker" # Or your preferred default
16
-
17
- APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
18
- if not APP_AVAILABLE_VOICES: # Absolute fallback
 
 
 
 
 
 
 
19
  APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
20
 
21
 
@@ -34,17 +40,17 @@ PREDEFINED_VIBES = {
34
  }
35
  VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
36
  DEFAULT_VIBE = "None"
37
- DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0]
38
 
39
 
40
  # --- UI Element Creation Functions ---
41
 
42
  def create_main_input_components(model_default_value):
43
  """Creates the main input components for script, model, pause, and global settings."""
44
- with gr.Row():
45
- with gr.Column(scale=2):
46
  script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\\n[Speaker2] How are you today?", lines=10, elem_id="script_input_area")
47
- with gr.Column(scale=1):
48
  tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value, elem_id="tts_model_dropdown")
49
  pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50, elem_id="pause_input")
50
 
@@ -71,20 +77,16 @@ def create_speaker_config_components():
71
  elem_id="speaker_config_method_dropdown"
72
  )
73
 
74
- with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
75
  global_voice_dropdown = gr.Dropdown(
76
  APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True,
77
  elem_id="global_voice_dropdown"
78
  )
79
 
80
- # This Column group is the container where the @gr.render function in app.py
81
- # will place its dynamically generated UI.
82
  detailed_per_speaker_ui_group_container = gr.Column(
83
  visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"),
84
  elem_id="detailed_per_speaker_ui_group_container"
85
  )
86
- # The button and its help text are part of this container, defined statically.
87
- # The dynamic content from @gr.render will appear below them within this column.
88
  with detailed_per_speaker_ui_group_container:
89
  load_per_speaker_ui_button = gr.Button(
90
  "Load/Refresh Per-Speaker Settings UI (from Script Above)",
@@ -94,24 +96,24 @@ def create_speaker_config_components():
94
  "<small>Click button above to populate settings for each speaker found in the script. "
95
  "Settings are applied per-speaker. If script changes, click again to refresh.</small>"
96
  )
97
- # The dynamic UI from @gr.render in app.py will be inserted here.
98
 
99
  return (
100
  speaker_config_method_dropdown,
101
  single_voice_group,
102
  global_voice_dropdown,
103
- detailed_per_speaker_ui_group_container, # Container for @gr.render content
104
  load_per_speaker_ui_button
105
  )
106
 
107
  def create_action_and_output_components():
108
  """Creates buttons for actions (cost, generate) and output display areas."""
109
- with gr.Row():
110
  calculate_cost_button = gr.Button("Calculate Estimated Cost", elem_id="calculate_cost_button")
111
  generate_button = gr.Button("Generate Audio", variant="primary", elem_id="generate_button")
112
 
113
  cost_output = gr.Textbox(label="Estimated Cost", interactive=False, elem_id="cost_output")
114
- with gr.Row():
115
  individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)", elem_id="individual_lines_zip_output")
116
  merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath", elem_id="merged_dialogue_mp3_output")
117
  status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5, elem_id="status_output")
@@ -123,7 +125,6 @@ def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=Non
123
  example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\\n[Alice] Let's see how this sounds."
124
  example_script_2 = "[Narrator] This is a short story.\\n[CharacterA] Once upon a time...\\n[Narrator] ...there was a Gradio app.\\n[CharacterB] And it could talk!"
125
 
126
- # Ensure example state matches the number of inputs for `speaker_configs_state` (which is one dictionary)
127
  examples_data = [
128
  [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
129
  [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
@@ -148,7 +149,7 @@ def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=Non
148
  inputs=inputs_for_examples,
149
  outputs=outputs_for_examples,
150
  fn=process_fn,
151
- cache_examples=False, # Caching can be complex with external API calls
152
  examples_per_page=5,
153
  label="Example Scripts (Click to Load & Run)",
154
  run_on_click=True
 
1
  # FILE: ui_layout.py
2
  import gradio as gr
 
3
 
4
  # --- UI Constants and Configuration ---
5
  TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
 
11
  "A/B Round Robin",
12
  "Detailed Configuration (Per Speaker UI)"
13
  ]
14
+ DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
15
+
16
+ # Assuming utils.openai_tts.OPENAI_VOICES exists and provides a list of voice names
17
+ # If not, define APP_AVAILABLE_VOICES directly here for now.
18
+ try:
19
+ from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES
20
+ APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
21
+ if not APP_AVAILABLE_VOICES: # Fallback if OPENAI_VOICES is empty
22
+ APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
23
+ except ImportError:
24
+ print("Warning: Could not import OPENAI_VOICES from utils.openai_tts. Using default list.")
25
  APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
26
 
27
 
 
40
  }
41
  VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
42
  DEFAULT_VIBE = "None"
43
+ DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"
44
 
45
 
46
  # --- UI Element Creation Functions ---
47
 
48
  def create_main_input_components(model_default_value):
49
  """Creates the main input components for script, model, pause, and global settings."""
50
+ with gr.Row(elem_id="main_input_row"):
51
+ with gr.Column(scale=2, elem_id="script_input_column"):
52
  script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\\n[Speaker2] How are you today?", lines=10, elem_id="script_input_area")
53
+ with gr.Column(scale=1, elem_id="model_config_column"):
54
  tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value, elem_id="tts_model_dropdown")
55
  pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50, elem_id="pause_input")
56
 
 
77
  elem_id="speaker_config_method_dropdown"
78
  )
79
 
80
+ with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), elem_id="single_voice_config_group") as single_voice_group:
81
  global_voice_dropdown = gr.Dropdown(
82
  APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True,
83
  elem_id="global_voice_dropdown"
84
  )
85
 
 
 
86
  detailed_per_speaker_ui_group_container = gr.Column(
87
  visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"),
88
  elem_id="detailed_per_speaker_ui_group_container"
89
  )
 
 
90
  with detailed_per_speaker_ui_group_container:
91
  load_per_speaker_ui_button = gr.Button(
92
  "Load/Refresh Per-Speaker Settings UI (from Script Above)",
 
96
  "<small>Click button above to populate settings for each speaker found in the script. "
97
  "Settings are applied per-speaker. If script changes, click again to refresh.</small>"
98
  )
99
+ # The @gr.render dynamic UI will be placed here by app.py
100
 
101
  return (
102
  speaker_config_method_dropdown,
103
  single_voice_group,
104
  global_voice_dropdown,
105
+ detailed_per_speaker_ui_group_container,
106
  load_per_speaker_ui_button
107
  )
108
 
109
  def create_action_and_output_components():
110
  """Creates buttons for actions (cost, generate) and output display areas."""
111
+ with gr.Row(elem_id="action_buttons_row"):
112
  calculate_cost_button = gr.Button("Calculate Estimated Cost", elem_id="calculate_cost_button")
113
  generate_button = gr.Button("Generate Audio", variant="primary", elem_id="generate_button")
114
 
115
  cost_output = gr.Textbox(label="Estimated Cost", interactive=False, elem_id="cost_output")
116
+ with gr.Row(elem_id="output_files_row"):
117
  individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)", elem_id="individual_lines_zip_output")
118
  merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath", elem_id="merged_dialogue_mp3_output")
119
  status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5, elem_id="status_output")
 
125
  example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\\n[Alice] Let's see how this sounds."
126
  example_script_2 = "[Narrator] This is a short story.\\n[CharacterA] Once upon a time...\\n[Narrator] ...there was a Gradio app.\\n[CharacterB] And it could talk!"
127
 
 
128
  examples_data = [
129
  [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
130
  [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
 
149
  inputs=inputs_for_examples,
150
  outputs=outputs_for_examples,
151
  fn=process_fn,
152
+ cache_examples=False,
153
  examples_per_page=5,
154
  label="Example Scripts (Click to Load & Run)",
155
  run_on_click=True