Spaces:

abocha
/

esl-dialogue-tts

Running

App Files Files

xet

Community

abocha committed on May 7

Commit

d44dfc0

1 Parent(s): 59d4d10

new ui approach

Browse files

Files changed (1) hide show

app.py +238 -223

app.py CHANGED Viewed

@@ -5,9 +5,10 @@ import tempfile
 import shutil
 import zipfile
 import random
-import json
-import pandas as pd # Import pandas
 from openai import AsyncOpenAI
 from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
 from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
@@ -45,188 +46,160 @@ SPEAKER_CONFIG_METHODS = [
     "Single Voice (Global)",
     "Random per Speaker",
     "A/B Round Robin",
-    "Detailed Configuration (Table)" # Using Table method
 ]
-DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
-_speaker_config_cache = {}
-# --- Speaker Configuration Table Logic ---
-DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
-# Column indices for clarity (optional, but can help maintainability)
-DATAFRAME_COL_SPEAKER = 0
-DATAFRAME_COL_VOICE = 1
-DATAFRAME_COL_SPEED = 2
-DATAFRAME_COL_INSTRUCTIONS = 3
 def get_speakers_from_script(script_text):
-    if not script_text.strip():
-        return []
     try:
         parsed_lines, _ = parse_dialogue_script(script_text)
-        unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
-        return unique_speakers
-    except ValueError:
-        return []
-def convert_df_to_config_map(df_data_obj):
-    config_map = {}
-    # gr.DataFrame's value is a dict like {'data': [[row1_val1, row1_val2], ...], 'headers': [...]}
-    # or if constructed with pandas, it might be a pandas DataFrame directly.
-    # Let's handle if it's the dict structure.
-    # Ensure df_data_obj is not None and has 'data'
-    if not df_data_obj or 'data' not in df_data_obj or not isinstance(df_data_obj['data'], list):
-        if isinstance(df_data_obj, pd.DataFrame): # If it's already a pandas DF
-            data_list = df_data_obj.values.tolist()
-        else: # Unrecognized format or empty
-            print("DataFrame data is empty or in an unrecognized format for conversion.")
-            return config_map
-    else:
-        data_list = df_data_obj['data']
-    for row in data_list:
-        # Defensive check for row length, assuming DATAFRAME_HEADERS length
-        if len(row) < len(DATAFRAME_HEADERS):
-            print(f"Skipping malformed row in DataFrame data during conversion: {row}")
-            continue
-        speaker_name = str(row[DATAFRAME_COL_SPEAKER]).strip()
-        voice = str(row[DATAFRAME_COL_VOICE]).strip()
-        speed_str = str(row[DATAFRAME_COL_SPEED]).strip()
-        instructions = str(row[DATAFRAME_COL_INSTRUCTIONS]).strip()
-        if not speaker_name or speaker_name == "(Load speakers first)" or speaker_name == "(No speakers detected)":
-            continue # Skip placeholder/empty speaker names
-        if voice not in APP_AVAILABLE_VOICES:
-            print(f"Warning: Invalid voice '{voice}' for '{speaker_name}'. Using '{APP_AVAILABLE_VOICES[0]}'.")
-            voice = APP_AVAILABLE_VOICES[0]
-        cfg = {"voice": voice}
-        if speed_str:
-            try:
-                speed_val = float(speed_str)
-                if 0.25 <= speed_val <= 4.0: cfg["speed"] = speed_val
-                else: print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range. Ignoring.")
-            except ValueError: print(f"Warning: Invalid speed '{speed_str}' for '{speaker_name}'. Ignoring.")
-        if instructions: cfg["instructions"] = instructions
-        config_map[speaker_name] = cfg
-    return config_map
-def load_speakers_for_table(script_text, current_df_obj, selected_tts_model):
-    current_config_map = {}
-    if isinstance(current_df_obj, pd.DataFrame) and not current_df_obj.empty:
-         for _, row_series in current_df_obj.iterrows():
-            row = row_series.tolist() # Convert Series to list
-            if len(row) >= len(DATAFRAME_HEADERS):
-                speaker = str(row[DATAFRAME_COL_SPEAKER])
-                current_config_map[speaker] = {
-                    "voice": str(row[DATAFRAME_COL_VOICE]),
-                    "speed": str(row[DATAFRAME_COL_SPEED]),
-                    "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
-                }
-    elif isinstance(current_df_obj, dict) and current_df_obj.get('data'): # Handle raw dict from Gradio
-        for row in current_df_obj['data']:
-            if len(row) >= len(DATAFRAME_HEADERS):
-                speaker = str(row[DATAFRAME_COL_SPEAKER])
-                current_config_map[speaker] = {
-                    "voice": str(row[DATAFRAME_COL_VOICE]),
-                    "speed": str(row[DATAFRAME_COL_SPEED]),
-                    "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
-                }
-    unique_speakers = get_speakers_from_script(script_text)
-    new_df_rows = []
-    default_voice, default_speed, default_instr = APP_AVAILABLE_VOICES[0], "1.0", ""
-    for speaker in unique_speakers:
-        s_cfg = current_config_map.get(speaker, {})
-        new_df_rows.append([
-            speaker,
-            s_cfg.get("voice", default_voice),
-            s_cfg.get("speed", default_speed),
-            s_cfg.get("instructions", default_instr)
-        ])
-    speed_info = " (tts-1/hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A)"
-    instr_info = " (gpt-4o-mini)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A)"
-    dynamic_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
-                       f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
-                       f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
-    if not new_df_rows:
-        new_df_rows.append(["(No speakers detected in script)", default_voice, default_speed, default_instr])
-    return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=dynamic_headers)
-def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
-                           global_selected_voice, detailed_config_map_from_table): # Renamed last param
-    global _speaker_config_cache
-    if _speaker_config_cache.get("__method") != speaker_config_method or \
-       _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
-        _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
-    base_config = {"voice": APP_AVAILABLE_VOICES[0], "speed": None, "instructions": None}
-    if speaker_config_method == "Single Voice (Global)":
-        base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
-        return base_config
-    if speaker_config_method == "Detailed Configuration (Table)":
-        if speaker_name in detailed_config_map_from_table:
-            speaker_specific = detailed_config_map_from_table[speaker_name]
-            return {
-                "voice": speaker_specific.get("voice", base_config["voice"]),
-                "speed": speaker_specific.get("speed"),
-                "instructions": speaker_specific.get("instructions")
             }
-        else:
-            print(f"Warning: Speaker '{speaker_name}' not in Table Config. Using default.")
-            return base_config
-    # For "Random" and "A/B" (same as before)
-    if speaker_name not in _speaker_config_cache:
-        if speaker_config_method == "Random per Speaker":
-            available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
-            if not _speaker_config_cache.get("__all_assigned_random"):
-                 for i, spk_unique in enumerate(unique_script_speakers):
-                     if spk_unique not in _speaker_config_cache:
-                        _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
-                 _speaker_config_cache["__all_assigned_random"] = True
-            if speaker_name not in _speaker_config_cache:
-                 _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
-        elif speaker_config_method == "A/B Round Robin":
-            if not _speaker_config_cache.get("__all_assigned_ab"):
-                for i, spk_unique in enumerate(unique_script_speakers):
-                     if spk_unique not in _speaker_config_cache:
-                        _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
-                _speaker_config_cache["__all_assigned_ab"] = True
-            if speaker_name not in _speaker_config_cache:
-                speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
-                _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
-        else:
-             _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
-    cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
-    return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
 async def handle_script_processing(
     dialogue_script: str, tts_model: str, pause_ms: int,
     speaker_config_method: str, global_voice_selection: str,
-    speaker_details_df_value: pd.DataFrame | dict, # Can be pd.DataFrame or dict from Gradio
     global_speed: float,
     global_instructions: str, progress=gr.Progress(track_tqdm=True)):
-    global _speaker_config_cache; _speaker_config_cache = {}
     if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
     if not dialogue_script.strip(): return None, None, "Error: Script empty."
@@ -240,36 +213,60 @@ async def handle_script_processing(
         if not parsed_lines: return None, None, "Error: No valid lines."
     except ValueError as e: return None, None, f"Script error: {str(e)}"
-    unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
-    parsed_detailed_config_map = {}
-    if speaker_config_method == "Detailed Configuration (Table)":
-        try:
-            parsed_detailed_config_map = convert_df_to_config_map(speaker_details_df_value)
-        except Exception as e: return None, None, f"Error processing Speaker Table: {str(e)}"
     tasks, line_audio_files = [], [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
-        speaker_base_cfg = get_config_for_speaker(
-            speaker_name, speaker_config_method, unique_speakers,
-            global_voice_selection, parsed_detailed_config_map
-        )
-        line_voice = speaker_base_cfg["voice"]
-        eff_speed = global_speed
-        if speaker_base_cfg.get("speed") is not None: eff_speed = speaker_base_cfg["speed"]
-        eff_instr = global_instructions if global_instructions and global_instructions.strip() else None
-        if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
-            eff_instr = speaker_base_cfg["instructions"]
         out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
         progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
         tasks.append(synthesize_speech_line(
             client=async_openai_client, text=line_data["text"], voice=line_voice,
-            output_path=out_fn, model=tts_model, speed=eff_speed,
-            instructions=eff_instr, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
             line_index=line_data['id']))
     results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -298,7 +295,9 @@ async def handle_script_processing(
            merged_path if merged_path and os.path.exists(merged_path) else None,
            status)
 def handle_calculate_cost(dialogue_script: str, tts_model: str):
     if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
     try:
         parsed, chars = parse_dialogue_script(dialogue_script)
@@ -307,11 +306,15 @@ def handle_calculate_cost(dialogue_script: str, tts_model: str):
         return f"Est. Cost: ${cost:.6f}"
     except Exception as e: return f"Cost calc error: {str(e)}"
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# Dialogue Script to Speech (Table Config)")
     if not OPENAI_API_KEY or not async_openai_client:
         gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")
     with gr.Row():
         with gr.Column(scale=2):
             script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
@@ -321,26 +324,25 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
             global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
-    gr.Markdown("### Speaker Configuration")
-    with gr.Row():
-        speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
-        global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)
-    # Detailed Table Configuration UI
-    # Visibility of this whole column is controlled by the dropdown selection
-    with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")) as detailed_table_config_group:
-        load_speakers_button = gr.Button("Load/Refresh Speakers into Table (from Script Above)")
-        gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. "
-                    "Edit table below. Speed/Instructions column applicability depends on TTS model.</small>")
-        speaker_details_df = gr.DataFrame(
-            label="Per-Speaker Settings",
-            headers=DATAFRAME_HEADERS, # Initial static headers
-            value=pd.DataFrame([["(Load speakers first or type here)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
-            interactive=True,
-            row_count=(1, "dynamic"), # Allow dynamic rows, starting with 1
-            col_count=(len(DATAFRAME_HEADERS), "fixed")
         )
     with gr.Row():
         calculate_cost_button = gr.Button("Calculate Cost")
         generate_button = gr.Button("Generate Audio", variant="primary")
@@ -351,61 +353,74 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
     status_output = gr.Textbox(label="Status", interactive=False, lines=1)
-    def update_model_controls_visibility(selected_model):
         is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
         is_gpt_mini = selected_model == "gpt-4o-mini-tts"
-        speed_info = " (tts-1/hd)" if is_tts1 else " (N/A)"
-        instr_info = " (gpt-4o-mini)" if is_gpt_mini else " (N/A)"
-        new_df_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
-                           f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
-                           f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
         return {
             global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
             global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
-            speaker_details_df: gr.update(headers=new_df_headers) # Update DF headers dynamically
         }
     tts_model_dropdown.change(
         fn=update_model_controls_visibility,
-        inputs=[tts_model_dropdown],
-        outputs=[global_speed_input, global_instructions_input, speaker_details_df]
     )
     def update_speaker_config_method_visibility(method):
         is_single = (method == "Single Voice (Global)")
-        is_detailed_table = (method == "Detailed Configuration (Table)")
         return {
-            global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
-            detailed_table_config_group: gr.update(visible=is_detailed_table),
         }
     speaker_config_method_dropdown.change(
         fn=update_speaker_config_method_visibility,
         inputs=[speaker_config_method_dropdown],
-        outputs=[global_voice_dropdown, detailed_table_config_group]
     )
-    load_speakers_button.click(
-        fn=load_speakers_for_table,
-        inputs=[script_input, speaker_details_df, tts_model_dropdown],
-        outputs=[speaker_details_df]
     )
     calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
     generate_button.click(
         fn=handle_script_processing,
-        inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
-                global_voice_dropdown, speaker_details_df,
-                global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
     gr.Markdown("## Examples")
     gr.Examples(
         examples=[
-            ["[Alice] This table config is new!\n[Bob] Indeed it is.", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""],
-            ["[Narrator] Global voice example.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
         ],
-        inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
-                speaker_details_df, global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
         fn=handle_script_processing, cache_examples=False)

 import shutil
 import zipfile
 import random
+import json # Keep for other potential uses, though not primary for this config
+import pandas as pd # Keep for now, in case other features might use it
 from openai import AsyncOpenAI
+from functools import partial # For dynamic event handlers
 from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
 from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
     "Single Voice (Global)",
     "Random per Speaker",
     "A/B Round Robin",
+    "Detailed Configuration (Per Speaker UI)" # New Method Name
 ]
+DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
+# Vibe presets offered per speaker: preset name -> instruction text.
+# handle_script_processing looks the text up by preset name for ordinary presets;
+# "None" and "Custom..." are matched there by name, so their mapped values
+# ("" and "CUSTOM") are not read by the lookup visible in this file.
+PREDEFINED_VIBES = {
+    "None": "", # No specific instruction
+    "Calm": "Speak in a calm, composed, and relaxed manner.",
+    "Excited": "Speak with an energetic, enthusiastic, and lively tone.",
+    "Happy": "Speak with a cheerful, bright, and joyful voice.",
+    "Sad": "Speak with a sorrowful, melancholic, and dejected tone.",
+    "Whisper": "Speak softly, as if whispering.",
+    "Angry": "Speak with a strong, firm, and possibly agitated voice.",
+    "Fearful": "Speak with a trembling, hesitant, and scared voice.",
+    "Formal": "Speak in a clear, precise, and professional tone, suitable for a formal address.",
+    "Authoritative": "Speak with a commanding, confident, and firm voice.",
+    "Friendly": "Speak in a warm, approachable, and amiable manner.",
+    "Custom...": "CUSTOM" # Special value indicating custom text should be used
+}
+# Dropdown choices for the per-speaker vibe selector, in declaration order.
+VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
+# Preset assigned to a speaker that has no stored vibe yet.
+DEFAULT_VIBE = "None"
 def get_speakers_from_script(script_text):
+    """Return the sorted, de-duplicated speaker names found in a dialogue script.
+
+    Args:
+        script_text: Raw script text; ``None`` is treated like an empty script.
+
+    Returns:
+        A sorted list of unique speaker names, or ``[]`` when the script is
+        empty/whitespace-only or ``parse_dialogue_script`` raises ``ValueError``.
+    """
+    # Guard None as well as blank input: calling .strip() on None would raise.
+    if not script_text or not script_text.strip():
+        return []
     try:
         parsed_lines, _ = parse_dialogue_script(script_text)
+    except ValueError:
+        # An unparsable script simply has no speakers to configure.
+        return []
+    return sorted({line["speaker"] for line in parsed_lines})
+def handle_dynamic_input_change(new_value, current_configs_state_dict, speaker_name, config_key, tts_model):
+    """Record one changed per-speaker setting in the speaker-config state dict.
+
+    Args:
+        new_value: The new value emitted by the UI component.
+        current_configs_state_dict: Raw dictionary held by gr.State, mapping
+            speaker name -> settings dict. ``None`` is treated as empty.
+        speaker_name: Speaker whose setting changed.
+        config_key: Setting being written (e.g. "voice", "speed", "vibe",
+            "custom_instructions").
+        tts_model: Not used by this handler; kept so callers can keep binding
+            it by keyword.
+
+    Returns:
+        The state dictionary (updated in place when one was supplied) with
+        ``[speaker_name][config_key]`` set to ``new_value``.
+    """
+    # The other consumers of this state tolerate None; do the same here so a
+    # change event can never fail with "argument of type 'NoneType' is not iterable".
+    if current_configs_state_dict is None:
+        current_configs_state_dict = {}
+    current_configs_state_dict.setdefault(speaker_name, {})[config_key] = new_value
+    # This handler only stores the value; it does not change the visibility or
+    # interactivity of any component.
+    return current_configs_state_dict
+def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_model):
+    """Build the per-speaker settings accordions and back-fill the config state.
+
+    Args:
+        script_text: Dialogue script; speakers come from get_speakers_from_script.
+        current_configs_state_dict: Raw dict from gr.State mapping speaker name ->
+            settings; None is treated as empty. Updated in place: every speaker
+            found in the script gains any missing keys, other entries are kept.
+        tts_model: Selected TTS model; decides whether the speed slider is
+            interactive and how the speed/vibe controls are labelled.
+
+    Returns:
+        A tuple ``(components, state)``: one gr.Accordion per speaker (or a
+        single gr.Markdown notice when no speaker is detected) and the state dict.
+    """
+    # NOTE(review): components and their listeners are created inside an event
+    # handler and handed back as the value of a layout column. Gradio documents
+    # @gr.render for dynamic UIs - confirm this approach works on the installed version.
+    unique_speakers = get_speakers_from_script(script_text)
+    # Ensure state dict is not None (Gradio might pass None initially for gr.State)
+    if current_configs_state_dict is None:
+        current_configs_state_dict = {}
+    # Single source of truth for defaults; setdefault keeps anything already chosen.
+    # Speakers no longer present in the script are deliberately left in the state.
+    default_settings = {
+        "voice": APP_AVAILABLE_VOICES[0],
+        "speed": 1.0,
+        "vibe": DEFAULT_VIBE,
+        "custom_instructions": "",
+    }
+    for speaker_name in unique_speakers:
+        speaker_cfg = current_configs_state_dict.setdefault(speaker_name, {})
+        for key, default in default_settings.items():
+            speaker_cfg.setdefault(key, default)
+    if not unique_speakers:
+        notice = gr.Markdown("No speakers detected in the script, or script is empty. Type a script and click 'Load/Refresh' again.")
+        return [notice], current_configs_state_dict
+    # Model-dependent flags and labels are the same for every speaker: compute once.
+    speed_interactive = tts_model in ["tts-1", "tts-1-hd"]
+    instructions_relevant = tts_model == "gpt-4o-mini-tts"
+    speed_slider_label = "Speech Speed" + (" (Active for tts-1/hd)" if speed_interactive else " (N/A for this model)")
+    vibe_label = "Vibe/Emotion Preset" + (" (For gpt-4o-mini-tts)" if instructions_relevant else " (Less impact on other models)")
+
+    def state_updater(speaker_name, config_key):
+        # Bind the speaker and field so the generic handler knows what to write.
+        return partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key=config_key, tts_model=tts_model)
+
+    new_ui_components = []
+    for speaker_name in unique_speakers:
+        speaker_cfg = current_configs_state_dict[speaker_name]
+        with gr.Accordion(label=f"Settings for: {speaker_name}", open=False) as speaker_accordion:
+            # Voice Dropdown
+            voice_dd = gr.Dropdown(
+                label="Voice", choices=APP_AVAILABLE_VOICES, value=speaker_cfg["voice"], interactive=True
+            )
+            voice_dd.change(
+                fn=state_updater(speaker_name, "voice"),
+                inputs=[voice_dd, speaker_configs_state],
+                outputs=[speaker_configs_state]
+            )
+            # Speed Slider
+            speed_slider = gr.Slider(
+                label=speed_slider_label, minimum=0.25, maximum=4.0, value=speaker_cfg["speed"],
+                step=0.05, interactive=speed_interactive
+            )
+            if speed_interactive:
+                # release (not change) avoids a state write for every drag step.
+                speed_slider.release(
+                    fn=state_updater(speaker_name, "speed"),
+                    inputs=[speed_slider, speaker_configs_state],
+                    outputs=[speaker_configs_state]
+                )
+            # Vibe Dropdown
+            vibe_dd = gr.Dropdown(
+                label=vibe_label, choices=VIBE_CHOICES, value=speaker_cfg["vibe"], interactive=True
+            )
+            vibe_dd.change(
+                fn=state_updater(speaker_name, "vibe"),
+                inputs=[vibe_dd, speaker_configs_state],
+                outputs=[speaker_configs_state]
+            )
+            # Custom Instructions Textbox: always editable; whether the text is
+            # used is decided when the audio is generated.
+            custom_instr_tb = gr.Textbox(
+                label="Custom Instructions",
+                value=speaker_cfg["custom_instructions"],
+                placeholder="Only used if Vibe is 'Custom...'. Overrides Vibe.",
+                lines=2, interactive=True
+            )
+            # input fires on each keystroke, keeping the state current while typing.
+            custom_instr_tb.input(
+                fn=state_updater(speaker_name, "custom_instructions"),
+                inputs=[custom_instr_tb, speaker_configs_state],
+                outputs=[speaker_configs_state]
+            )
+        new_ui_components.append(speaker_accordion)
+    return new_ui_components, current_configs_state_dict
 async def handle_script_processing(
     dialogue_script: str, tts_model: str, pause_ms: int,
     speaker_config_method: str, global_voice_selection: str,
+    # No more df_value, instead we use speaker_configs_state_dict from gr.State
+    speaker_configs_state_dict: dict,
     global_speed: float,
     global_instructions: str, progress=gr.Progress(track_tqdm=True)):
     if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
     if not dialogue_script.strip(): return None, None, "Error: Script empty."
         if not parsed_lines: return None, None, "Error: No valid lines."
     except ValueError as e: return None, None, f"Script error: {str(e)}"
+    # Ensure state dict is usable
+    if speaker_configs_state_dict is None: speaker_configs_state_dict = {}
     tasks, line_audio_files = [], [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
+        # Determine voice, speed, instructions for this line
+        line_voice = global_voice_selection
+        line_speed = global_speed
+        line_instructions = global_instructions if global_instructions and global_instructions.strip() else None
+        if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
+            spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
+            line_voice = spk_cfg.get("voice", global_voice_selection) # Fallback to global if needed
+            # Speed: per-speaker if tts-1/hd and set, else global if tts-1/hd, else API default
+            if tts_model in ["tts-1", "tts-1-hd"]:
+                line_speed = spk_cfg.get("speed", global_speed)
+            # Instructions: primarily for gpt-4o-mini-tts
+            if tts_model == "gpt-4o-mini-tts":
+                vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
+                custom_instr = spk_cfg.get("custom_instructions", "").strip()
+                if vibe == "Custom..." and custom_instr:
+                    line_instructions = custom_instr
+                elif vibe != "None" and vibe != "Custom...":
+                    line_instructions = PREDEFINED_VIBES.get(vibe, "")
+                # If vibe is None or Custom with no text, line_instructions might remain global or become ""
+                if not line_instructions and global_instructions and global_instructions.strip(): # Fallback to global if specific instructions are empty
+                    line_instructions = global_instructions
+                elif not line_instructions : # Ensure it's None if truly no instruction
+                     line_instructions = None
+        elif speaker_config_method == "Random per Speaker":
+            # Simplified: assign random now, could be cached as before for consistency within run
+            line_voice = random.choice(APP_AVAILABLE_VOICES)
+        elif speaker_config_method == "A/B Round Robin":
+            # Simplified: assign A/B now
+            unique_script_speakers = get_speakers_from_script(dialogue_script) # Re-get for this logic
+            speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
+            line_voice = APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]
+        # Fallback for speed if not tts-1/hd (API won't use it anyway)
+        if tts_model not in ["tts-1", "tts-1-hd"]:
+            line_speed = 1.0 # API default, won't be sent
         out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
         progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
         tasks.append(synthesize_speech_line(
             client=async_openai_client, text=line_data["text"], voice=line_voice,
+            output_path=out_fn, model=tts_model, speed=line_speed,
+            instructions=line_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
             line_index=line_data['id']))
     results = await asyncio.gather(*tasks, return_exceptions=True)
            merged_path if merged_path and os.path.exists(merged_path) else None,
            status)
 def handle_calculate_cost(dialogue_script: str, tts_model: str):
+    # ... (same as before) ...
     if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
     try:
         parsed, chars = parse_dialogue_script(dialogue_script)
         return f"Est. Cost: ${cost:.6f}"
     except Exception as e: return f"Cost calc error: {str(e)}"
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     # Layout of the Gradio app: script input, global controls, the speaker
     # configuration method selector with its method-specific panels, and the
     # action buttons / outputs.
     # NOTE(review): some components referenced further down (tts_model_dropdown,
     # pause_input, cost_output, individual_lines_zip_output) are not created in
     # the lines visible here -- confirm they are defined in the full file.
+    gr.Markdown("# Dialogue Script to Speech (Dynamic Per-Speaker UI)")
     # Show a red warning banner when the API key or the async client is missing.
     if not OPENAI_API_KEY or not async_openai_client:
         gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")
+    # State to hold detailed speaker configurations
+    speaker_configs_state = gr.State({})
     with gr.Row():
         with gr.Column(scale=2):
             script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
             # Initial visibility follows MODEL_DEFAULT: the speed slider is shown for
             # tts-1 / tts-1-hd, the instructions box for gpt-4o-mini-tts.
             global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
             global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
+    gr.Markdown("### Speaker Configuration Method")
+    speaker_config_method_dropdown = gr.Dropdown(
+        SPEAKER_CONFIG_METHODS, label="Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
+    )
     # Each method-specific panel below starts visible only if its method is the
     # default selection.
+    # UI for "Single Voice (Global)"
+    with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
+        global_voice_dropdown = gr.Dropdown(
+            APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], interactive=True
         )
     # NOTE(review): the string "Detailed Configuration (Per Speaker UI)" must match
     # an entry of SPEAKER_CONFIG_METHODS exactly -- confirm against that list.
+    # UI for "Detailed Configuration (Per Speaker UI)"
+    with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)")) as detailed_per_speaker_ui_group:
+        load_per_speaker_ui_button = gr.Button("Load/Refresh Per-Speaker Settings UI (from Script Above)")
+        gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are per-speaker.</small>")
+        # This column will be populated by the output of load_per_speaker_ui_button
+        dynamic_speaker_ui_area = gr.Column(elem_id="dynamic_ui_area_for_speakers")
     with gr.Row():
         calculate_cost_button = gr.Button("Calculate Cost")
         generate_button = gr.Button("Generate Audio", variant="primary")
         # type="filepath": the merged audio is exchanged as a path on disk.
         merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
     status_output = gr.Textbox(label="Status", interactive=False, lines=1)
+    # --- Event Handlers ---
+    def update_model_controls_visibility(selected_model, script_text_for_refresh, current_speaker_configs_for_refresh):
         # Handler for a change of the TTS model dropdown. Receives the selected
         # model, the current script text and the current speaker-config state, and
         # returns a dict keyed by output component.
+        # When model changes, also refresh the dynamic UI because speed/instr applicability changes
+        # This means load_refresh_per_speaker_ui will be called.
         # NOTE(review): load_refresh_per_speaker_ui is defined outside this excerpt;
         # it is unpacked here as a (ui_value, updated_state) pair -- confirm its
         # return contract.
+        new_dynamic_ui, updated_state = load_refresh_per_speaker_ui(script_text_for_refresh, current_speaker_configs_for_refresh, selected_model)
         # The global speed slider is shown only for tts-1 / tts-1-hd; the global
         # instructions box only for gpt-4o-mini-tts.
         is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
         is_gpt_mini = selected_model == "gpt-4o-mini-tts"
         return {
             global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
             global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
             # NOTE(review): dynamic_speaker_ui_area is a gr.Column; verify that Gradio
             # accepts this value as an update for a layout component used as an output.
+            dynamic_speaker_ui_area: new_dynamic_ui, # Return the actual list of components
+            speaker_configs_state: updated_state
         }
     # Inputs map positionally onto the handler's three parameters; the outputs
     # list holds the four components used as keys in the returned dict.
     tts_model_dropdown.change(
         fn=update_model_controls_visibility,
+        inputs=[tts_model_dropdown, script_input, speaker_configs_state],
+        outputs=[global_speed_input, global_instructions_input, dynamic_speaker_ui_area, speaker_configs_state]
     )
     def update_speaker_config_method_visibility(method):
         # Handler for a change of the speaker configuration method dropdown.
         # Shows the panel belonging to the selected method and hides the other;
         # for any method other than the two tested here, both panels are hidden.
         is_single = (method == "Single Voice (Global)")
+        is_detailed_per_speaker = (method == "Detailed Configuration (Per Speaker UI)")
+        # Add more if other methods exist...
         # Returned as a dict keyed by the component each update applies to.
         return {
+            single_voice_group: gr.update(visible=is_single),
+            detailed_per_speaker_ui_group: gr.update(visible=is_detailed_per_speaker),
         }
     # The outputs list holds the two panels used as keys in the returned dict.
     speaker_config_method_dropdown.change(
         fn=update_speaker_config_method_visibility,
         inputs=[speaker_config_method_dropdown],
+        outputs=[single_voice_group, detailed_per_speaker_ui_group]
     )
+    load_per_speaker_ui_button.click(
         # Same argument order as the direct call made on model change:
         # (script text, speaker-config state, selected model).
+        fn=load_refresh_per_speaker_ui,
+        inputs=[script_input, speaker_configs_state, tts_model_dropdown],
+        # Output the list of components to the column, and the updated state to the state component
+        outputs=[dynamic_speaker_ui_area, speaker_configs_state]
     )
     # NOTE(review): cost_output is not created in the lines visible here -- confirm
     # it is defined in the full file.
     calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
+    # Generate button now takes speaker_configs_state as input
     generate_button.click(
         fn=handle_script_processing,
         # Eight inputs are passed positionally.
         # NOTE(review): confirm the order matches handle_script_processing's
         # parameter list, which is not fully visible here.
+        inputs=[
+            script_input, tts_model_dropdown, pause_input,
+            speaker_config_method_dropdown, global_voice_dropdown,
+            speaker_configs_state, # Pass the state object
+            global_speed_input, global_instructions_input
+        ],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
     gr.Markdown("## Examples")
     gr.Examples(
         # Each example row supplies eight values, in the same order as the eight
         # components listed under `inputs` below.
         # NOTE(review): the second row hard-codes the voice "fable"; confirm it is
         # present in APP_AVAILABLE_VOICES.
         examples=[
+            ["[Alice] Hello from Alice!\n[Bob] Bob here, testing the dynamic UI.", "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", APP_AVAILABLE_VOICES[0], {}, 1.0, ""],
+            ["[Narrator] Just one line, using global.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
+        ],
+        # Note: speaker_configs_state is passed as an empty dict {} for examples.
+        # The user would click "Load/Refresh Per-Speaker UI" after an example loads.
         # NOTE(review): speaker_configs_state is a gr.State; verify that gr.Examples
         # accepts a State component among its inputs.
+        inputs=[
+            script_input, tts_model_dropdown, pause_input,
+            speaker_config_method_dropdown, global_voice_dropdown,
+            speaker_configs_state,
+            global_speed_input, global_instructions_input
         ],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
         # cache_examples=False: example outputs are not precomputed and cached.
         fn=handle_script_processing, cache_examples=False)