app.py CHANGED
@@ -45,14 +45,15 @@ SPEAKER_CONFIG_METHODS = [
     "Single Voice (Global)",
     "Random per Speaker",
     "A/B Round Robin",
+    "Detailed Configuration (Table)" # Using Table method
 ]
+DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
+_speaker_config_cache = {}

 # --- Speaker Configuration Table Logic ---
 DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
+# Column indices for clarity (optional, but can help maintainability)
 DATAFRAME_COL_SPEAKER = 0
 DATAFRAME_COL_VOICE = 1
 DATAFRAME_COL_SPEED = 2
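For orientation, each filled-in table row follows the DATAFRAME_HEADERS order above and, as the conversion code further down shows, becomes one per-speaker entry. A hypothetical row and the entry it would produce (speaker, voice, and values invented purely for illustration):

row = ["Alice", "nova", "1.25", "Speak with a warm, upbeat tone."]  # DATAFRAME_HEADERS order
per_speaker_config = {row[0]: {"voice": row[1], "speed": float(row[2]), "instructions": row[3]}}
# -> {"Alice": {"voice": "nova", "speed": 1.25, "instructions": "Speak with a warm, upbeat tone."}}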
@@ -62,29 +63,45 @@ def get_speakers_from_script(script_text):
     if not script_text.strip():
         return []
     try:
-        # Use a simplified parsing just for speaker names if full parsing is too heavy here
-        # For now, leverage existing robust parser.
         parsed_lines, _ = parse_dialogue_script(script_text)
         unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
         return unique_speakers
+    except ValueError:
+        return []

-def convert_df_to_config_map(df_data):
-    """Converts DataFrame data (list of lists) to the internal config map."""
+def convert_df_to_config_map(df_data_obj):
     config_map = {}
+    # gr.DataFrame's value is a dict like {'data': [[row1_val1, row1_val2], ...], 'headers': [...]}
+    # or if constructed with pandas, it might be a pandas DataFrame directly.
+    # Let's handle if it's the dict structure.
+
+    # Ensure df_data_obj is not None and has 'data'
+    if not df_data_obj or 'data' not in df_data_obj or not isinstance(df_data_obj['data'], list):
+        if isinstance(df_data_obj, pd.DataFrame): # If it's already a pandas DF
+            data_list = df_data_obj.values.tolist()
+        else: # Unrecognized format or empty
+            print("DataFrame data is empty or in an unrecognized format for conversion.")
+            return config_map
+    else:
+        data_list = df_data_obj['data']
+
+    for row in data_list:
+        # Defensive check for row length, assuming DATAFRAME_HEADERS length
+        if len(row) < len(DATAFRAME_HEADERS):
+            print(f"Skipping malformed row in DataFrame data during conversion: {row}")
+            continue
+
+        speaker_name = str(row[DATAFRAME_COL_SPEAKER]).strip()
+        voice = str(row[DATAFRAME_COL_VOICE]).strip()
         speed_str = str(row[DATAFRAME_COL_SPEED]).strip()
         instructions = str(row[DATAFRAME_COL_INSTRUCTIONS]).strip()

+        if not speaker_name or speaker_name == "(Load speakers first)" or speaker_name == "(No speakers detected)":
+            continue # Skip placeholder/empty speaker names
+
         if voice not in APP_AVAILABLE_VOICES:
+            print(f"Warning: Invalid voice '{voice}' for '{speaker_name}'. Using '{APP_AVAILABLE_VOICES[0]}'.")
             voice = APP_AVAILABLE_VOICES[0]

         cfg = {"voice": voice}
@@ -92,85 +109,66 @@ def convert_df_to_config_map(df_data):
         if speed_str:
             try:
                 speed_val = float(speed_str)
+                if 0.25 <= speed_val <= 4.0: cfg["speed"] = speed_val
+                else: print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range. Ignoring.")
+            except ValueError: print(f"Warning: Invalid speed '{speed_str}' for '{speaker_name}'. Ignoring.")

+        if instructions: cfg["instructions"] = instructions

         config_map[speaker_name] = cfg
     return config_map

+def load_speakers_for_table(script_text, current_df_obj, selected_tts_model):
     current_config_map = {}
+    if isinstance(current_df_obj, pd.DataFrame) and not current_df_obj.empty:
+        for _, row_series in current_df_obj.iterrows():
+            row = row_series.tolist() # Convert Series to list
+            if len(row) >= len(DATAFRAME_HEADERS):
+                speaker = str(row[DATAFRAME_COL_SPEAKER])
+                current_config_map[speaker] = {
+                    "voice": str(row[DATAFRAME_COL_VOICE]),
+                    "speed": str(row[DATAFRAME_COL_SPEED]),
+                    "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
+                }
+    elif isinstance(current_df_obj, dict) and current_df_obj.get('data'): # Handle raw dict from Gradio
+        for row in current_df_obj['data']:
             if len(row) >= len(DATAFRAME_HEADERS):
+                speaker = str(row[DATAFRAME_COL_SPEAKER])
                 current_config_map[speaker] = {
+                    "voice": str(row[DATAFRAME_COL_VOICE]),
                     "speed": str(row[DATAFRAME_COL_SPEED]),
                     "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
                 }

     unique_speakers = get_speakers_from_script(script_text)
     new_df_rows = []
+    default_voice, default_speed, default_instr = APP_AVAILABLE_VOICES[0], "1.0", ""

     for speaker in unique_speakers:
+        s_cfg = current_config_map.get(speaker, {})
         new_df_rows.append([
             speaker,
+            s_cfg.get("voice", default_voice),
+            s_cfg.get("speed", default_speed),
+            s_cfg.get("instructions", default_instr)
         ])

+    speed_info = " (tts-1/hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A)"
+    instr_info = " (gpt-4o-mini)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A)"

+    dynamic_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
+                        f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
+                        f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]

+    if not new_df_rows:
+        new_df_rows.append(["(No speakers detected in script)", default_voice, default_speed, default_instr])
+
+    return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=dynamic_headers)

-# --- Existing parsing and config logic (slightly adapted if needed) ---
-def parse_detailed_speaker_config(json_text, parsed_script_lines): # Kept for reference or other uses, but not for Table UI
-    # ... (implementation from previous version for JSON parsing)
-    pass # For now, this function is not directly used by the Table UI.

 def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
+                           global_selected_voice, detailed_config_map_from_table): # Renamed last param
     global _speaker_config_cache
     if _speaker_config_cache.get("__method") != speaker_config_method or \
        _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
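The conversion above accepts either a pandas DataFrame or the raw {'data': [...], 'headers': [...]} dict that gr.DataFrame can hand back. A minimal standalone sketch of that normalization, with a hypothetical rows_to_config helper and a stand-in voice list (not the app's own function or its APP_AVAILABLE_VOICES):

import pandas as pd

HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
KNOWN_VOICES = ["alloy", "fable"]  # illustrative stand-in for APP_AVAILABLE_VOICES

def rows_to_config(df_value):
    # Accept a pandas DataFrame or the {'data': [...], 'headers': [...]} dict form.
    if isinstance(df_value, pd.DataFrame):
        rows = df_value.values.tolist()
    elif isinstance(df_value, dict) and isinstance(df_value.get("data"), list):
        rows = df_value["data"]
    else:
        return {}
    config = {}
    for row in rows:
        if len(row) < len(HEADERS):
            continue  # skip malformed rows
        name, voice, speed_str, instructions = (str(c).strip() for c in row[:4])
        if not name or name.startswith("("):
            continue  # skip placeholder rows such as "(Load speakers first)"
        entry = {"voice": voice if voice in KNOWN_VOICES else KNOWN_VOICES[0]}
        try:
            speed = float(speed_str)
            if 0.25 <= speed <= 4.0:
                entry["speed"] = speed
        except ValueError:
            pass  # leave speed unset when it does not parse
        if instructions:
            entry["instructions"] = instructions
        config[name] = entry
    return config

print(rows_to_config(pd.DataFrame([["Alice", "fable", "1.2", "Warm tone"]], columns=HEADERS)))
print(rows_to_config({"data": [["Bob", "alloy", "fast", ""]], "headers": HEADERS}))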
@@ -182,23 +180,21 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
         base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
         return base_config

+    if speaker_config_method == "Detailed Configuration (Table)":
+        if speaker_name in detailed_config_map_from_table:
+            speaker_specific = detailed_config_map_from_table[speaker_name]
+            return {
                 "voice": speaker_specific.get("voice", base_config["voice"]),
                 "speed": speaker_specific.get("speed"),
                 "instructions": speaker_specific.get("instructions")
             }
         else:
+            print(f"Warning: Speaker '{speaker_name}' not in Table Config. Using default.")
             return base_config

+    # For "Random" and "A/B" (same as before)
     if speaker_name not in _speaker_config_cache:
         if speaker_config_method == "Random per Speaker":
             available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
             if not _speaker_config_cache.get("__all_assigned_random"):
                 for i, spk_unique in enumerate(unique_script_speakers):
@@ -209,7 +205,6 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
             _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}

         elif speaker_config_method == "A/B Round Robin":
-            # ... (same logic as before) ...
             if not _speaker_config_cache.get("__all_assigned_ab"):
                 for i, spk_unique in enumerate(unique_script_speakers):
                     if spk_unique not in _speaker_config_cache:
@@ -228,140 +223,123 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
 async def handle_script_processing(
     dialogue_script: str, tts_model: str, pause_ms: int,
     speaker_config_method: str, global_voice_selection: str,
+    speaker_details_df_value: pd.DataFrame | dict, # Can be pd.DataFrame or dict from Gradio
     global_speed: float,
     global_instructions: str, progress=gr.Progress(track_tqdm=True)):
+    global _speaker_config_cache; _speaker_config_cache = {}

+    if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
+    if not dialogue_script.strip(): return None, None, "Error: Script empty."

     job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
     if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
     os.makedirs(job_audio_path_prefix, exist_ok=True)

     try:
+        parsed_lines, _ = parse_dialogue_script(dialogue_script)
+        if not parsed_lines: return None, None, "Error: No valid lines."
+    except ValueError as e: return None, None, f"Script error: {str(e)}"

     unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))

     parsed_detailed_config_map = {}
     if speaker_config_method == "Detailed Configuration (Table)":
         try:
             parsed_detailed_config_map = convert_df_to_config_map(speaker_details_df_value)
+        except Exception as e: return None, None, f"Error processing Speaker Table: {str(e)}"

     tasks, line_audio_files = [], [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
         speaker_base_cfg = get_config_for_speaker(
             speaker_name, speaker_config_method, unique_speakers,
+            global_voice_selection, parsed_detailed_config_map
         )
         line_voice = speaker_base_cfg["voice"]

+        eff_speed = global_speed
+        if speaker_base_cfg.get("speed") is not None: eff_speed = speaker_base_cfg["speed"]

+        eff_instr = global_instructions if global_instructions and global_instructions.strip() else None
         if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
+            eff_instr = speaker_base_cfg["instructions"]

+        out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
+        progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
         tasks.append(synthesize_speech_line(
             client=async_openai_client, text=line_data["text"], voice=line_voice,
+            output_path=out_fn, model=tts_model, speed=eff_speed,
+            instructions=eff_instr, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
             line_index=line_data['id']))

+    results = await asyncio.gather(*tasks, return_exceptions=True)
+    for idx, res in enumerate(results):
+        if isinstance(res, Exception): print(f"Error line {parsed_lines[idx]['id']}: {res}")
+        elif res is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
+        else: line_audio_files[idx] = res

+    valid_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
+    if not valid_files:
+        shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio synthesized."

+    zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
+    with zipfile.ZipFile(zip_fn, 'w') as zf: [zf.write(p, os.path.basename(p)) for p in valid_files]

+    merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
+    merged_path = merge_mp3_files([f for f in line_audio_files if f], merged_fn, pause_ms)
+
+    status = f"{len(valid_files)}/{len(parsed_lines)} lines. "
+    if len(valid_files) < len(parsed_lines): status += "Some failed. "
+    if not merged_path and len(valid_files) > 0: status += "Merge failed. "
+    elif not merged_path: status += "No audio."
+    else: status += "Generated."

+    return (zip_fn if os.path.exists(zip_fn) else None,
+            merged_path if merged_path and os.path.exists(merged_path) else None,
             status)

 def handle_calculate_cost(dialogue_script: str, tts_model: str):
+    if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
     try:
         parsed, chars = parse_dialogue_script(dialogue_script)
+        if not parsed: return "Cost: $0.00 (No lines)"
         cost = calculate_cost(chars, len(parsed), tts_model)
+        return f"Est. Cost: ${cost:.6f}"
     except Exception as e: return f"Cost calc error: {str(e)}"

 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Dialogue Script to Speech (Table Config)")
     if not OPENAI_API_KEY or not async_openai_client:
+        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")

     with gr.Row():
         with gr.Column(scale=2):
+            script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
         with gr.Column(scale=1):
+            tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT)
+            pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50)
+            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
+            global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)

     gr.Markdown("### Speaker Configuration")
     with gr.Row():
         speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
+        global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)

+    # Detailed Table Configuration UI
+    # Visibility of this whole column is controlled by the dropdown selection
+    with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")) as detailed_table_config_group:
+        load_speakers_button = gr.Button("Load/Refresh Speakers into Table (from Script Above)")
+        gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. "
+                    "Edit table below. Speed/Instructions column applicability depends on TTS model.</small>")
         speaker_details_df = gr.DataFrame(
+            label="Per-Speaker Settings",
+            headers=DATAFRAME_HEADERS, # Initial static headers
+            value=pd.DataFrame([["(Load speakers first or type here)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
             interactive=True,
+            row_count=(1, "dynamic"), # Allow dynamic rows, starting with 1
+            col_count=(len(DATAFRAME_HEADERS), "fixed")
         )

     with gr.Row():
         calculate_cost_button = gr.Button("Calculate Cost")
@@ -371,38 +349,44 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         individual_lines_zip_output = gr.File(label="Download ZIP")
         merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
+        status_output = gr.Textbox(label="Status", interactive=False, lines=1)

+    def update_model_controls_visibility(selected_model):
         is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
         is_gpt_mini = selected_model == "gpt-4o-mini-tts"
+
+        speed_info = " (tts-1/hd)" if is_tts1 else " (N/A)"
+        instr_info = " (gpt-4o-mini)" if is_gpt_mini else " (N/A)"
+        new_df_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
+                           f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
+                           f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
         return {
             global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
             global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
+            speaker_details_df: gr.update(headers=new_df_headers) # Update DF headers dynamically
         }
+    tts_model_dropdown.change(
+        fn=update_model_controls_visibility,
+        inputs=[tts_model_dropdown],
+        outputs=[global_speed_input, global_instructions_input, speaker_details_df]
+    )

+    def update_speaker_config_method_visibility(method):
         is_single = (method == "Single Voice (Global)")
         is_detailed_table = (method == "Detailed Configuration (Table)")
         return {
             global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
+            detailed_table_config_group: gr.update(visible=is_detailed_table),
         }
+    speaker_config_method_dropdown.change(
+        fn=update_speaker_config_method_visibility,
+        inputs=[speaker_config_method_dropdown],
+        outputs=[global_voice_dropdown, detailed_table_config_group]
+    )

     load_speakers_button.click(
         fn=load_speakers_for_table,
+        inputs=[script_input, speaker_details_df, tts_model_dropdown],
         outputs=[speaker_details_df]
     )

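Both change handlers above return a dict keyed by the component objects, with gr.update(...) as values. A stripped-down, standalone sketch of that visibility-toggling pattern, assuming a Gradio version where gr.update is available (component names here are illustrative, not the app's):

import gradio as gr

with gr.Blocks() as visibility_sketch:
    method = gr.Dropdown(
        ["Single Voice (Global)", "Detailed Configuration (Table)"],
        value="Single Voice (Global)", label="Speaker Config Method")
    global_voice = gr.Textbox(label="Global Voice", visible=True)

    def toggle_controls(selected_method):
        # Returning a dict keyed by components patches only those components.
        return {global_voice: gr.update(visible=(selected_method == "Single Voice (Global)"))}

    method.change(fn=toggle_controls, inputs=[method], outputs=[global_voice])

if __name__ == "__main__":
    visibility_sketch.launch()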
@@ -410,29 +394,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     generate_button.click(
         fn=handle_script_processing,
         inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
+                global_voice_dropdown, speaker_details_df,
                 global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])

     gr.Markdown("## Examples")
-    # Examples will need to be adjusted if the "Detailed Configuration (Table)" is default and needs pre-filled table
-    # For simplicity, examples might focus on other config methods or a very simple table.
-    # For "Detailed Configuration (Table)", the flow is: type script -> click "Load Speakers" -> edit table -> generate.
-    # So, an example would just set the script and method. The table data itself isn't directly an input to an example click.
-    # We can, however, provide an example script.
     gr.Examples(
         examples=[
-            ["[Random1] Voice 1\n[Random2] Voice 2", "gpt-4o-mini-tts", 400, "Random per Speaker", APP_AVAILABLE_VOICES[0], None, 1.0, "Speak clearly."]
+            ["[Alice] This table config is new!\n[Bob] Indeed it is.", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""],
+            ["[Narrator] Global voice example.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
         ],
         inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
+                speaker_details_df, global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
         fn=handle_script_processing, cache_examples=False)

 if __name__ == "__main__":
     if os.name == 'nt': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+    demo.launch(debug=True)
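handle_script_processing fans the per-line synthesis out with asyncio.gather(..., return_exceptions=True) and then sorts the results into errors, skips, and file paths. The same pattern in a self-contained sketch, where fake_synthesize is a made-up stand-in for synthesize_speech_line:

import asyncio

async def fake_synthesize(line_index: int):
    # Stand-in for synthesize_speech_line: one line errors, one is skipped.
    if line_index == 1:
        raise RuntimeError("synthesis failed")
    if line_index == 2:
        return None  # treated as skipped
    return f"line_{line_index}.mp3"

async def main():
    tasks = [fake_synthesize(i) for i in range(4)]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    line_audio_files = [None] * len(results)
    for idx, res in enumerate(results):
        if isinstance(res, Exception):
            print(f"Error on line {idx}: {res}")
        elif res is None:
            print(f"Line {idx} skipped or failed")
        else:
            line_audio_files[idx] = res
    print([f for f in line_audio_files if f])  # only the successful paths

asyncio.run(main())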