Spaces:

abocha
/

esl-dialogue-tts

Running

App Files Community

abocha committed on May 7

Commit

354d940

1 Parent(s): 7a4548f

gradio code parameters

Browse files

Files changed (1) hide show

app.py +106 -250

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import os
 import asyncio
@@ -9,7 +10,7 @@ import json
 from openai import AsyncOpenAI
 from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
-from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES # Renamed for clarity
 from utils.merge_audio import merge_mp3_files
 # --- Configuration ---
@@ -17,7 +18,6 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
 MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
-# Ensure API key is available
 if not OPENAI_API_KEY:
     try:
         from huggingface_hub import HfApi
@@ -37,12 +37,10 @@ if OPENAI_API_KEY:
 else:
     print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
-# TTS Models available in the UI
 TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
-if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE: # Ensure default is valid
     MODEL_DEFAULT = "tts-1-hd"
 SPEAKER_CONFIG_METHODS = [
     "Single Voice (Global)",
     "Random per Speaker",
@@ -50,21 +48,15 @@ SPEAKER_CONFIG_METHODS = [
     "Detailed Configuration (JSON)"
 ]
 DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
-# For UI elements that need the list of voices
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
-# Global store for parsed speaker configurations
 _speaker_config_cache = {}
 def parse_detailed_speaker_config(json_text, parsed_script_lines):
-    """ Parses the JSON config string into a speaker map. """
     config_map = {}
     default_voice = APP_AVAILABLE_VOICES[0]
     try:
         if not json_text.strip():
-            return {} # Empty config means rely on global or other fallbacks
         config_list = json.loads(json_text)
         if not isinstance(config_list, list):
@@ -72,46 +64,38 @@ def parse_detailed_speaker_config(json_text, parsed_script_lines):
         for item in config_list:
             if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
-                # Log warning or skip malformed item
                 print(f"Skipping malformed item in JSON config: {item}")
                 continue
             if item["voice"] not in APP_AVAILABLE_VOICES:
                 print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
                 item["voice"] = default_voice
-            # Validate speed if present
             if "speed" in item:
                 try:
                     item["speed"] = float(item["speed"])
                     if not (0.25 <= item["speed"] <= 4.0):
-                        print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping or defaulting.")
-                        item["speed"] = max(0.25, min(item["speed"], 4.0)) # Clamp
                 except ValueError:
                     print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
-                    # Let it be None or remove, so global/default speed applies
                     item.pop("speed", None)
             config_map[item["speaker"]] = {
                 "voice": item["voice"],
-                "speed": item.get("speed"), # Will be None if not present or invalid
-                "instructions": item.get("instructions") # Will be None if not present
             }
         return config_map
     except json.JSONDecodeError as e:
         raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
-    except ValueError as e: # Catch our own ValueErrors
-        raise e # Re-raise
-    except Exception as e: # Catch any other unexpected errors during parsing
         raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
 def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
                            global_selected_voice, detailed_config_map):
-    global _speaker_config_cache # This cache helps maintain consistency for "Random" and "A/B" within a run
-    # If method changed or cache is for a different set of speakers, reset it.
-    # A more robust cache key might involve hashing unique_script_speakers.
     if _speaker_config_cache.get("__method") != speaker_config_method or \
        _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
         _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
@@ -124,72 +108,51 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
     if speaker_config_method == "Detailed Configuration (JSON)":
         if speaker_name in detailed_config_map:
-            # Merge with base_config to ensure all keys are present if some are optional in JSON
-            # JSON values take precedence
             speaker_specific = detailed_config_map[speaker_name]
             return {
                 "voice": speaker_specific.get("voice", base_config["voice"]),
-                "speed": speaker_specific.get("speed"), # Allow None to use global
-                "instructions": speaker_specific.get("instructions") # Allow None to use global
             }
-        else: # Fallback for speakers in script but not in JSON map
-            # Could use a default voice, or a cycling voice for unmapped speakers
-            # For now, let's use the first voice as a simple fallback.
             print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
             return base_config
-    # For "Random" and "A/B", we only map voices. Speed/Instructions will be global.
     if speaker_name not in _speaker_config_cache:
         if speaker_config_method == "Random per Speaker":
-            # Assign a random voice if not already cached for this run
             available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
-            # Ensure all unique speakers get an assignment before reusing voices from cache build
             if not _speaker_config_cache.get("__all_assigned_random"):
                  for i, spk_unique in enumerate(unique_script_speakers):
                      if spk_unique not in _speaker_config_cache:
                         _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
                  _speaker_config_cache["__all_assigned_random"] = True
-            # If somehow still not found (should not happen if pre-populated)
             if speaker_name not in _speaker_config_cache:
                  _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
         elif speaker_config_method == "A/B Round Robin":
-            # Assign voices in order if not already cached
             if not _speaker_config_cache.get("__all_assigned_ab"):
                 for i, spk_unique in enumerate(unique_script_speakers):
                      if spk_unique not in _speaker_config_cache:
                         _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
                 _speaker_config_cache["__all_assigned_ab"] = True
-            if speaker_name not in _speaker_config_cache: # Fallback, should be populated
                 speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
                 _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
-        else: # Should not happen
              _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
-    # Return only voice for Random/AB, speed/instructions will be from global UI inputs
     cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
     return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
 async def handle_script_processing(
-    dialogue_script: str,
-    tts_model: str,
-    pause_ms: int,
-    speaker_config_method: str,
-    global_voice_selection: str, # From dropdown if "Single Voice (Global)"
-    detailed_speaker_json: str, # From JSON input
-    global_speed: float,
-    global_instructions: str,
-    progress=gr.Progress(track_tqdm=True)
-    ):
     global _speaker_config_cache
-    _speaker_config_cache = {} # Reset speaker config cache for each new run
     if not OPENAI_API_KEY or not async_openai_client:
-        return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
     if not dialogue_script.strip():
         return None, None, "Error: Script is empty."
@@ -213,266 +176,159 @@ async def handle_script_processing(
         except ValueError as e:
             return None, None, f"Configuration Error: {str(e)}"
-    tasks = []
-    line_audio_files = [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
-        # Get base config (primarily voice) based on method
-        # For "Random" and "A/B", this will just return voice. Speed/instructions are taken from global.
-        # For "Single Voice", it returns the globally selected voice.
-        # For "Detailed JSON", it returns voice, and potentially per-speaker speed/instructions.
         speaker_base_cfg = get_config_for_speaker(
             speaker_name, speaker_config_method, unique_speakers,
-            global_voice_selection, parsed_detailed_config_map
-        )
         line_voice = speaker_base_cfg["voice"]
-        # Determine effective speed and instructions
-        # Priority: Per-speaker from JSON > Global UI > API Default (1.0 for speed, None for instructions)
-        effective_speed = global_speed # Start with global
-        if speaker_base_cfg.get("speed") is not None: # If JSON provided a speed for this speaker
             effective_speed = speaker_base_cfg["speed"]
         effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
-        if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip(): # If JSON provided instructions
             effective_instructions = speaker_base_cfg["instructions"]
         output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
         progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
-        tasks.append(
-            synthesize_speech_line(
-                client=async_openai_client,
-                text=line_data["text"],
-                voice=line_voice,
-                output_path=output_filename,
-                model=tts_model,
-                speed=effective_speed,
-                instructions=effective_instructions,
-                nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
-                line_index=line_data['id']
-            )
-        )
     synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
     for idx, result in enumerate(synthesis_results):
-        if isinstance(result, Exception):
-            print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
-        elif result is None:
-            print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
-        else:
-            line_audio_files[idx] = result
     valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
     if not valid_audio_files:
         shutil.rmtree(job_audio_path_prefix)
-        return None, None, "Error: No audio files were successfully synthesized."
     zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
     with zipfile.ZipFile(zip_filename, 'w') as zf:
-        for audio_file_path in valid_audio_files:
-            zf.write(audio_file_path, os.path.basename(audio_file_path))
-    ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
-    merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
-    merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
-    final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
-    if len(valid_audio_files) < len(parsed_lines):
-        final_status += "Some lines failed or were skipped. "
-    if not merged_output_path and len(valid_audio_files) > 0 :
-        final_status += "Merged audio could not be created."
-    elif not merged_output_path and len(valid_audio_files) == 0:
-         final_status += "No audio generated."
-    else:
-        final_status += "Outputs generated."
-    return zip_filename if os.path.exists(zip_filename) else None, \
-           merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
-           final_status
 def handle_calculate_cost(dialogue_script: str, tts_model: str):
-    if not dialogue_script.strip():
-        return "Cost: $0.000000 (Script is empty)"
     try:
-        parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
-        if not parsed_lines:
-             return "Cost: $0.000000 (No valid lines)"
-        cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
-        return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
-    except ValueError as e: # Handles MAX_SCRIPT_LENGTH
-        return f"Error: {str(e)}"
-    except Exception as e:
-        return f"Error calculating cost: {str(e)}"
-# --- Gradio Interface ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
-    gr.Markdown(
-        "Convert dialogue scripts to speech with fine-grained control over voices, speed, and instructions."
-    )
     if not OPENAI_API_KEY or not async_openai_client:
-        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid. Speech synthesis will fail.</h3>")
     with gr.Row():
         with gr.Column(scale=2):
-            script_input = gr.TextArea(
-                label="Dialogue Script",
-                placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!",
-                lines=10,
-                info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
-            )
         with gr.Column(scale=1):
-            tts_model_dropdown = gr.Dropdown(
-                TTS_MODELS_AVAILABLE, label="OpenAI TTS Model", value=MODEL_DEFAULT,
-                info="Select TTS model. Affects available controls below."
-            )
-            pause_input = gr.Number(
-                label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
-                info="Silence duration in milliseconds between merged lines."
-            )
-            # Model-specific global controls
-            global_speed_input = gr.Slider(
-                minimum=0.25, maximum=4.0, value=1.0, step=0.05,
-                label="Global Speech Speed (for tts-1/tts-1-hd)",
-                visible= (MODEL_DEFAULT in ["tts-1", "tts-1-hd"]),
-                interactive=True
-            )
-            global_instructions_input = gr.Textbox(
-                label="Global Voice Instructions (for gpt-4o-mini-tts)",
-                placeholder="e.g., Speak in a calm, reassuring tone.",
-                visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"),
-                interactive=True, lines=2
-            )
     gr.Markdown("### Speaker Configuration")
     with gr.Row():
-        speaker_config_method_dropdown = gr.Dropdown(
-            SPEAKER_CONFIG_METHODS, label="Speaker Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
-        )
-        global_voice_dropdown = gr.Dropdown(
-            APP_AVAILABLE_VOICES, label="Global Voice (for 'Single Voice' method)",
-            value=APP_AVAILABLE_VOICES[0],
-            visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"),
-            interactive=True
-        )
-    initial_json_config_example = """
-[
   {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
   {"speaker": "Bob", "voice": "echo"},
   {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
-]
-    """.strip()
     detailed_speaker_config_input = gr.Code(
         label="Detailed Speaker Configuration (JSON)",
         language="json",
         lines=7,
-        value=initial_json_config_example, # Corrected: Use value for default content
-        visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)"),
-        info="Define voice, and optionally model-compatible speed/instructions per speaker."
     )
     with gr.Row():
-        calculate_cost_button = gr.Button("Calculate Estimated Cost")
-        generate_button = gr.Button("Generate Audio Files", variant="primary")
     cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
     with gr.Row():
-        individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
-        merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
     status_output = gr.Textbox(label="Status", interactive=False, lines=2)
-    # --- Event Handlers for UI Interactivity ---
-    def update_model_specific_controls_visibility(selected_model):
-        is_tts_1_family = selected_model in ["tts-1", "tts-1-hd"]
-        is_gpt_mini = selected_model == "gpt-4o-mini-tts" # And any other future models supporting instructions
         return {
-            global_speed_input: gr.update(visible=is_tts_1_family, interactive=is_tts_1_family),
             global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
         }
-    tts_model_dropdown.change(
-        fn=update_model_specific_controls_visibility,
-        inputs=[tts_model_dropdown],
-        outputs=[global_speed_input, global_instructions_input]
-    )
-    def update_speaker_config_visibility(config_method):
-        is_single_voice = (config_method == "Single Voice (Global)")
-        is_detailed_json = (config_method == "Detailed Configuration (JSON)")
         return {
-            global_voice_dropdown: gr.update(visible=is_single_voice, interactive=is_single_voice),
-            detailed_speaker_config_input: gr.update(visible=is_detailed_json, interactive=is_detailed_json)
         }
-    speaker_config_method_dropdown.change(
-        fn=update_speaker_config_visibility,
-        inputs=[speaker_config_method_dropdown],
-        outputs=[global_voice_dropdown, detailed_speaker_config_input]
-    )
-    # --- Main Button Actions ---
-    calculate_cost_button.click(
-        fn=handle_calculate_cost,
-        inputs=[script_input, tts_model_dropdown],
-        outputs=[cost_output]
-    )
     generate_button.click(
         fn=handle_script_processing,
-        inputs=[
-            script_input, tts_model_dropdown, pause_input,
-            speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
-            global_speed_input, global_instructions_input
-        ],
-        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
-    )
     gr.Markdown("## Examples")
-    example_script_1 = "[Alice] Hi Bob, how are things?\n[Bob] Pretty good, Alice. Just working on this new project.\n[Alice] Oh, interesting! Tell me more."
-    example_json_1 = """
-[
-  {"speaker": "Alice", "voice": "nova", "instructions": "sound curious"},
-  {"speaker": "Bob", "voice": "echo", "speed": 0.9}
-]
-    """.strip()
-    example_script_2 = "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain. I am speaking slowly.\n[Captain Eva] Excellent. Maintain course for Kepler-186f."
-    example_json_2 = """
-[
-  {"speaker": "Captain Eva", "voice": "alloy", "speed": 1.0},
-  {"speaker": "Computer", "voice": "onyx", "speed": 0.8, "instructions": "sound robotic and calm"}
-]
-    """.strip()
     gr.Examples(
         examples=[
-            [example_script_1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_1, 1.0, "Speak naturally."],
-            [example_script_2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_2, 1.1, ""],
-            ["[Narrator] A single line, using global settings.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]
-        ],
-        inputs=[
-            script_input, tts_model_dropdown, pause_input,
-            speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
-            global_speed_input, global_instructions_input
-        ],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
-        fn=handle_script_processing,
-        cache_examples=False,
-    )
 if __name__ == "__main__":
-    if os.name == 'nt':
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
     demo.launch()

+/// --- app.py --- ///
 import gradio as gr
 import os
 import asyncio
 from openai import AsyncOpenAI
 from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
+from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
 from utils.merge_audio import merge_mp3_files
 # --- Configuration ---
 NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
 MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
 if not OPENAI_API_KEY:
     try:
         from huggingface_hub import HfApi
 else:
     print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
 TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
+if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE:
     MODEL_DEFAULT = "tts-1-hd"
 SPEAKER_CONFIG_METHODS = [
     "Single Voice (Global)",
     "Random per Speaker",
     "Detailed Configuration (JSON)"
 ]
 DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
 _speaker_config_cache = {}
 def parse_detailed_speaker_config(json_text, parsed_script_lines):
     config_map = {}
     default_voice = APP_AVAILABLE_VOICES[0]
     try:
         if not json_text.strip():
+            return {}
         config_list = json.loads(json_text)
         if not isinstance(config_list, list):
         for item in config_list:
             if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
                 print(f"Skipping malformed item in JSON config: {item}")
                 continue
             if item["voice"] not in APP_AVAILABLE_VOICES:
                 print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
                 item["voice"] = default_voice
             if "speed" in item:
                 try:
                     item["speed"] = float(item["speed"])
                     if not (0.25 <= item["speed"] <= 4.0):
+                        print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping.")
+                        item["speed"] = max(0.25, min(item["speed"], 4.0))
                 except ValueError:
                     print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
                     item.pop("speed", None)
             config_map[item["speaker"]] = {
                 "voice": item["voice"],
+                "speed": item.get("speed"),
+                "instructions": item.get("instructions")
             }
         return config_map
     except json.JSONDecodeError as e:
         raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
+    except ValueError as e:
+        raise e
+    except Exception as e:
         raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
 def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
                            global_selected_voice, detailed_config_map):
+    global _speaker_config_cache
     if _speaker_config_cache.get("__method") != speaker_config_method or \
        _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
         _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
     if speaker_config_method == "Detailed Configuration (JSON)":
         if speaker_name in detailed_config_map:
             speaker_specific = detailed_config_map[speaker_name]
             return {
                 "voice": speaker_specific.get("voice", base_config["voice"]),
+                "speed": speaker_specific.get("speed"),
+                "instructions": speaker_specific.get("instructions")
             }
+        else:
             print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
             return base_config
     if speaker_name not in _speaker_config_cache:
         if speaker_config_method == "Random per Speaker":
             available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
             if not _speaker_config_cache.get("__all_assigned_random"):
                  for i, spk_unique in enumerate(unique_script_speakers):
                      if spk_unique not in _speaker_config_cache:
                         _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
                  _speaker_config_cache["__all_assigned_random"] = True
             if speaker_name not in _speaker_config_cache:
                  _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
         elif speaker_config_method == "A/B Round Robin":
             if not _speaker_config_cache.get("__all_assigned_ab"):
                 for i, spk_unique in enumerate(unique_script_speakers):
                      if spk_unique not in _speaker_config_cache:
                         _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
                 _speaker_config_cache["__all_assigned_ab"] = True
+            if speaker_name not in _speaker_config_cache:
                 speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
                 _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
+        else:
              _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
     cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
     return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
 async def handle_script_processing(
+    dialogue_script: str, tts_model: str, pause_ms: int,
+    speaker_config_method: str, global_voice_selection: str,
+    detailed_speaker_json: str, global_speed: float,
+    global_instructions: str, progress=gr.Progress(track_tqdm=True)):
     global _speaker_config_cache
+    _speaker_config_cache = {}
     if not OPENAI_API_KEY or not async_openai_client:
+        return None, None, "Error: OPENAI_API_KEY is not configured."
     if not dialogue_script.strip():
         return None, None, "Error: Script is empty."
         except ValueError as e:
             return None, None, f"Configuration Error: {str(e)}"
+    tasks, line_audio_files = [], [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
         speaker_base_cfg = get_config_for_speaker(
             speaker_name, speaker_config_method, unique_speakers,
+            global_voice_selection, parsed_detailed_config_map)
         line_voice = speaker_base_cfg["voice"]
+        effective_speed = global_speed
+        if speaker_base_cfg.get("speed") is not None:
             effective_speed = speaker_base_cfg["speed"]
         effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
+        if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
             effective_instructions = speaker_base_cfg["instructions"]
         output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
         progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
+        tasks.append(synthesize_speech_line(
+            client=async_openai_client, text=line_data["text"], voice=line_voice,
+            output_path=output_filename, model=tts_model, speed=effective_speed,
+            instructions=effective_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
+            line_index=line_data['id']))
     synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
     for idx, result in enumerate(synthesis_results):
+        if isinstance(result, Exception): print(f"Error for line {parsed_lines[idx]['id']}: {result}")
+        elif result is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
+        else: line_audio_files[idx] = result
     valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
     if not valid_audio_files:
         shutil.rmtree(job_audio_path_prefix)
+        return None, None, "Error: No audio files successfully synthesized."
     zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
     with zipfile.ZipFile(zip_filename, 'w') as zf:
+        for pth in valid_audio_files: zf.write(pth, os.path.basename(pth))
+    merged_mp3_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
+    merged_out_path = merge_mp3_files([f for f in line_audio_files if f], merged_mp3_fn, pause_ms)
+    status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
+    if len(valid_audio_files) < len(parsed_lines): status += "Some lines failed/skipped. "
+    if not merged_out_path and len(valid_audio_files) > 0: status += "Merged audio failed. "
+    elif not merged_out_path and len(valid_audio_files) == 0: status += "No audio generated."
+    else: status += "Outputs generated."
+    return (zip_filename if os.path.exists(zip_filename) else None,
+           merged_out_path if merged_out_path and os.path.exists(merged_out_path) else None,
+           status)
 def handle_calculate_cost(dialogue_script: str, tts_model: str):
+    if not dialogue_script.strip(): return "Cost: $0.000000 (Empty script)"
     try:
+        parsed, chars = parse_dialogue_script(dialogue_script)
+        if not parsed: return "Cost: $0.000000 (No valid lines)"
+        cost = calculate_cost(chars, len(parsed), tts_model)
+        return f"Est. Cost: ${cost:.6f} ({chars} chars, {len(parsed)} lines)"
+    except ValueError as e: return f"Error: {str(e)}"
+    except Exception as e: return f"Cost calc error: {str(e)}"
 # Build the Gradio UI: script input, model/speaker settings, action buttons,
 # outputs, the visibility callbacks, and the clickable examples.
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
+    gr.Markdown("Convert scripts with control over voices, speed, and instructions.")
     # Show a red banner when either the API key or the async client is missing.
     if not OPENAI_API_KEY or not async_openai_client:
+        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set. Synthesis will fail.</h3>")
     # Top row: script text (wide column) next to the synthesis settings (narrow column).
     with gr.Row():
         with gr.Column(scale=2):
+            script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10, info=f"Max {MAX_SCRIPT_LENGTH} chars.")
         with gr.Column(scale=1):
+            tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT, info="Affects controls below.")
+            pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50, info="Between merged lines.")
             # Initial visibility of the two model-specific controls follows MODEL_DEFAULT;
             # update_model_controls below keeps them in sync with the dropdown afterwards.
+            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (tts-1/tts-1-hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
+            global_instructions_input = gr.Textbox(label="Global Instructions (gpt-4o-mini-tts)", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
     gr.Markdown("### Speaker Configuration")
     with gr.Row():
+        speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
         # Only shown at start-up when the default method is "Single Voice (Global)".
+        global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice ('Single Voice' method)", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)
     # Sample JSON pre-filled into the detailed-configuration editor.
+    initial_json_config_example = """[
   {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
   {"speaker": "Bob", "voice": "echo"},
   {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
+]""".strip()
     # JSON editor; initially visible only when the default method is the detailed one.
     detailed_speaker_config_input = gr.Code(
         label="Detailed Speaker Configuration (JSON)",
         language="json",
         lines=7,
+        value=initial_json_config_example,
+        visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
+    )
+    # New Markdown for info text, visibility tied to dropdown
+    detailed_config_info_md = gr.Markdown(
+        "<small>Define voice per speaker. Optionally, `speed` (0.25-4.0) for `tts-1`/`tts-1-hd` models, "
+        "and `instructions` (text) for `gpt-4o-mini-tts`.</small>",
+        visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
     )
     # Action buttons and the components that receive their results.
     with gr.Row():
+        calculate_cost_button = gr.Button("Calculate Cost")
+        generate_button = gr.Button("Generate Audio", variant="primary")
     cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
     with gr.Row():
+        individual_lines_zip_output = gr.File(label="Download ZIP")
+        merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
     status_output = gr.Textbox(label="Status", interactive=False, lines=2)
     # Callback: show the speed slider for tts-1/tts-1-hd and the instructions box
     # for gpt-4o-mini-tts. Returns a {component: update} dict for the two controls.
+    def update_model_controls(selected_model):
+        is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
+        is_gpt_mini = selected_model == "gpt-4o-mini-tts"
         return {
+            global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
             global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
         }
+    tts_model_dropdown.change(fn=update_model_controls, inputs=[tts_model_dropdown], outputs=[global_speed_input, global_instructions_input])
     # Callback: show the global voice dropdown for the single-voice method, and the
     # JSON editor plus its help text for the detailed-configuration method.
+    def update_speaker_controls(method):
+        is_single = (method == "Single Voice (Global)")
+        is_detailed = (method == "Detailed Configuration (JSON)")
         return {
+            global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
+            detailed_speaker_config_input: gr.update(visible=is_detailed, interactive=is_detailed),
+            detailed_config_info_md: gr.update(visible=is_detailed) # Control visibility of new Markdown
         }
+    speaker_config_method_dropdown.change(fn=update_speaker_controls, inputs=[speaker_config_method_dropdown], outputs=[global_voice_dropdown, detailed_speaker_config_input, detailed_config_info_md])
     # Button wiring: cost estimate goes to cost_output; generation fills the
     # ZIP download, the merged MP3 player and the status box.
+    calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
     generate_button.click(
         fn=handle_script_processing,
+        inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
+                global_voice_dropdown, detailed_speaker_config_input, global_speed_input, global_instructions_input],
+        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
     gr.Markdown("## Examples")
     # Example payloads. The JSON strings are written with the placeholder key
     # "instr..." and expanded to "instructions" by .replace() at import time.
+    ex_script1 = "[A] Hi B.\n[B] Hey A.\n[A] What's up?"
+    ex_json1 = '[{"speaker": "A", "voice": "nova", "instr...": "curious"}, {"speaker": "B", "voice": "echo", "speed": 0.9}]'.replace("instr...", "instructions") # Hack for brevity
+    ex_script2 = "[Cpt] Status?\n[Comp] Nominal. Slow.\n[Cpt] Good."
+    ex_json2 = '[{"speaker": "Cpt", "voice": "alloy"}, {"speaker": "Comp", "voice": "onyx", "speed": 0.8, "instr...": "robotic"}]'.replace("instr...", "instructions")
     # Each example row lists its values in the same order as `inputs` below.
     # cache_examples=False is passed, so example outputs are not pre-computed.
     gr.Examples(
         examples=[
+            [ex_script1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json1, 1.0, "Speak naturally."],
+            [ex_script2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json2, 1.1, ""],
+            ["[N] Single line.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]],
+        inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
+                detailed_speaker_config_input, global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
+        fn=handle_script_processing, cache_examples=False)
 if __name__ == "__main__":
     # Script entry point. On Windows, switch asyncio to the selector event
     # loop policy before the app is launched.
     if os.name == 'nt':
         selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
         asyncio.set_event_loop_policy(selector_policy)
     demo.launch()