Spaces:

abocha
/

esl-dialogue-tts

Running

File size: 17,161 Bytes

8468afb

import gradio as gr
import os
import asyncio
import tempfile
import shutil
import zipfile
import random
from functools import partial

from utils.script_parser import parse_dialogue_script, calculate_cost
from utils.openai_tts import synthesize_speech_line
from utils.merge_audio import merge_mp3_files

# Import constants from ui_layout to avoid circular dependencies if they were in app.py
from ui_layout import APP_AVAILABLE_VOICES, DEFAULT_VIBE, VIBE_CHOICES, PREDEFINED_VIBES

# Fallback voice: the first entry of APP_AVAILABLE_VOICES, or "alloy" when that
# collection is empty (not expected with the current ui_layout).
DEFAULT_FALLBACK_VOICE = (APP_AVAILABLE_VOICES or ["alloy"])[0]


def get_speakers_from_script(script_text: str):
    """Return the unique speaker names of a dialogue script, in first-appearance order.

    Args:
        script_text: Raw dialogue script. ``None`` or whitespace-only text is
            treated as an empty script.

    Returns:
        A list of speaker names, each listed once and ordered by first
        occurrence. The list is empty when the script is empty or when
        ``parse_dialogue_script`` raises ``ValueError``.
    """
    # Guard against None as well as blank text so callers need not pre-validate.
    if not script_text or not script_text.strip():
        return []
    try:
        parsed_lines, _ = parse_dialogue_script(script_text)
    except ValueError:
        # A script that cannot be parsed has no speakers to offer.
        return []
    # dict keys keep insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(line["speaker"] for line in parsed_lines))


def handle_dynamic_input_change(new_value, current_configs_state_dict: dict, speaker_name: str, config_key: str, tts_model: str):
    """Store one per-speaker setting coming from a dynamically generated UI control.

    The state dictionary is updated in place and returned; a new dictionary is
    created when ``None`` is passed. ``tts_model`` is part of the signature but
    is not used by this function.
    """
    state = {} if current_configs_state_dict is None else current_configs_state_dict
    # setdefault creates the speaker's settings dict the first time it is needed.
    state.setdefault(speaker_name, {})[config_key] = new_value
    return state


def load_refresh_per_speaker_ui(script_text: str, current_configs_state_dict: dict, tts_model: str, speaker_configs_state_component: gr.State):
    """
    Build (or rebuild) one settings accordion per speaker found in the script.

    Each speaker detected in ``script_text`` gets an entry in the configuration
    state, with any missing keys filled in with defaults, plus an accordion
    holding voice, speed, vibe and custom-instruction controls. The controls are
    wired to ``handle_dynamic_input_change`` with ``speaker_configs_state_component``
    as both input and output.

    Returns a ``(components, state)`` tuple: the list of accordions (or a single
    Markdown notice when no speaker was found) and the updated state dictionary.
    """
    print(f"Load/Refresh UI called. TTS Model: {tts_model}") # Debug
    speakers = get_speakers_from_script(script_text)
    state = {} if current_configs_state_dict is None else current_configs_state_dict

    # First available voice, with a hard-coded last resort.
    fallback_voice = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"

    # Make sure every speaker has a complete settings dict before building UI.
    default_settings = (
        ("voice", fallback_voice),
        ("speed", 1.0),
        ("vibe", DEFAULT_VIBE),
        ("custom_instructions", ""),
    )
    for name in speakers:
        settings = state.setdefault(name, {})
        for key, default in default_settings:
            settings.setdefault(key, default)

    if not speakers:
        print("No unique speakers found, returning markdown.") # Debug
        notice = gr.Markdown("No speakers detected in the script, or script is empty. Type a script and click 'Load/Refresh' again, or change the script content.")
        return [notice], state

    print(f"Found speakers: {speakers}. Building UI...") # Debug

    # Model capabilities are the same for every speaker, so resolve them once.
    speed_enabled = tts_model in ["tts-1", "tts-1-hd"]
    uses_instructions = tts_model == "gpt-4o-mini-tts"
    speed_label = "Speech Speed" + (" (Active for tts-1/hd)" if speed_enabled else " (N/A for this model)")
    vibe_label = "Vibe/Emotion Preset" + (" (For gpt-4o-mini-tts)" if uses_instructions else " (Less impact on other models)")

    def _state_writer(speaker: str, key: str):
        # Callback that writes one setting of one speaker into the state dict.
        return partial(handle_dynamic_input_change, speaker_name=speaker, config_key=key, tts_model=tts_model)

    state_io = [speaker_configs_state_component]
    accordions = []
    for name in speakers:
        settings = state[name]

        # A per-speaker elem_id helps Gradio tell the accordions apart.
        with gr.Accordion(
            label=f"Settings for: {name}",
            open=False,
            elem_id=f"accordion_speaker_{name.replace(' ', '_')}",
        ) as accordion:
            # Voice
            voice_dd = gr.Dropdown(
                label="Voice",
                choices=APP_AVAILABLE_VOICES,
                value=settings.get("voice", fallback_voice),
                interactive=True,
            )
            voice_dd.change(
                fn=_state_writer(name, "voice"),
                inputs=[voice_dd, speaker_configs_state_component],
                outputs=state_io,
            )

            # Speed (only wired up when the model honours it)
            speed_slider = gr.Slider(
                label=speed_label,
                minimum=0.25,
                maximum=4.0,
                value=float(settings.get("speed", 1.0)),
                step=0.05,
                interactive=speed_enabled,
            )
            if speed_enabled:
                speed_slider.release(
                    fn=_state_writer(name, "speed"),
                    inputs=[speed_slider, speaker_configs_state_component],
                    outputs=state_io,
                )

            # Vibe preset
            vibe_dd = gr.Dropdown(
                label=vibe_label,
                choices=VIBE_CHOICES,
                value=settings.get("vibe", DEFAULT_VIBE),
                interactive=True,
            )
            vibe_dd.change(
                fn=_state_writer(name, "vibe"),
                inputs=[vibe_dd, speaker_configs_state_component],
                outputs=state_io,
            )

            # Free-form instructions
            custom_instr_tb = gr.Textbox(
                label="Custom Instructions",
                value=settings.get("custom_instructions", ""),
                placeholder="Used if Vibe is 'Custom...'. Overrides Vibe preset.",
                lines=2,
                interactive=True,
            )
            custom_instr_tb.input(
                fn=_state_writer(name, "custom_instructions"),
                inputs=[custom_instr_tb, speaker_configs_state_component],
                outputs=state_io,
            )
        accordions.append(accordion)

    print(f"Returning {len(accordions)} UI components for dynamic area.") # Debug
    return accordions, state


def _filename_safe(name: str) -> str:
    """Return *name* with every character except letters, digits, '-' and '_' replaced by '_'."""
    return "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in name)


def _assign_speaker_voices(method: str, speakers) -> dict:
    """Map each speaker to a voice for the automatic assignment methods; empty dict otherwise."""
    pool = list(APP_AVAILABLE_VOICES or []) or [DEFAULT_FALLBACK_VOICE]  # never an empty pool
    if method == "Random per Speaker":
        return {spk: random.choice(pool) for spk in speakers}
    if method == "A/B Round Robin":
        return {spk: pool[i % len(pool)] for i, spk in enumerate(speakers)}
    return {}


def _resolve_line_settings(
    speaker_name: str, method: str, tts_model: str, speaker_configs: dict,
    speaker_voice_map: dict, default_voice: str, global_speed: float, global_instructions,
):
    """Return the ``(voice, speed, instructions)`` to use for one line spoken by *speaker_name*."""
    # Blank global instructions count as "no instructions".
    global_instr = global_instructions if global_instructions and global_instructions.strip() else None
    voice, speed, instructions = default_voice, global_speed, global_instr

    if method == "Detailed Configuration (Per Speaker UI)":
        cfg = speaker_configs.get(speaker_name) or {}
        voice = cfg.get("voice") or default_voice
        if tts_model in ("tts-1", "tts-1-hd"):
            speed = float(cfg.get("speed", global_speed))
        if tts_model == "gpt-4o-mini-tts":
            vibe = cfg.get("vibe", DEFAULT_VIBE)
            # Tolerate a stored None for the custom text.
            custom = (cfg.get("custom_instructions") or "").strip()
            if vibe == "Custom..." and custom:
                instructions = custom
            elif vibe not in ("None", "Custom..."):
                instructions = PREDEFINED_VIBES.get(vibe, "")
            if not instructions:
                # No usable per-speaker instructions: fall back to the global ones.
                instructions = global_instr
    elif method in ("Random per Speaker", "A/B Round Robin"):
        voice = speaker_voice_map.get(speaker_name, default_voice)

    # Speed is only applied for the tts-1 family; every other model gets 1.0.
    if tts_model not in ("tts-1", "tts-1-hd"):
        speed = 1.0
    return voice, speed, instructions


async def handle_script_processing(
    openai_api_key: str, async_openai_client, nsfw_api_url_template: str, # Passed from app.py
    dialogue_script: str, tts_model: str, pause_ms: int,
    speaker_config_method: str, global_voice_selection: str,
    speaker_configs_state_dict: dict,
    global_speed: float,
    global_instructions: str,
    progress=gr.Progress(track_tqdm=True)
):
    """Synthesize every line of a dialogue script and package the resulting audio.

    Each parsed line is synthesized concurrently via ``synthesize_speech_line``
    into a fresh per-job temporary directory. The successfully produced files
    are zipped and merged (in script order) into a single MP3.

    Args:
        openai_api_key: API key; only checked for being non-empty here.
        async_openai_client: Client object forwarded to ``synthesize_speech_line``.
        nsfw_api_url_template: Forwarded unchanged to ``synthesize_speech_line``.
        dialogue_script: Raw script text.
        tts_model: Model name; decides whether speed / instructions apply.
        pause_ms: Pause length forwarded to ``merge_mp3_files``.
        speaker_config_method: One of the voice assignment strategies offered by the UI.
        global_voice_selection: Voice used when no per-speaker voice applies.
        speaker_configs_state_dict: Per-speaker settings (may be ``None``).
        global_speed: Speed used when no per-speaker speed applies.
        global_instructions: Instructions used when no per-speaker ones apply.
        progress: Gradio progress tracker; invoked once per line as its
            synthesis task is queued.

    Returns:
        A ``(zip_path, merged_path, status_message)`` tuple. Both paths are
        ``None`` on failure, in which case the message describes the error.
    """
    if not openai_api_key or not async_openai_client:
        return None, None, "Error: OpenAI API Key or client is not configured."
    if not dialogue_script or not dialogue_script.strip():
        return None, None, "Error: Script is empty."

    # Parse before touching the filesystem so parse failures need no cleanup.
    try:
        parsed_lines, _ = parse_dialogue_script(dialogue_script)
    except ValueError as e:
        return None, None, f"Script parsing error: {str(e)}"
    if not parsed_lines:
        return None, None, "Error: No valid lines found in script."

    speaker_configs = speaker_configs_state_dict or {}
    default_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else DEFAULT_FALLBACK_VOICE

    # Unique speakers in first-appearance order, taken from the already parsed lines.
    script_speakers = dict.fromkeys(line["speaker"] for line in parsed_lines)
    speaker_voice_map = _assign_speaker_voices(speaker_config_method, script_speakers)

    # mkdtemp guarantees a new, unique directory, so concurrent jobs can never
    # share (or delete) each other's working directory.
    job_dir = tempfile.mkdtemp(prefix="dialogue_tts_job_")
    keep_job_dir = False  # flipped only when results are handed back to the caller
    try:
        total_lines = len(parsed_lines)
        tasks = []
        for i, line_data in enumerate(parsed_lines):
            speaker_name = line_data["speaker"]
            line_voice, line_speed, line_instructions = _resolve_line_settings(
                speaker_name, speaker_config_method, tts_model, speaker_configs,
                speaker_voice_map, default_voice, global_speed, global_instructions,
            )

            # The speaker name comes from user text; keep it from altering the path.
            out_fn = os.path.join(job_dir, f"line_{line_data['id']}_{_filename_safe(speaker_name)}.mp3")
            progress(i / total_lines, desc=f"Synthesizing: Line {i+1}/{total_lines} ({speaker_name})")

            tasks.append(synthesize_speech_line(
                client=async_openai_client, text=line_data["text"], voice=line_voice,
                output_path=out_fn, model=tts_model, speed=line_speed,
                instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
                line_index=line_data['id']
            ))

        # gather() returns results in task order, i.e. in script order.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        synthesized_files = []
        for line_data, result in zip(parsed_lines, results):
            # BaseException also covers asyncio.CancelledError returned by gather().
            if isinstance(result, BaseException):
                print(f"Error synthesizing line ID {line_data['id']} ({line_data['speaker']}): {result}")
            elif result is None:
                print(f"Skipped or failed synthesizing line ID {line_data['id']} ({line_data['speaker']})")
            elif result and os.path.exists(result) and os.path.getsize(result) > 0:
                synthesized_files.append(result)

        if not synthesized_files:
            return None, None, "Error: No audio was successfully synthesized."

        zip_fn = os.path.join(job_dir, "dialogue_lines.zip")
        with zipfile.ZipFile(zip_fn, 'w') as zf:
            for f_path in synthesized_files:
                zf.write(f_path, os.path.basename(f_path))

        merged_fn = os.path.join(job_dir, "merged_dialogue.mp3")
        merged_path = merge_mp3_files(synthesized_files, merged_fn, pause_ms)

        status = f"Successfully processed {len(synthesized_files)} out of {total_lines} lines. "
        if len(synthesized_files) < total_lines:
            status += "Some lines may have failed. "
        status += "Merged audio generated." if merged_path else "Merging audio failed. "

        keep_job_dir = True  # the returned paths live inside job_dir
        return (zip_fn if os.path.exists(zip_fn) else None,
                merged_path if merged_path and os.path.exists(merged_path) else None,
                status)
    finally:
        # Remove the working directory on every error path (including exceptions).
        if not keep_job_dir:
            shutil.rmtree(job_dir, ignore_errors=True)


def handle_calculate_cost(dialogue_script: str, tts_model: str):
    """Return a human-readable cost estimate for synthesizing the script.

    Args:
        dialogue_script: Raw script text; ``None`` or blank text counts as empty.
        tts_model: Model name forwarded to ``calculate_cost``.

    Returns:
        A status string: the estimated cost, a zero-cost notice for an empty or
        line-less script, or an error description. This function reports
        failures in the returned text instead of raising.
    """
    # Guard against None too, so the handler never raises on missing input.
    if not dialogue_script or not dialogue_script.strip():
        return "Cost: $0.00 (Script is empty)"
    try:
        parsed, chars = parse_dialogue_script(dialogue_script)
        if not parsed:
            return "Cost: $0.00 (No valid lines in script)"
        cost = calculate_cost(chars, len(parsed), tts_model)
        return f"Estimated Cost for {len(parsed)} lines ({chars} chars): ${cost:.6f}"
    except ValueError as e:
        return f"Cost calculation error: {str(e)}"
    except Exception as e:
        # Deliberate catch-all: any other failure is reported as text.
        return f"An unexpected error occurred during cost calculation: {str(e)}"

def update_model_controls_visibility(selected_model: str, script_text_for_refresh: str, current_speaker_configs_for_refresh: dict, speaker_configs_state_comp: gr.State):
    """React to a TTS model change: toggle the global controls and rebuild the per-speaker UI.

    Returns a 4-tuple, in this order: update for the global speed input, update
    for the global instructions input, the new per-speaker UI components, and
    the speaker configuration state. The order must match the ``outputs`` list
    of the event this handler is attached to.
    """
    print(f"Model changed to: {selected_model}. Refreshing dynamic UI and controls.") # Debug
    try:
        # The state component is forwarded so the rebuilt controls can be wired to it.
        speaker_ui, new_state = load_refresh_per_speaker_ui(
            script_text_for_refresh,
            current_speaker_configs_for_refresh,
            selected_model,
            speaker_configs_state_comp,
        )
    except Exception as e:
        print(f"Error in load_refresh_per_speaker_ui called from model_controls_visibility: {e}")
        # Fallback: show the error in place of the per-speaker UI and leave the state untouched.
        speaker_ui = [gr.Markdown(f"Error refreshing per-speaker UI: {e}")]
        new_state = current_speaker_configs_for_refresh

    speed_applies = selected_model in ["tts-1", "tts-1-hd"]
    instructions_apply = selected_model == "gpt-4o-mini-tts"

    return (
        gr.update(visible=speed_applies, interactive=speed_applies),
        gr.update(visible=instructions_apply, interactive=instructions_apply),
        speaker_ui,  # new children for the dynamic speaker area
        new_state,
    )


def update_speaker_config_method_visibility(method: str):
    """Return visibility updates for the UI groups that depend on the speaker configuration method.

    The result is a dict with the keys ``"single_voice_group"`` and
    ``"detailed_per_speaker_ui_group"``, each mapped to a ``gr.update`` that
    shows the group only when its method is selected.
    """
    # NOTE(review): the keys are plain strings; presumably the caller maps them to
    # the actual Gradio components of the event's `outputs` list - confirm.
    group_visibility = {
        "single_voice_group": method == "Single Voice (Global)",
        "detailed_per_speaker_ui_group": method == "Detailed Configuration (Per Speaker UI)",
    }
    return {group: gr.update(visible=shown) for group, shown in group_visibility.items()}