# Source: Hugging Face Space "abocha/esl-dialogue-tts" (status: Running; file size: 7,772 bytes)

import gradio as gr
from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES # Import directly for APP_AVAILABLE_VOICES

# --- UI Constants and Configuration ---

# Model identifiers offered as choices in the "TTS Model" dropdown.
TTS_MODELS_AVAILABLE = [
    "tts-1",
    "tts-1-hd",
    "gpt-4o-mini-tts",
]
# Model to fall back on when the environment variable is unset or invalid.
MODEL_DEFAULT_ENV = "tts-1-hd"

# Labels for the ways voices can be assigned to speakers, in the order they
# appear in the "Configuration Method" dropdown.
SPEAKER_CONFIG_METHODS = [
    "Single Voice (Global)",
    "Random per Speaker",
    "A/B Round Robin",
    "Detailed Configuration (Per Speaker UI)",
]
# Method preselected in the dropdown.
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
# Voices offered by the app: a copy of the imported voice collection, or a
# single-entry absolute fallback if that collection turns out to be empty.
APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy() or ["alloy"]

# Maps each vibe label to the instruction text associated with it.
# "None" maps to an empty string, and "Custom..." maps to the sentinel
# string "CUSTOM" rather than to instruction text.
PREDEFINED_VIBES = {
    "None": "",
    "Calm": "Speak in a calm, composed, and relaxed manner.",
    "Excited": "Speak with an energetic, enthusiastic, and lively tone.",
    "Happy": "Speak with a cheerful, bright, and joyful voice.",
    "Sad": "Speak with a sorrowful, melancholic, and dejected tone.",
    "Whisper": "Speak softly, as if whispering.",
    "Angry": "Speak with a strong, firm, and possibly agitated voice.",
    "Fearful": "Speak with a trembling, hesitant, and scared voice.",
    "Formal": "Speak in a clear, precise, and professional tone, suitable for a formal address.",
    "Authoritative": "Speak with a commanding, confident, and firm voice.",
    "Friendly": "Speak in a warm, approachable, and amiable manner.",
    "Custom...": "CUSTOM",
}
# Vibe labels in definition order.
VIBE_CHOICES = [*PREDEFINED_VIBES]
DEFAULT_VIBE = "None"
# Preselected global voice: the first available voice, or "alloy" if the
# list of available voices is empty.
if APP_AVAILABLE_VOICES:
    DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0]
else:
    DEFAULT_GLOBAL_VOICE = "alloy"


# --- UI Element Creation Functions ---

def create_main_input_components(model_default_value):
    """Build the script, model, pause, and global speed/instructions inputs.

    The components are laid out in one row: a wide column holding the
    dialogue script text area, and a narrower column holding the model
    dropdown, the pause field, and the two model-specific global inputs.

    Args:
        model_default_value: Initial value of the TTS model dropdown. It also
            decides which global input starts out visible: the speed slider
            for "tts-1"/"tts-1-hd", the instructions box for
            "gpt-4o-mini-tts". For any other value both start hidden.

    Returns:
        A tuple ``(script_input, tts_model_dropdown, pause_input,
        global_speed_input, global_instructions_input)``.
    """
    # Initial visibility of the model-specific inputs.
    show_speed = model_default_value in ("tts-1", "tts-1-hd")
    show_instructions = model_default_value == "gpt-4o-mini-tts"

    with gr.Row():
        with gr.Column(scale=2):
            script_input = gr.TextArea(
                label="Dialogue Script",
                placeholder="[Speaker1] Hello world!\n[Speaker2] How are you today?",
                lines=10,
            )

        with gr.Column(scale=1):
            tts_model_dropdown = gr.Dropdown(
                TTS_MODELS_AVAILABLE,
                label="TTS Model",
                value=model_default_value,
            )
            pause_input = gr.Number(
                label="Pause Between Lines (ms)",
                value=500,
                minimum=0,
                maximum=5000,
                step=50,
            )
            global_speed_input = gr.Slider(
                label="Global Speed (for tts-1/hd)",
                minimum=0.25,
                maximum=4.0,
                value=1.0,
                step=0.05,
                visible=show_speed,
                interactive=True,
            )
            global_instructions_input = gr.Textbox(
                label="Global Instructions (for gpt-4o-mini-tts)",
                placeholder="e.g., Speak with a calm tone.",
                lines=2,
                visible=show_instructions,
                interactive=True,
            )

    return (
        script_input,
        tts_model_dropdown,
        pause_input,
        global_speed_input,
        global_instructions_input,
    )

def create_speaker_config_components():
    """Build the speaker configuration method selector and its sub-panels.

    Creates a heading, the configuration-method dropdown, a group holding the
    global voice dropdown, and a column holding the per-speaker controls
    (a load/refresh button, a hint, and an empty column for dynamic UI).
    Each sub-panel starts visible only if its method equals
    DEFAULT_SPEAKER_CONFIG_METHOD.

    Returns:
        A tuple ``(speaker_config_method_dropdown, single_voice_group,
        global_voice_dropdown, detailed_per_speaker_ui_group,
        load_per_speaker_ui_button, dynamic_speaker_ui_area)``.
    """
    gr.Markdown("### Speaker Voice & Style Configuration")
    speaker_config_method_dropdown = gr.Dropdown(
        SPEAKER_CONFIG_METHODS,
        label="Configuration Method",
        value=DEFAULT_SPEAKER_CONFIG_METHOD,
    )

    # Initial visibility only; event handlers keyed on the method dropdown
    # are expected to toggle these panels afterwards.
    single_voice_visible = DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"
    detailed_visible = (
        DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"
    )

    with gr.Group(visible=single_voice_visible) as single_voice_group:
        global_voice_dropdown = gr.Dropdown(
            APP_AVAILABLE_VOICES,
            label="Global Voice",
            value=DEFAULT_GLOBAL_VOICE,
            interactive=True,
        )

    with gr.Column(visible=detailed_visible) as detailed_per_speaker_ui_group:
        load_per_speaker_ui_button = gr.Button(
            "Load/Refresh Per-Speaker Settings UI (from Script Above)"
        )
        gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are applied per-speaker. If script changes, click again to refresh.</small>")
        dynamic_speaker_ui_area = gr.Column(elem_id="dynamic_ui_area_for_speakers")

    return (
        speaker_config_method_dropdown,
        single_voice_group,
        global_voice_dropdown,
        detailed_per_speaker_ui_group,
        load_per_speaker_ui_button,
        dynamic_speaker_ui_area,
    )

def create_action_and_output_components():
    """Build the action buttons and the widgets that display results.

    Layout, top to bottom: a row with the two action buttons, the
    estimated-cost box, a row with the ZIP file and merged-audio outputs,
    and a status box.

    Returns:
        A tuple ``(calculate_cost_button, generate_button, cost_output,
        individual_lines_zip_output, merged_dialogue_mp3_output,
        status_output)``.
    """
    with gr.Row():
        calculate_cost_button = gr.Button("Calculate Estimated Cost")
        generate_button = gr.Button("Generate Audio", variant="primary")

    cost_output = gr.Textbox(label="Estimated Cost", interactive=False)

    with gr.Row():
        individual_lines_zip_output = gr.File(
            label="Download Individual Lines (ZIP)"
        )
        merged_dialogue_mp3_output = gr.Audio(
            label="Play/Download Merged Dialogue (MP3)",
            type="filepath",
        )

    status_output = gr.Textbox(
        label="Status",
        interactive=False,
        lines=2,
        max_lines=5,
    )

    return (
        calculate_cost_button,
        generate_button,
        cost_output,
        individual_lines_zip_output,
        merged_dialogue_mp3_output,
        status_output,
    )

def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=None):
    """Render the "Example Scripts" section.

    Each built-in example row holds eight values; a row is kept only if its
    length equals ``len(inputs_for_examples)``. Rejected rows are reported
    with a printed warning.

    Args:
        inputs_for_examples: Components the example values are loaded into;
            its length determines which example rows are considered valid.
        process_fn: Function to run when an example is clicked. Examples are
            made runnable only if this and ``outputs_for_examples`` are both
            truthy.
        outputs_for_examples: Components that receive ``process_fn``'s
            results when examples are runnable.

    Returns:
        The created ``gr.Examples`` object, or ``None`` (after rendering an
        orange notice) when no example row matches the number of inputs.
    """
    gr.Markdown("## Example Scripts")

    detailed_config_script = (
        "[Alice] Hello Bob, this is a test using the detailed configuration method.\n"
        "[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\n"
        "[Alice] Let's see how this sounds."
    )
    short_story_script = (
        "[Narrator] This is a short story.\n"
        "[CharacterA] Once upon a time...\n"
        "[Narrator] ...there was a Gradio app.\n"
        "[CharacterB] And it could talk!"
    )

    # Row layout presumably mirrors the caller's input components (script,
    # model, pause, config method, global voice, per-speaker settings,
    # speed, instructions) -- verify against the caller.
    candidate_rows = [
        [detailed_config_script, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
        [short_story_script, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
        ["[Solo] Just one line, using global voice and speed.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
    ]

    expected_len = len(inputs_for_examples)
    usable_rows = []
    for row in candidate_rows:
        if len(row) != expected_len:
            print(f"Warning: Example data mismatch. Expected {expected_len} items, got {len(row)}. Skipping example: {row[0][:30]}...")
            continue
        usable_rows.append(row)

    if not usable_rows:
        gr.Markdown("<p style='color: orange;'>No valid examples could be loaded due to configuration mismatch.</p>")
        return None

    # Arguments common to both the runnable and the load-only variants.
    shared_kwargs = {
        "examples": usable_rows,
        "inputs": inputs_for_examples,
        "examples_per_page": 5,
    }

    if process_fn and outputs_for_examples:
        # Runnable examples: clicking one loads the inputs and calls
        # process_fn. Caching stays off because of external API calls.
        return gr.Examples(
            outputs=outputs_for_examples,
            fn=process_fn,
            cache_examples=False,
            label="Example Scripts (Click to Load & Run)",
            run_on_click=True,
            **shared_kwargs,
        )

    # Load-only examples: no fn, outputs, or run_on_click.
    return gr.Examples(
        label="Example Scripts (Click to Load Inputs)",
        **shared_kwargs,
    )