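"""Gradio app: convert a multi-speaker dialogue script to speech via the OpenAI TTS API.

UI layout comes from ui_layout and event wiring from event_handlers. To run locally
(assuming this file is saved as app.py and OPENAI_API_KEY is available, either in the
environment or as a Hugging Face Space secret):

    python app.py
"""
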
import gradio as gr
import os
import asyncio
from openai import AsyncOpenAI
from functools import partial # For handle_script_processing

# Import UI creation functions and constants
from ui_layout import (
    create_main_input_components, create_speaker_config_components,
    create_action_and_output_components, create_examples_ui,
    TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV
)

# Import event handler functions
from event_handlers import (
    handle_script_processing, handle_calculate_cost,
    update_model_controls_visibility, update_speaker_config_method_visibility,
    load_refresh_per_speaker_ui
)

# --- Application Secrets and Global Client ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
MODEL_DEFAULT_FROM_ENV = os.getenv("MODEL_DEFAULT", MODEL_DEFAULT_ENV)

# Validate MODEL_DEFAULT_FROM_ENV; fall back to the hardcoded default if it is not an available TTS model
EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_ENV if MODEL_DEFAULT_FROM_ENV in TTS_MODELS_AVAILABLE else MODEL_DEFAULT_ENV

async_openai_client = None
if not OPENAI_API_KEY:
    try:
        # Attempt to load from Hugging Face Hub secrets if not in env
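        # Note: Space secrets are normally injected as environment variables at runtime,
        # and secret values may not be readable via the Hub API; treat this as a
        # best-effort fallback that can fail and simply log the exception below.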
        from huggingface_hub import HfApi
        api = HfApi()
        space_id = os.getenv("SPACE_ID") # Provided by HF Spaces
        if space_id:
            secrets = api.get_space_secrets(repo_id=space_id)
            OPENAI_API_KEY = secrets.get("OPENAI_API_KEY")
            NSFW_API_URL_TEMPLATE = secrets.get("NSFW_API_URL_TEMPLATE", NSFW_API_URL_TEMPLATE)
            MODEL_DEFAULT_FROM_HUB = secrets.get("MODEL_DEFAULT", EFFECTIVE_MODEL_DEFAULT)
            EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_HUB if MODEL_DEFAULT_FROM_HUB in TTS_MODELS_AVAILABLE else EFFECTIVE_MODEL_DEFAULT
            print("Loaded secrets from Hugging Face Hub.")
    except Exception as e:
        print(f"Could not retrieve secrets from Hugging Face Hub: {e}. OPENAI_API_KEY might be missing.")

if OPENAI_API_KEY:
    async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
else:
    print("CRITICAL ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")


# --- Gradio Application UI and Logic ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Dialogue Script to Speech (OpenAI TTS) - Refactored")
    if not OPENAI_API_KEY or not async_openai_client:
        gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")

    # Central state for detailed speaker configurations
    speaker_configs_state = gr.State({}) # Backs the dynamic per-speaker UI and feeds the generation handler

    # --- Define UI Components by calling layout functions ---
    (script_input, tts_model_dropdown, pause_input,
     global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)

    (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
     detailed_per_speaker_ui_group, load_per_speaker_ui_button,
     dynamic_speaker_ui_area) = create_speaker_config_components()

    (calculate_cost_button, generate_button, cost_output,
     individual_lines_zip_output, merged_dialogue_mp3_output,
     status_output) = create_action_and_output_components()

    # --- Event Wiring ---

    # When TTS model changes, update visibility of global speed/instructions & refresh dynamic UI
    tts_model_dropdown.change(
        fn=update_model_controls_visibility,
        inputs=[tts_model_dropdown, script_input, speaker_configs_state, speaker_configs_state],
        outputs=[global_speed_input, global_instructions_input, dynamic_speaker_ui_area, speaker_configs_state]
    )

    # When speaker config method changes, update visibility of relevant UI groups
    speaker_config_method_dropdown.change(
        fn=update_speaker_config_method_visibility,
        inputs=[speaker_config_method_dropdown],
        outputs=[single_voice_group, detailed_per_speaker_ui_group]
    )
    
    # Button to load/refresh the detailed per-speaker UI configurations
    load_per_speaker_ui_button.click(
        fn=load_refresh_per_speaker_ui,
        inputs=[script_input, speaker_configs_state, tts_model_dropdown, speaker_configs_state], 
        outputs=[dynamic_speaker_ui_area, speaker_configs_state]
    )
    
    # Calculate cost button
    calculate_cost_button.click(
        fn=handle_calculate_cost,
        inputs=[script_input, tts_model_dropdown],
        outputs=[cost_output]
    )
    
    # Generate audio button
    # Use functools.partial to pass fixed arguments like API key and client to the handler
    # Gradio inputs will be appended to these fixed arguments when the handler is called.
    generate_button_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
    generate_button.click(
        fn=generate_button_fn,
        inputs=[
            script_input, tts_model_dropdown, pause_input,
            speaker_config_method_dropdown, global_voice_dropdown,
            speaker_configs_state, # gr.State component; the handler receives its current value
            global_speed_input, global_instructions_input
        ],
        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
    )

    # --- Examples UI ---
    example_inputs_list = [
        script_input, tts_model_dropdown, pause_input,
        speaker_config_method_dropdown, global_voice_dropdown,
        speaker_configs_state, 
        global_speed_input, global_instructions_input
    ]
    
    example_outputs_list = [individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
    
    # Make examples runnable
    example_process_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
    
    _ = create_examples_ui(
        inputs_for_examples=example_inputs_list,
        process_fn=example_process_fn if OPENAI_API_KEY else None, # Only make runnable if API key exists
        outputs_for_examples=example_outputs_list if OPENAI_API_KEY else None
    )


# --- Launch ---
if __name__ == "__main__":
    if os.name == 'nt':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
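    # queue() routes requests through Gradio's queue so long-running TTS jobs
    # don't block the server; share=False keeps the app from opening a public link.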
    demo.queue().launch(debug=True, share=False)