File size: 10,785 Bytes
e5a707f
1190db4
 
 
 
635a2fb
49a48a4
8468afb
 
 
 
49a48a4
 
8468afb
 
 
 
92c9b3d
 
 
49a48a4
 
8468afb
 
1190db4
5c85d81
8468afb
 
 
e5a707f
1190db4
 
 
 
635a2fb
1190db4
 
 
 
8468afb
 
 
1190db4
8468afb
1190db4
 
 
 
8468afb
1190db4
d44dfc0
92c9b3d
 
1190db4
8468afb
1190db4
92c9b3d
d44dfc0
8468afb
 
d44dfc0
8468afb
49a48a4
92c9b3d
d44dfc0
8468afb
 
 
5c85d81
49a48a4
92c9b3d
 
 
 
 
49a48a4
 
92c9b3d
49a48a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92c9b3d
 
59d4d10
92c9b3d
 
 
59d4d10
5c85d81
59d4d10
92c9b3d
8468afb
92c9b3d
59d4d10
a2f0e99
d44dfc0
92c9b3d
e73ce82
92c9b3d
a2f0e99
5c85d81
8468afb
 
 
 
 
d44dfc0
b7680b4
1190db4
b7680b4
d44dfc0
8468afb
 
635a2fb
d44dfc0
 
8468afb
d48101f
1190db4
8468afb
 
 
b7680b4
8468afb
 
 
b7680b4
8468afb
b7680b4
 
635a2fb
b7680b4
 
 
1190db4
49a48a4
d48101f
b7680b4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# FILE: app.py
import gradio as gr
import os
import asyncio
from openai import AsyncOpenAI
from functools import partial 
import datetime 

from ui_layout import (
    create_main_input_components, create_speaker_config_components,
    create_action_and_output_components, create_examples_ui,
    TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES, 
    DEFAULT_GLOBAL_VOICE, VIBE_CHOICES, DEFAULT_VIBE, PREDEFINED_VIBES
)

from event_handlers import (
    handle_script_processing, handle_calculate_cost,
    handle_speaker_config_method_visibility_change, 
    handle_tts_model_change, 
    handle_load_refresh_per_speaker_ui_trigger,
    handle_dynamic_accordion_input_change, # For controls inside @gr.render
    get_speakers_from_script
)

# --- Runtime configuration --------------------------------------------------
# Secrets are read from environment variables first; when the API key is
# absent (e.g. on a Hugging Face Space), a best-effort fallback queries the
# Hub secrets API below.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
MODEL_DEFAULT_FROM_ENV = os.getenv("MODEL_DEFAULT", MODEL_DEFAULT_ENV)

# Only honour the env-supplied default model when it is a known model.
if MODEL_DEFAULT_FROM_ENV in TTS_MODELS_AVAILABLE:
    EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_ENV
else:
    EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_ENV

async_openai_client = None

if not OPENAI_API_KEY:
    # Fallback path: try to pull secrets via the Hub API. Deliberately
    # best-effort — any failure is logged and startup continues.
    try:
        from huggingface_hub import HfApi

        api = HfApi()
        if space_id := os.getenv("SPACE_ID"):
            secrets = api.get_space_secrets(repo_id=space_id)
            OPENAI_API_KEY = secrets.get("OPENAI_API_KEY")
            NSFW_API_URL_TEMPLATE = secrets.get("NSFW_API_URL_TEMPLATE", NSFW_API_URL_TEMPLATE)
            MODEL_DEFAULT_FROM_HUB = secrets.get("MODEL_DEFAULT", EFFECTIVE_MODEL_DEFAULT)
            if MODEL_DEFAULT_FROM_HUB in TTS_MODELS_AVAILABLE:
                EFFECTIVE_MODEL_DEFAULT = MODEL_DEFAULT_FROM_HUB
            print("Loaded secrets from Hugging Face Hub.")
    except Exception as e:
        print(f"Could not retrieve secrets from Hugging Face Hub: {e}. OPENAI_API_KEY might be missing.")

if OPENAI_API_KEY:
    async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
else:
    # The UI still launches so the in-app warning banner can be shown.
    print("CRITICAL ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")


# ---------------------------------------------------------------------------
# UI definition. NOTE(review): in Gradio, component creation order and the
# listener wiring below are order-dependent, so this block is annotated
# rather than restructured.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
    gr.Markdown("# Dialogue Script to Speech (OpenAI TTS) - Using @gr.render")
    if not OPENAI_API_KEY or not async_openai_client:
        # In-app banner: generation cannot work without a valid API key.
        gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")

    # Per-speaker configuration, keyed by speaker name. The dynamic controls
    # rendered below read/write the keys "voice", "speed", "vibe" and
    # "custom_instructions" for each speaker.
    speaker_configs_state = gr.State({}) 

    # Static components come from ui_layout factory helpers.
    (script_input, tts_model_dropdown, pause_input,
     global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)

    (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
     detailed_per_speaker_ui_group_container,
     load_per_speaker_ui_button) = create_speaker_config_components()

    (calculate_cost_button, generate_button, cost_output,
     individual_lines_zip_output, merged_dialogue_mp3_output,
     status_output) = create_action_and_output_components()

    with detailed_per_speaker_ui_group_container: # Define @gr.render inside this column
        # Re-rendered from scratch on either trigger: an explicit
        # "Load/Refresh" click or a TTS model change.
        @gr.render(
            inputs=[script_input, speaker_configs_state, tts_model_dropdown],
            triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
        )
        def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
            """Defines and re-renders the dynamic UI for per-speaker configuration.

            Builds one accordion of controls per speaker found in the script.
            Which extra controls appear depends on the selected model: a speed
            slider for tts-1/tts-1-hd, vibe + custom-instruction controls for
            gpt-4o-mini-tts.

            Args:
                current_script_text: Raw dialogue script; speakers are parsed
                    from it via ``get_speakers_from_script``.
                current_speaker_configs: Current value of
                    ``speaker_configs_state`` (may arrive as a non-dict, hence
                    the defensive check in the loop below).
                current_tts_model: Currently selected TTS model name.
            """
            print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs,dict) else 'Not a dict'}")
            
            unique_speakers = get_speakers_from_script(current_script_text)

            if not unique_speakers:
                # Nothing to configure yet: show a usage hint instead of controls.
                gr.Markdown("<p style='color: #888; margin-top:10px;'>Enter a script with speaker tags (e.g., `[SpeakerName] Text`) and click 'Load/Refresh' to see per-speaker settings.</p>")
                return

            for speaker_idx, speaker_name in enumerate(unique_speakers):
                # Ensure current_speaker_configs is a valid dictionary
                if not isinstance(current_speaker_configs, dict):
                    current_speaker_configs = {} 
                
                # Previously saved settings for this speaker (empty on first render).
                speaker_specific_config = current_speaker_configs.get(speaker_name, {})
                accordion_elem_id = f"accordion_spk_{speaker_idx}_{speaker_name.replace(' ','_').lower()}"

                with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False, elem_id=accordion_elem_id):
                    gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")

                    # Voice selector — present for every model.
                    default_voice = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
                    voice_dd_elem_id = f"voice_dd_spk_{speaker_idx}"
                    voice_dropdown = gr.Dropdown(
                        APP_AVAILABLE_VOICES, 
                        value=default_voice, 
                        label="Voice",
                        elem_id=voice_dd_elem_id
                    )
                    # partial() bakes the speaker name and config key into the
                    # shared handler so each dynamically created control
                    # updates its own slot in speaker_configs_state.
                    voice_dropdown.change(
                        fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="voice"),
                        inputs=[voice_dropdown, speaker_configs_state], # Pass current state to handler
                        outputs=[speaker_configs_state]
                    )

                    if current_tts_model in ["tts-1", "tts-1-hd"]:
                        # Classic TTS models: expose a playback-speed slider.
                        default_speed = float(speaker_specific_config.get("speed", 1.0))
                        speed_slider_elem_id = f"speed_slider_spk_{speaker_idx}"
                        speed_slider = gr.Slider(
                            minimum=0.25, maximum=4.0, value=default_speed, step=0.05, 
                            label="Speed", elem_id=speed_slider_elem_id
                        )
                        speed_slider.change(
                            fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="speed"),
                            inputs=[speed_slider, speaker_configs_state],
                            outputs=[speaker_configs_state]
                        )
                    elif current_tts_model == "gpt-4o-mini-tts":
                        # Instruction-capable model: vibe preset plus a
                        # free-form textbox shown only when "Custom..." is picked.
                        default_vibe = speaker_specific_config.get("vibe", DEFAULT_VIBE)
                        vibe_dd_elem_id = f"vibe_dd_spk_{speaker_idx}"
                        vibe_dropdown = gr.Dropdown(
                            VIBE_CHOICES, value=default_vibe, label="Vibe/Emotion", elem_id=vibe_dd_elem_id
                        )
                        
                        default_custom_instructions = speaker_specific_config.get("custom_instructions", "")
                        custom_instr_tb_elem_id = f"custom_instr_tb_spk_{speaker_idx}"
                        custom_instructions_textbox = gr.Textbox(
                            label="Custom Instructions",
                            value=default_custom_instructions,
                            placeholder="e.g., Speak with a slightly hesitant tone.",
                            lines=2,
                            visible=(default_vibe == "Custom..."), # Initial visibility
                            elem_id=custom_instr_tb_elem_id
                        )

                        # First persist the vibe, then toggle the custom
                        # textbox's visibility based on the new value.
                        vibe_dropdown.change(
                            fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="vibe"),
                            inputs=[vibe_dropdown, speaker_configs_state],
                            outputs=[speaker_configs_state]
                        ).then( # Chain another update for visibility
                            fn=lambda vibe_val: gr.update(visible=(vibe_val == "Custom...")),
                            inputs=[vibe_dropdown],
                            outputs=[custom_instructions_textbox]
                        )
                        
                        custom_instructions_textbox.change(
                            fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="custom_instructions"),
                            inputs=[custom_instructions_textbox, speaker_configs_state],
                            outputs=[speaker_configs_state]
                        )

    # --- Event Listeners for other UI elements ---
    # Model changes update the global speed/instructions controls and the
    # per-speaker state via handle_tts_model_change (see event_handlers).
    tts_model_dropdown.change(
        fn=handle_tts_model_change,
        inputs=[tts_model_dropdown, speaker_configs_state],
        outputs=[global_speed_input, global_instructions_input, speaker_configs_state]
    )

    # Toggles visibility between the single-voice group and the detailed
    # per-speaker container.
    speaker_config_method_dropdown.change(
        fn=handle_speaker_config_method_visibility_change,
        inputs=[speaker_config_method_dropdown],
        outputs=[single_voice_group, detailed_per_speaker_ui_group_container]
    )
    
    # Updates speaker_configs_state; the state change in turn feeds the
    # @gr.render trigger declared above.
    load_per_speaker_ui_button.click(
        fn=handle_load_refresh_per_speaker_ui_trigger,
        inputs=[script_input, speaker_configs_state, tts_model_dropdown], 
        outputs=[speaker_configs_state] 
    )
    
    calculate_cost_button.click(
        fn=handle_calculate_cost,
        inputs=[script_input, tts_model_dropdown],
        outputs=[cost_output]
    )
    
    # Bind secrets/client into the handler up front so the Gradio inputs
    # list carries only UI components.
    generate_button_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
    generate_button.click(
        fn=generate_button_fn,
        inputs=[
            script_input, tts_model_dropdown, pause_input,
            speaker_config_method_dropdown, global_voice_dropdown,
            speaker_configs_state, 
            global_speed_input, global_instructions_input
        ],
        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
    )

    # Examples reuse the same inputs/outputs/handler as the Generate button;
    # example processing is disabled when no API key is available.
    example_inputs_list = [
        script_input, tts_model_dropdown, pause_input,
        speaker_config_method_dropdown, global_voice_dropdown,
        speaker_configs_state, 
        global_speed_input, global_instructions_input
    ]
    example_outputs_list = [individual_lines_zip_output, merged_dialogue_mp3_output, status_output]
    example_process_fn = partial(handle_script_processing, OPENAI_API_KEY, async_openai_client, NSFW_API_URL_TEMPLATE)
    
    _ = create_examples_ui(
        inputs_for_examples=example_inputs_list,
        process_fn=example_process_fn if OPENAI_API_KEY else None,
        outputs_for_examples=example_outputs_list if OPENAI_API_KEY else None
    )

if __name__ == "__main__":
    # On Windows, switch to the selector event loop policy — presumably for
    # compatibility with the async client libraries used here (TODO confirm).
    if os.name == "nt":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # Enable the request queue, then launch locally (no public share link).
    demo.queue().launch(debug=True, share=False)