Spaces:
Running
Running
hope
Browse files- app.py +86 -65
- event_handlers.py +95 -133
- ui_layout.py +21 -20
app.py
CHANGED
@@ -4,12 +4,13 @@ import os
|
|
4 |
import asyncio
|
5 |
from openai import AsyncOpenAI
|
6 |
from functools import partial
|
7 |
-
import datetime
|
8 |
|
9 |
from ui_layout import (
|
10 |
create_main_input_components, create_speaker_config_components,
|
11 |
create_action_and_output_components, create_examples_ui,
|
12 |
-
TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES,
|
|
|
13 |
)
|
14 |
|
15 |
from event_handlers import (
|
@@ -17,9 +18,8 @@ from event_handlers import (
|
|
17 |
handle_speaker_config_method_visibility_change,
|
18 |
handle_tts_model_change,
|
19 |
handle_load_refresh_per_speaker_ui_trigger,
|
20 |
-
#
|
21 |
-
|
22 |
-
get_speakers_from_script # Utility function
|
23 |
)
|
24 |
|
25 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
@@ -54,78 +54,104 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
|
|
54 |
if not OPENAI_API_KEY or not async_openai_client:
|
55 |
gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
|
56 |
|
57 |
-
# This state holds the detailed configurations for each speaker.
|
58 |
-
# It's an input to @gr.render and an output of event handlers that modify it.
|
59 |
speaker_configs_state = gr.State({})
|
60 |
|
61 |
(script_input, tts_model_dropdown, pause_input,
|
62 |
global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
|
63 |
|
64 |
(speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
|
65 |
-
detailed_per_speaker_ui_group_container,
|
66 |
load_per_speaker_ui_button) = create_speaker_config_components()
|
67 |
|
68 |
(calculate_cost_button, generate_button, cost_output,
|
69 |
individual_lines_zip_output, merged_dialogue_mp3_output,
|
70 |
status_output) = create_action_and_output_components()
|
71 |
|
72 |
-
|
73 |
-
# This block is defined within the `detailed_per_speaker_ui_group_container` (a gr.Column)
|
74 |
-
with detailed_per_speaker_ui_group_container:
|
75 |
-
# The @gr.render decorator ensures this function re-runs and its UI
|
76 |
-
# is re-drawn when its `inputs` change or its `triggers` are activated.
|
77 |
@gr.render(
|
78 |
inputs=[script_input, speaker_configs_state, tts_model_dropdown],
|
79 |
triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
|
80 |
)
|
81 |
def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
|
82 |
-
"""
|
83 |
-
|
84 |
-
This function's content is re-rendered by Gradio when inputs/triggers change.
|
85 |
-
"""
|
86 |
-
print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State: {str(current_speaker_configs)[:30]}...")
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
#
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
# --- Event Listeners for other UI elements ---
|
131 |
tts_model_dropdown.change(
|
@@ -144,9 +170,6 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
|
|
144 |
fn=handle_load_refresh_per_speaker_ui_trigger,
|
145 |
inputs=[script_input, speaker_configs_state, tts_model_dropdown],
|
146 |
outputs=[speaker_configs_state]
|
147 |
-
# The button click itself is also a trigger for @gr.render,
|
148 |
-
# and this handler updates speaker_configs_state, which is an input to @gr.render,
|
149 |
-
# thus ensuring @gr.render re-runs with potentially new state.
|
150 |
)
|
151 |
|
152 |
calculate_cost_button.click(
|
@@ -183,8 +206,6 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
|
|
183 |
)
|
184 |
|
185 |
if __name__ == "__main__":
|
186 |
-
if os.name == 'nt':
|
187 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
188 |
-
# Consider adding `prevent_thread_lock=True` if running in a script and want it to exit
|
189 |
-
# or if issues with thread locking occur.
|
190 |
demo.queue().launch(debug=True, share=False)
|
|
|
4 |
import asyncio
|
5 |
from openai import AsyncOpenAI
|
6 |
from functools import partial
|
7 |
+
import datetime
|
8 |
|
9 |
from ui_layout import (
|
10 |
create_main_input_components, create_speaker_config_components,
|
11 |
create_action_and_output_components, create_examples_ui,
|
12 |
+
TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES,
|
13 |
+
DEFAULT_GLOBAL_VOICE, VIBE_CHOICES, DEFAULT_VIBE, PREDEFINED_VIBES
|
14 |
)
|
15 |
|
16 |
from event_handlers import (
|
|
|
18 |
handle_speaker_config_method_visibility_change,
|
19 |
handle_tts_model_change,
|
20 |
handle_load_refresh_per_speaker_ui_trigger,
|
21 |
+
handle_dynamic_accordion_input_change, # For controls inside @gr.render
|
22 |
+
get_speakers_from_script
|
|
|
23 |
)
|
24 |
|
25 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
|
54 |
if not OPENAI_API_KEY or not async_openai_client:
|
55 |
gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
|
56 |
|
|
|
|
|
57 |
speaker_configs_state = gr.State({})
|
58 |
|
59 |
(script_input, tts_model_dropdown, pause_input,
|
60 |
global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
|
61 |
|
62 |
(speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
|
63 |
+
detailed_per_speaker_ui_group_container,
|
64 |
load_per_speaker_ui_button) = create_speaker_config_components()
|
65 |
|
66 |
(calculate_cost_button, generate_button, cost_output,
|
67 |
individual_lines_zip_output, merged_dialogue_mp3_output,
|
68 |
status_output) = create_action_and_output_components()
|
69 |
|
70 |
+
with detailed_per_speaker_ui_group_container: # Define @gr.render inside this column
|
|
|
|
|
|
|
|
|
71 |
@gr.render(
|
72 |
inputs=[script_input, speaker_configs_state, tts_model_dropdown],
|
73 |
triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
|
74 |
)
|
75 |
def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
|
76 |
+
"""Defines and re-renders the dynamic UI for per-speaker configuration."""
|
77 |
+
print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs,dict) else 'Not a dict'}")
|
|
|
|
|
|
|
78 |
|
79 |
+
unique_speakers = get_speakers_from_script(current_script_text)
|
80 |
+
|
81 |
+
if not unique_speakers:
|
82 |
+
gr.Markdown("<p style='color: #888; margin-top:10px;'>Enter a script with speaker tags (e.g., `[SpeakerName] Text`) and click 'Load/Refresh' to see per-speaker settings.</p>")
|
83 |
+
return
|
84 |
+
|
85 |
+
for speaker_idx, speaker_name in enumerate(unique_speakers):
|
86 |
+
# Ensure current_speaker_configs is a valid dictionary
|
87 |
+
if not isinstance(current_speaker_configs, dict):
|
88 |
+
current_speaker_configs = {}
|
89 |
+
|
90 |
+
speaker_specific_config = current_speaker_configs.get(speaker_name, {})
|
91 |
+
accordion_elem_id = f"accordion_spk_{speaker_idx}_{speaker_name.replace(' ','_').lower()}"
|
92 |
+
|
93 |
+
with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False, elem_id=accordion_elem_id):
|
94 |
+
gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")
|
95 |
+
|
96 |
+
default_voice = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
|
97 |
+
voice_dd_elem_id = f"voice_dd_spk_{speaker_idx}"
|
98 |
+
voice_dropdown = gr.Dropdown(
|
99 |
+
APP_AVAILABLE_VOICES,
|
100 |
+
value=default_voice,
|
101 |
+
label="Voice",
|
102 |
+
elem_id=voice_dd_elem_id
|
103 |
+
)
|
104 |
+
voice_dropdown.change(
|
105 |
+
fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="voice"),
|
106 |
+
inputs=[voice_dropdown, speaker_configs_state], # Pass current state to handler
|
107 |
+
outputs=[speaker_configs_state]
|
108 |
+
)
|
109 |
+
|
110 |
+
if current_tts_model in ["tts-1", "tts-1-hd"]:
|
111 |
+
default_speed = float(speaker_specific_config.get("speed", 1.0))
|
112 |
+
speed_slider_elem_id = f"speed_slider_spk_{speaker_idx}"
|
113 |
+
speed_slider = gr.Slider(
|
114 |
+
minimum=0.25, maximum=4.0, value=default_speed, step=0.05,
|
115 |
+
label="Speed", elem_id=speed_slider_elem_id
|
116 |
+
)
|
117 |
+
speed_slider.change(
|
118 |
+
fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="speed"),
|
119 |
+
inputs=[speed_slider, speaker_configs_state],
|
120 |
+
outputs=[speaker_configs_state]
|
121 |
+
)
|
122 |
+
elif current_tts_model == "gpt-4o-mini-tts":
|
123 |
+
default_vibe = speaker_specific_config.get("vibe", DEFAULT_VIBE)
|
124 |
+
vibe_dd_elem_id = f"vibe_dd_spk_{speaker_idx}"
|
125 |
+
vibe_dropdown = gr.Dropdown(
|
126 |
+
VIBE_CHOICES, value=default_vibe, label="Vibe/Emotion", elem_id=vibe_dd_elem_id
|
127 |
+
)
|
128 |
+
|
129 |
+
default_custom_instructions = speaker_specific_config.get("custom_instructions", "")
|
130 |
+
custom_instr_tb_elem_id = f"custom_instr_tb_spk_{speaker_idx}"
|
131 |
+
custom_instructions_textbox = gr.Textbox(
|
132 |
+
label="Custom Instructions",
|
133 |
+
value=default_custom_instructions,
|
134 |
+
placeholder="e.g., Speak with a slightly hesitant tone.",
|
135 |
+
lines=2,
|
136 |
+
visible=(default_vibe == "Custom..."), # Initial visibility
|
137 |
+
elem_id=custom_instr_tb_elem_id
|
138 |
+
)
|
139 |
+
|
140 |
+
vibe_dropdown.change(
|
141 |
+
fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="vibe"),
|
142 |
+
inputs=[vibe_dropdown, speaker_configs_state],
|
143 |
+
outputs=[speaker_configs_state]
|
144 |
+
).then( # Chain another update for visibility
|
145 |
+
fn=lambda vibe_val: gr.update(visible=(vibe_val == "Custom...")),
|
146 |
+
inputs=[vibe_dropdown],
|
147 |
+
outputs=[custom_instructions_textbox]
|
148 |
+
)
|
149 |
+
|
150 |
+
custom_instructions_textbox.change(
|
151 |
+
fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="custom_instructions"),
|
152 |
+
inputs=[custom_instructions_textbox, speaker_configs_state],
|
153 |
+
outputs=[speaker_configs_state]
|
154 |
+
)
|
155 |
|
156 |
# --- Event Listeners for other UI elements ---
|
157 |
tts_model_dropdown.change(
|
|
|
170 |
fn=handle_load_refresh_per_speaker_ui_trigger,
|
171 |
inputs=[script_input, speaker_configs_state, tts_model_dropdown],
|
172 |
outputs=[speaker_configs_state]
|
|
|
|
|
|
|
173 |
)
|
174 |
|
175 |
calculate_cost_button.click(
|
|
|
206 |
)
|
207 |
|
208 |
if __name__ == "__main__":
|
209 |
+
if os.name == 'nt':
|
210 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
|
|
|
|
211 |
demo.queue().launch(debug=True, share=False)
|
event_handlers.py
CHANGED
@@ -9,7 +9,6 @@ import random
|
|
9 |
from functools import partial
|
10 |
import datetime
|
11 |
|
12 |
-
# Assuming utils.script_parser and utils.openai_tts, utils.merge_audio are correctly defined
|
13 |
from utils.script_parser import parse_dialogue_script, calculate_cost
|
14 |
from utils.openai_tts import synthesize_speech_line
|
15 |
from utils.merge_audio import merge_mp3_files
|
@@ -21,45 +20,48 @@ def get_speakers_from_script(script_text: str) -> list:
|
|
21 |
if not script_text or not script_text.strip():
|
22 |
return []
|
23 |
try:
|
24 |
-
parsed_lines, _ = parse_dialogue_script(script_text)
|
25 |
if not parsed_lines:
|
26 |
return []
|
27 |
seen_speakers = set()
|
28 |
ordered_unique_speakers = []
|
29 |
for line_data in parsed_lines:
|
30 |
speaker = line_data.get("speaker")
|
31 |
-
if speaker and speaker not in seen_speakers:
|
32 |
ordered_unique_speakers.append(speaker)
|
33 |
seen_speakers.add(speaker)
|
34 |
return ordered_unique_speakers
|
35 |
-
except ValueError:
|
|
|
36 |
return []
|
|
|
|
|
|
|
|
|
37 |
|
38 |
def handle_dynamic_accordion_input_change(
|
39 |
-
new_value,
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
# target_state_component is implicitly speaker_configs_state when this function is an output to it
|
44 |
):
|
45 |
"""
|
46 |
Updates the speaker_configs_state when a dynamic UI element within an Accordion changes.
|
47 |
-
This function is INTENDED to be used as the 'fn' for .change() listeners of
|
48 |
-
components created inside the @gr.render block.
|
49 |
"""
|
50 |
if not isinstance(current_speaker_configs, dict):
|
51 |
-
print(f"Warning: current_speaker_configs was not a dict
|
52 |
-
current_speaker_configs = {}
|
53 |
|
54 |
-
updated_configs = current_speaker_configs.copy()
|
55 |
|
56 |
if speaker_name not in updated_configs:
|
57 |
updated_configs[speaker_name] = {}
|
58 |
|
59 |
updated_configs[speaker_name][config_key] = new_value
|
60 |
-
updated_configs["_last_config_change"] = f"{speaker_name}_{config_key}_{datetime.datetime.now().isoformat()}" # To ensure state change
|
61 |
|
62 |
-
|
|
|
|
|
63 |
return updated_configs
|
64 |
|
65 |
|
@@ -67,7 +69,7 @@ async def handle_script_processing(
|
|
67 |
openai_api_key: str, async_openai_client, nsfw_api_url_template: str,
|
68 |
dialogue_script: str, tts_model: str, pause_ms: int,
|
69 |
speaker_config_method: str, global_voice_selection: str,
|
70 |
-
speaker_configs_state_dict: dict,
|
71 |
global_speed: float,
|
72 |
global_instructions: str,
|
73 |
progress=gr.Progress(track_tqdm=True)
|
@@ -84,140 +86,120 @@ async def handle_script_processing(
|
|
84 |
try:
|
85 |
parsed_lines, _ = parse_dialogue_script(dialogue_script)
|
86 |
if not parsed_lines:
|
87 |
-
shutil.rmtree(job_audio_path_prefix)
|
88 |
-
return None, None, "Error: No valid lines found in script."
|
89 |
except ValueError as e:
|
90 |
-
shutil.rmtree(job_audio_path_prefix)
|
91 |
-
return None, None, f"Script parsing error: {str(e)}"
|
92 |
|
93 |
if not isinstance(speaker_configs_state_dict, dict):
|
94 |
print(f"Warning: speaker_configs_state_dict was not a dict in handle_script_processing. Re-initializing. Type: {type(speaker_configs_state_dict)}")
|
95 |
speaker_configs_state_dict = {}
|
96 |
|
97 |
-
safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
|
98 |
|
99 |
-
speaker_voice_map = {}
|
100 |
if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
|
101 |
unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
|
102 |
temp_voices_pool = APP_AVAILABLE_VOICES.copy()
|
103 |
-
if not temp_voices_pool: temp_voices_pool = [
|
104 |
|
105 |
if speaker_config_method == "Random per Speaker":
|
106 |
for spk_name in unique_script_speakers_for_map:
|
107 |
speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
|
108 |
-
elif speaker_config_method == "A/B Round Robin" and temp_voices_pool:
|
109 |
for i, spk_name in enumerate(unique_script_speakers_for_map):
|
110 |
speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
|
111 |
|
112 |
-
tasks = []
|
113 |
-
# line_audio_files_map = {} # To store results indexed by original line ID
|
114 |
-
|
115 |
-
# Create a list to store (original_line_id, future_task)
|
116 |
task_info_list = []
|
117 |
-
|
118 |
for i, line_data in enumerate(parsed_lines):
|
119 |
speaker_name = line_data["speaker"]
|
|
|
|
|
|
|
120 |
line_voice = safe_default_global_voice
|
121 |
line_speed = global_speed
|
122 |
line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
|
123 |
|
124 |
if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
|
125 |
spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
|
126 |
-
line_voice = spk_cfg.get("voice", safe_default_global_voice)
|
|
|
127 |
if tts_model in ["tts-1", "tts-1-hd"]:
|
128 |
line_speed = float(spk_cfg.get("speed", global_speed))
|
129 |
-
else:
|
130 |
-
line_speed = 1.0
|
131 |
|
132 |
if tts_model == "gpt-4o-mini-tts":
|
133 |
vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
|
134 |
custom_instr_raw = spk_cfg.get("custom_instructions", "")
|
135 |
custom_instr = custom_instr_raw.strip() if custom_instr_raw else ""
|
136 |
-
|
|
|
137 |
if vibe == "Custom..." and custom_instr:
|
138 |
-
|
139 |
elif vibe != "None" and vibe != "Custom..." and PREDEFINED_VIBES.get(vibe):
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
|
149 |
elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
|
150 |
line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
|
151 |
-
|
152 |
-
|
153 |
-
# For Random/AB, use global instructions if any
|
154 |
-
line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
|
155 |
|
156 |
-
|
157 |
-
if tts_model not in ["tts-1", "tts-1-hd"]:
|
158 |
-
line_speed = 1.0
|
159 |
|
160 |
-
|
161 |
-
out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}_{speaker_name.replace(' ','_')}.mp3")
|
162 |
progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
|
163 |
|
164 |
current_task = synthesize_speech_line(
|
165 |
-
client=async_openai_client, text=
|
166 |
output_path=out_fn, model=tts_model, speed=line_speed,
|
167 |
instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
|
168 |
-
line_index=
|
169 |
)
|
170 |
-
task_info_list.append({"id":
|
171 |
-
|
172 |
-
# Gather results
|
173 |
-
# await asyncio.gather(*(info['task'] for info in task_info_list)) # This runs them, but doesn't handle exceptions well for individual ones
|
174 |
|
175 |
-
|
176 |
for info in task_info_list:
|
177 |
try:
|
178 |
result_path = await info['task']
|
179 |
-
|
180 |
except Exception as e:
|
181 |
print(f"Error synthesizing line ID {info['id']} ({info['speaker']}): {e}")
|
182 |
-
|
183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
-
# Ensure files are processed in the original script order for merging
|
186 |
-
ordered_files_for_merge_and_zip = [None] * len(parsed_lines)
|
187 |
-
for res in processed_results:
|
188 |
-
# Find original index of this line_id to maintain order
|
189 |
-
original_line_index = -1
|
190 |
-
for idx, p_line in enumerate(parsed_lines):
|
191 |
-
if p_line['id'] == res['id']:
|
192 |
-
original_line_index = idx
|
193 |
-
break
|
194 |
-
|
195 |
-
if original_line_index != -1:
|
196 |
-
if res.get("path") and os.path.exists(res["path"]) and os.path.getsize(res["path"]) > 0:
|
197 |
-
ordered_files_for_merge_and_zip[original_line_index] = res["path"]
|
198 |
-
else:
|
199 |
-
print(f"Skipped or failed synthesizing line ID {res['id']} ({res['speaker']})")
|
200 |
-
# ordered_files_for_merge_and_zip remains None at this index
|
201 |
|
202 |
valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
|
203 |
|
204 |
if not valid_files_for_zip:
|
205 |
-
shutil.rmtree(job_audio_path_prefix)
|
206 |
-
return None, None, "Error: No audio was successfully synthesized for any line."
|
207 |
|
208 |
zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
209 |
with zipfile.ZipFile(zip_fn, 'w') as zf:
|
210 |
for f_path in valid_files_for_zip:
|
211 |
zf.write(f_path, os.path.basename(f_path))
|
212 |
|
213 |
-
|
214 |
-
files_to_actually_merge = valid_files_for_zip
|
215 |
merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
216 |
merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
|
217 |
|
218 |
status_msg = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
|
219 |
if len(valid_files_for_zip) < len(parsed_lines): status_msg += "Some lines may have failed. "
|
220 |
-
|
221 |
if not merged_path and len(valid_files_for_zip) > 0 : status_msg += "Merging audio failed. "
|
222 |
elif not merged_path: status_msg = "No audio to merge (all lines failed or were skipped)."
|
223 |
else: status_msg += "Merged audio generated."
|
@@ -226,74 +208,55 @@ async def handle_script_processing(
|
|
226 |
merged_path if merged_path and os.path.exists(merged_path) else None,
|
227 |
status_msg)
|
228 |
|
229 |
-
|
230 |
def handle_calculate_cost(dialogue_script: str, tts_model: str):
|
231 |
-
"""Calculates the estimated cost for TTS processing."""
|
232 |
if not dialogue_script or not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
|
233 |
try:
|
234 |
parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
|
235 |
if not parsed_lines: return "Cost: $0.00 (No valid lines in script)"
|
236 |
cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
|
237 |
return f"Estimated Cost for {len(parsed_lines)} lines ({total_chars} chars): ${cost:.6f}"
|
238 |
-
except ValueError as e:
|
239 |
-
|
240 |
-
except Exception as e: # Generic fallback for unexpected errors
|
241 |
-
return f"An unexpected error occurred during cost calculation: {str(e)}"
|
242 |
-
|
243 |
|
244 |
def handle_load_refresh_per_speaker_ui_trigger(script_text: str, current_speaker_configs: dict, tts_model: str):
|
245 |
-
"""
|
246 |
-
Triggered by the 'Load/Refresh' button. Updates speaker_configs_state.
|
247 |
-
The @gr.render function reacts to this state change (and the button click trigger).
|
248 |
-
"""
|
249 |
print(f"DEBUG (Load/Refresh Trigger): Script: '{script_text[:30]}...', Model: {tts_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
|
|
|
|
|
250 |
|
251 |
-
#
|
252 |
-
if not isinstance(current_speaker_configs, dict):
|
253 |
-
current_speaker_configs = {}
|
254 |
-
|
255 |
-
updated_configs = current_speaker_configs.copy() # Work on a copy to ensure change detection
|
256 |
-
|
257 |
-
# Example: Initialize new speakers found in the script if they aren't in the state
|
258 |
# unique_speakers_in_script = get_speakers_from_script(script_text)
|
259 |
# for speaker in unique_speakers_in_script:
|
260 |
# if speaker not in updated_configs:
|
261 |
-
# updated_configs[speaker] = {"voice": DEFAULT_GLOBAL_VOICE} #
|
262 |
-
|
263 |
-
#
|
264 |
-
|
265 |
-
updated_configs["
|
266 |
-
updated_configs["
|
267 |
-
|
268 |
return updated_configs
|
269 |
|
270 |
-
|
271 |
def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
|
272 |
-
"""
|
273 |
-
Handles changes to the TTS model dropdown. Updates global controls and speaker_configs_state.
|
274 |
-
"""
|
275 |
print(f"DEBUG (TTS Model Change): Model: {selected_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
updated_configs["
|
296 |
-
updated_configs["_last_refresh_timestamp"] = datetime.datetime.now().isoformat()
|
297 |
|
298 |
is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
|
299 |
is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
|
@@ -305,7 +268,6 @@ def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
|
|
305 |
)
|
306 |
|
307 |
def handle_speaker_config_method_visibility_change(method: str):
|
308 |
-
"""Handles changes to the speaker configuration method dropdown."""
|
309 |
print(f"DEBUG (Config Method Change): Method: {method}")
|
310 |
is_single_voice_visible = (method == "Single Voice (Global)")
|
311 |
is_detailed_per_speaker_container_visible = (method == "Detailed Configuration (Per Speaker UI)")
|
|
|
9 |
from functools import partial
|
10 |
import datetime
|
11 |
|
|
|
12 |
from utils.script_parser import parse_dialogue_script, calculate_cost
|
13 |
from utils.openai_tts import synthesize_speech_line
|
14 |
from utils.merge_audio import merge_mp3_files
|
|
|
20 |
if not script_text or not script_text.strip():
|
21 |
return []
|
22 |
try:
|
23 |
+
parsed_lines, _ = parse_dialogue_script(script_text) # Assuming this returns (list_of_dicts, total_chars)
|
24 |
if not parsed_lines:
|
25 |
return []
|
26 |
seen_speakers = set()
|
27 |
ordered_unique_speakers = []
|
28 |
for line_data in parsed_lines:
|
29 |
speaker = line_data.get("speaker")
|
30 |
+
if speaker and speaker not in seen_speakers: # Ensure speaker is not None or empty
|
31 |
ordered_unique_speakers.append(speaker)
|
32 |
seen_speakers.add(speaker)
|
33 |
return ordered_unique_speakers
|
34 |
+
except ValueError:
|
35 |
+
print("ValueError during script parsing in get_speakers_from_script.")
|
36 |
return []
|
37 |
+
except Exception as e:
|
38 |
+
print(f"Unexpected error in get_speakers_from_script: {e}")
|
39 |
+
return []
|
40 |
+
|
41 |
|
42 |
def handle_dynamic_accordion_input_change(
|
43 |
+
new_value, # Value from the changed component (e.g., voice_dropdown)
|
44 |
+
current_speaker_configs: dict, # Current value of speaker_configs_state
|
45 |
+
speaker_name: str, # Passed via partial from the event listener
|
46 |
+
config_key: str # Passed via partial from the event listener
|
|
|
47 |
):
|
48 |
"""
|
49 |
Updates the speaker_configs_state when a dynamic UI element within an Accordion changes.
|
|
|
|
|
50 |
"""
|
51 |
if not isinstance(current_speaker_configs, dict):
|
52 |
+
print(f"Warning: current_speaker_configs was not a dict in handle_dynamic_accordion_input_change. Type: {type(current_speaker_configs)}. Re-initializing.")
|
53 |
+
current_speaker_configs = {}
|
54 |
|
55 |
+
updated_configs = current_speaker_configs.copy()
|
56 |
|
57 |
if speaker_name not in updated_configs:
|
58 |
updated_configs[speaker_name] = {}
|
59 |
|
60 |
updated_configs[speaker_name][config_key] = new_value
|
|
|
61 |
|
62 |
+
updated_configs["_last_dynamic_update_details"] = f"Speaker: {speaker_name}, Key: {config_key}, Val: {str(new_value)[:20]}, TS: {datetime.datetime.now().isoformat()}"
|
63 |
+
|
64 |
+
print(f"DEBUG (dynamic_input_change): Speaker '{speaker_name}' config '{config_key}' to '{str(new_value)[:50]}'. New state hint: {updated_configs.get('_last_dynamic_update_details')}")
|
65 |
return updated_configs
|
66 |
|
67 |
|
|
|
69 |
openai_api_key: str, async_openai_client, nsfw_api_url_template: str,
|
70 |
dialogue_script: str, tts_model: str, pause_ms: int,
|
71 |
speaker_config_method: str, global_voice_selection: str,
|
72 |
+
speaker_configs_state_dict: dict,
|
73 |
global_speed: float,
|
74 |
global_instructions: str,
|
75 |
progress=gr.Progress(track_tqdm=True)
|
|
|
86 |
try:
|
87 |
parsed_lines, _ = parse_dialogue_script(dialogue_script)
|
88 |
if not parsed_lines:
|
89 |
+
shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No valid lines found in script."
|
|
|
90 |
except ValueError as e:
|
91 |
+
shutil.rmtree(job_audio_path_prefix); return None, None, f"Script parsing error: {str(e)}"
|
|
|
92 |
|
93 |
if not isinstance(speaker_configs_state_dict, dict):
|
94 |
print(f"Warning: speaker_configs_state_dict was not a dict in handle_script_processing. Re-initializing. Type: {type(speaker_configs_state_dict)}")
|
95 |
speaker_configs_state_dict = {}
|
96 |
|
97 |
+
safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else (APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy")
|
98 |
|
99 |
+
speaker_voice_map = {}
|
100 |
if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
|
101 |
unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
|
102 |
temp_voices_pool = APP_AVAILABLE_VOICES.copy()
|
103 |
+
if not temp_voices_pool: temp_voices_pool = [safe_default_global_voice]
|
104 |
|
105 |
if speaker_config_method == "Random per Speaker":
|
106 |
for spk_name in unique_script_speakers_for_map:
|
107 |
speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
|
108 |
+
elif speaker_config_method == "A/B Round Robin" and temp_voices_pool:
|
109 |
for i, spk_name in enumerate(unique_script_speakers_for_map):
|
110 |
speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
|
111 |
|
|
|
|
|
|
|
|
|
112 |
task_info_list = []
|
|
|
113 |
for i, line_data in enumerate(parsed_lines):
|
114 |
speaker_name = line_data["speaker"]
|
115 |
+
line_text = line_data["text"]
|
116 |
+
line_id = line_data["id"]
|
117 |
+
|
118 |
line_voice = safe_default_global_voice
|
119 |
line_speed = global_speed
|
120 |
line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
|
121 |
|
122 |
if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
|
123 |
spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
|
124 |
+
line_voice = spk_cfg.get("voice", safe_default_global_voice)
|
125 |
+
|
126 |
if tts_model in ["tts-1", "tts-1-hd"]:
|
127 |
line_speed = float(spk_cfg.get("speed", global_speed))
|
128 |
+
else: line_speed = 1.0
|
|
|
129 |
|
130 |
if tts_model == "gpt-4o-mini-tts":
|
131 |
vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
|
132 |
custom_instr_raw = spk_cfg.get("custom_instructions", "")
|
133 |
custom_instr = custom_instr_raw.strip() if custom_instr_raw else ""
|
134 |
+
|
135 |
+
current_line_specific_instructions = None
|
136 |
if vibe == "Custom..." and custom_instr:
|
137 |
+
current_line_specific_instructions = custom_instr
|
138 |
elif vibe != "None" and vibe != "Custom..." and PREDEFINED_VIBES.get(vibe):
|
139 |
+
current_line_specific_instructions = PREDEFINED_VIBES[vibe]
|
140 |
+
|
141 |
+
# If per-speaker instructions are set, they take precedence. Otherwise, fall back to global instructions.
|
142 |
+
line_instructions = current_line_specific_instructions if current_line_specific_instructions is not None else line_instructions
|
143 |
+
else: # tts-1, tts-1-hd do not use vibe/custom_instructions from per-speaker UI
|
144 |
+
# They will use the global_instructions if set.
|
145 |
+
pass # line_instructions already set to global_instructions or None
|
|
|
146 |
|
147 |
elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
|
148 |
line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
|
149 |
+
# For these methods, speed and instructions remain global
|
150 |
+
if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
|
|
|
|
|
151 |
|
152 |
+
if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
|
|
|
|
|
153 |
|
154 |
+
out_fn = os.path.join(job_audio_path_prefix, f"line_{line_id}_{speaker_name.replace(' ','_')}.mp3")
|
|
|
155 |
progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
|
156 |
|
157 |
current_task = synthesize_speech_line(
|
158 |
+
client=async_openai_client, text=line_text, voice=line_voice,
|
159 |
output_path=out_fn, model=tts_model, speed=line_speed,
|
160 |
instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
|
161 |
+
line_index=line_id
|
162 |
)
|
163 |
+
task_info_list.append({"id": line_id, "speaker": speaker_name, "task": current_task, "out_fn": out_fn})
|
|
|
|
|
|
|
164 |
|
165 |
+
processed_results_map = {} # Store by line_id for easier lookup
|
166 |
for info in task_info_list:
|
167 |
try:
|
168 |
result_path = await info['task']
|
169 |
+
processed_results_map[info['id']] = {"path": result_path, "speaker": info['speaker']}
|
170 |
except Exception as e:
|
171 |
print(f"Error synthesizing line ID {info['id']} ({info['speaker']}): {e}")
|
172 |
+
processed_results_map[info['id']] = {"path": None, "error": e, "speaker": info['speaker']}
|
173 |
|
174 |
+
ordered_files_for_merge_and_zip = []
|
175 |
+
for p_line in parsed_lines: # Iterate through original parsed lines to maintain order
|
176 |
+
line_id = p_line['id']
|
177 |
+
res = processed_results_map.get(line_id)
|
178 |
+
if res and res.get("path") and os.path.exists(res["path"]) and os.path.getsize(res["path"]) > 0:
|
179 |
+
ordered_files_for_merge_and_zip.append(res["path"])
|
180 |
+
else:
|
181 |
+
# File was not successfully created or result not found, append None placeholder
|
182 |
+
ordered_files_for_merge_and_zip.append(None)
|
183 |
+
if res: print(f"Skipped or failed synthesizing line ID {line_id} ({res.get('speaker', 'Unknown')}) for merge/zip.")
|
184 |
+
else: print(f"Result for line ID {line_id} not found in processed_results_map.")
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
|
188 |
|
189 |
if not valid_files_for_zip:
|
190 |
+
shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio was successfully synthesized for any line."
|
|
|
191 |
|
192 |
zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
193 |
with zipfile.ZipFile(zip_fn, 'w') as zf:
|
194 |
for f_path in valid_files_for_zip:
|
195 |
zf.write(f_path, os.path.basename(f_path))
|
196 |
|
197 |
+
files_to_actually_merge = valid_files_for_zip # Already ordered and filtered
|
|
|
198 |
merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
199 |
merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
|
200 |
|
201 |
status_msg = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
|
202 |
if len(valid_files_for_zip) < len(parsed_lines): status_msg += "Some lines may have failed. "
|
|
|
203 |
if not merged_path and len(valid_files_for_zip) > 0 : status_msg += "Merging audio failed. "
|
204 |
elif not merged_path: status_msg = "No audio to merge (all lines failed or were skipped)."
|
205 |
else: status_msg += "Merged audio generated."
|
|
|
208 |
merged_path if merged_path and os.path.exists(merged_path) else None,
|
209 |
status_msg)
|
210 |
|
|
|
211 |
def handle_calculate_cost(dialogue_script: str, tts_model: str):
    """Estimate the TTS cost for the given script and model.

    Returns a human-readable cost string; parse/pricing errors are reported
    in the returned string rather than raised, so the UI never breaks.
    """
    # Empty or whitespace-only script: nothing to price.
    if not (dialogue_script and dialogue_script.strip()):
        return "Cost: $0.00 (Script is empty)"

    try:
        lines, char_count = parse_dialogue_script(dialogue_script)
        if not lines:
            return "Cost: $0.00 (No valid lines in script)"
        line_count = len(lines)
        estimated = calculate_cost(char_count, line_count, tts_model)
        return f"Estimated Cost for {line_count} lines ({char_count} chars): ${estimated:.6f}"
    except ValueError as e:
        # Known pricing/parsing problem (e.g. unsupported model name).
        return f"Cost calculation error: {str(e)}"
    except Exception as e:
        # Catch-all so the Gradio callback always returns a string.
        return f"An unexpected error: {str(e)}"
|
|
|
|
|
|
|
220 |
|
221 |
def handle_load_refresh_per_speaker_ui_trigger(script_text: str, current_speaker_configs: dict, tts_model: str):
    """Signal the per-speaker settings UI to (re)render.

    Returns a copy of the speaker-config state dict stamped with metadata
    keys (`_last_action_source`, `_last_action_timestamp`) so the dependent
    @gr.render block sees a changed state value and refreshes.
    """
    state_keys = list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'
    print(f"DEBUG (Load/Refresh Trigger): Script: '{script_text[:30]}...', Model: {tts_model}, Current State Keys: {state_keys}")

    # Defend against a malformed state value coming from Gradio.
    if not isinstance(current_speaker_configs, dict):
        current_speaker_configs = {}

    # Work on a copy: a new object identity is what triggers the re-render.
    refreshed = dict(current_speaker_configs)

    # Optionally, you might want to pre-populate settings for new speakers here
    # unique_speakers_in_script = get_speakers_from_script(script_text)
    # for speaker in unique_speakers_in_script:
    #     if speaker not in refreshed:
    #         refreshed[speaker] = {"voice": DEFAULT_GLOBAL_VOICE}  # Default init
    #         if tts_model in ["tts-1", "tts-1-hd"]: refreshed[speaker]["speed"] = 1.0
    #         elif tts_model == "gpt-4o-mini-tts": refreshed[speaker]["vibe"] = DEFAULT_VIBE

    refreshed["_last_action_source"] = "load_refresh_button"
    refreshed["_last_action_timestamp"] = datetime.datetime.now().isoformat()
    return refreshed
|
237 |
|
|
|
238 |
def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
|
|
|
|
|
|
|
239 |
print(f"DEBUG (TTS Model Change): Model: {selected_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
|
240 |
+
if not isinstance(current_speaker_configs, dict): current_speaker_configs = {}
|
241 |
+
updated_configs = current_speaker_configs.copy()
|
242 |
+
|
243 |
+
# When model changes, you might want to reset or adjust model-specific settings for all speakers
|
244 |
+
# For example, 'speed' is for tts-1, 'vibe' for gpt-4o-mini-tts
|
245 |
+
for speaker_name_key in list(updated_configs.keys()): # Iterate over keys if modifying dict
|
246 |
+
if isinstance(updated_configs[speaker_name_key], dict): # Check if it's a speaker config dict
|
247 |
+
if selected_model == "gpt-4o-mini-tts":
|
248 |
+
updated_configs[speaker_name_key].pop("speed", None)
|
249 |
+
if "vibe" not in updated_configs[speaker_name_key]:
|
250 |
+
updated_configs[speaker_name_key]["vibe"] = DEFAULT_VIBE
|
251 |
+
elif selected_model in ["tts-1", "tts-1-hd"]:
|
252 |
+
updated_configs[speaker_name_key].pop("vibe", None)
|
253 |
+
updated_configs[speaker_name_key].pop("custom_instructions", None)
|
254 |
+
if "speed" not in updated_configs[speaker_name_key]:
|
255 |
+
updated_configs[speaker_name_key]["speed"] = 1.0
|
256 |
+
# Add other model-specific adjustments if needed
|
257 |
+
|
258 |
+
updated_configs["_last_action_source"] = "tts_model_change"
|
259 |
+
updated_configs["_last_action_timestamp"] = datetime.datetime.now().isoformat()
|
|
|
260 |
|
261 |
is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
|
262 |
is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
|
|
|
268 |
)
|
269 |
|
270 |
def handle_speaker_config_method_visibility_change(method: str):
|
|
|
271 |
print(f"DEBUG (Config Method Change): Method: {method}")
|
272 |
is_single_voice_visible = (method == "Single Voice (Global)")
|
273 |
is_detailed_per_speaker_container_visible = (method == "Detailed Configuration (Per Speaker UI)")
|
ui_layout.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
# FILE: ui_layout.py
|
2 |
import gradio as gr
|
3 |
-
from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES # Assuming this import path is correct
|
4 |
|
5 |
# --- UI Constants and Configuration ---
|
6 |
TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
|
@@ -12,10 +11,17 @@ SPEAKER_CONFIG_METHODS = [
|
|
12 |
"A/B Round Robin",
|
13 |
"Detailed Configuration (Per Speaker UI)"
|
14 |
]
|
15 |
-
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
|
20 |
|
21 |
|
@@ -34,17 +40,17 @@ PREDEFINED_VIBES = {
|
|
34 |
}
|
35 |
VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
|
36 |
DEFAULT_VIBE = "None"
|
37 |
-
DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0]
|
38 |
|
39 |
|
40 |
# --- UI Element Creation Functions ---
|
41 |
|
42 |
def create_main_input_components(model_default_value):
|
43 |
"""Creates the main input components for script, model, pause, and global settings."""
|
44 |
-
with gr.Row():
|
45 |
-
with gr.Column(scale=2):
|
46 |
script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\\n[Speaker2] How are you today?", lines=10, elem_id="script_input_area")
|
47 |
-
with gr.Column(scale=1):
|
48 |
tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value, elem_id="tts_model_dropdown")
|
49 |
pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50, elem_id="pause_input")
|
50 |
|
@@ -71,20 +77,16 @@ def create_speaker_config_components():
|
|
71 |
elem_id="speaker_config_method_dropdown"
|
72 |
)
|
73 |
|
74 |
-
with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
|
75 |
global_voice_dropdown = gr.Dropdown(
|
76 |
APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True,
|
77 |
elem_id="global_voice_dropdown"
|
78 |
)
|
79 |
|
80 |
-
# This Column group is the container where the @gr.render function in app.py
|
81 |
-
# will place its dynamically generated UI.
|
82 |
detailed_per_speaker_ui_group_container = gr.Column(
|
83 |
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"),
|
84 |
elem_id="detailed_per_speaker_ui_group_container"
|
85 |
)
|
86 |
-
# The button and its help text are part of this container, defined statically.
|
87 |
-
# The dynamic content from @gr.render will appear below them within this column.
|
88 |
with detailed_per_speaker_ui_group_container:
|
89 |
load_per_speaker_ui_button = gr.Button(
|
90 |
"Load/Refresh Per-Speaker Settings UI (from Script Above)",
|
@@ -94,24 +96,24 @@ def create_speaker_config_components():
|
|
94 |
"<small>Click button above to populate settings for each speaker found in the script. "
|
95 |
"Settings are applied per-speaker. If script changes, click again to refresh.</small>"
|
96 |
)
|
97 |
-
# The
|
98 |
|
99 |
return (
|
100 |
speaker_config_method_dropdown,
|
101 |
single_voice_group,
|
102 |
global_voice_dropdown,
|
103 |
-
detailed_per_speaker_ui_group_container,
|
104 |
load_per_speaker_ui_button
|
105 |
)
|
106 |
|
107 |
def create_action_and_output_components():
|
108 |
"""Creates buttons for actions (cost, generate) and output display areas."""
|
109 |
-
with gr.Row():
|
110 |
calculate_cost_button = gr.Button("Calculate Estimated Cost", elem_id="calculate_cost_button")
|
111 |
generate_button = gr.Button("Generate Audio", variant="primary", elem_id="generate_button")
|
112 |
|
113 |
cost_output = gr.Textbox(label="Estimated Cost", interactive=False, elem_id="cost_output")
|
114 |
-
with gr.Row():
|
115 |
individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)", elem_id="individual_lines_zip_output")
|
116 |
merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath", elem_id="merged_dialogue_mp3_output")
|
117 |
status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5, elem_id="status_output")
|
@@ -123,7 +125,6 @@ def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=Non
|
|
123 |
example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\\n[Alice] Let's see how this sounds."
|
124 |
example_script_2 = "[Narrator] This is a short story.\\n[CharacterA] Once upon a time...\\n[Narrator] ...there was a Gradio app.\\n[CharacterB] And it could talk!"
|
125 |
|
126 |
-
# Ensure example state matches the number of inputs for `speaker_configs_state` (which is one dictionary)
|
127 |
examples_data = [
|
128 |
[example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
|
129 |
[example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
|
@@ -148,7 +149,7 @@ def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=Non
|
|
148 |
inputs=inputs_for_examples,
|
149 |
outputs=outputs_for_examples,
|
150 |
fn=process_fn,
|
151 |
-
cache_examples=False,
|
152 |
examples_per_page=5,
|
153 |
label="Example Scripts (Click to Load & Run)",
|
154 |
run_on_click=True
|
|
|
1 |
# FILE: ui_layout.py
|
2 |
import gradio as gr
|
|
|
3 |
|
4 |
# --- UI Constants and Configuration ---
|
5 |
TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
|
|
|
11 |
"A/B Round Robin",
|
12 |
"Detailed Configuration (Per Speaker UI)"
|
13 |
]
|
14 |
+
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
|
15 |
+
|
16 |
+
# Assuming utils.openai_tts.OPENAI_VOICES exists and provides a list of voice names
|
17 |
+
# If not, define APP_AVAILABLE_VOICES directly here for now.
|
18 |
+
try:
|
19 |
+
from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES
|
20 |
+
APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
|
21 |
+
if not APP_AVAILABLE_VOICES: # Fallback if OPENAI_VOICES is empty
|
22 |
+
APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
|
23 |
+
except ImportError:
|
24 |
+
print("Warning: Could not import OPENAI_VOICES from utils.openai_tts. Using default list.")
|
25 |
APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
|
26 |
|
27 |
|
|
|
40 |
}
|
41 |
VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
|
42 |
DEFAULT_VIBE = "None"
|
43 |
+
DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"
|
44 |
|
45 |
|
46 |
# --- UI Element Creation Functions ---
|
47 |
|
48 |
def create_main_input_components(model_default_value):
|
49 |
"""Creates the main input components for script, model, pause, and global settings."""
|
50 |
+
with gr.Row(elem_id="main_input_row"):
|
51 |
+
with gr.Column(scale=2, elem_id="script_input_column"):
|
52 |
script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\\n[Speaker2] How are you today?", lines=10, elem_id="script_input_area")
|
53 |
+
with gr.Column(scale=1, elem_id="model_config_column"):
|
54 |
tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value, elem_id="tts_model_dropdown")
|
55 |
pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50, elem_id="pause_input")
|
56 |
|
|
|
77 |
elem_id="speaker_config_method_dropdown"
|
78 |
)
|
79 |
|
80 |
+
with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), elem_id="single_voice_config_group") as single_voice_group:
|
81 |
global_voice_dropdown = gr.Dropdown(
|
82 |
APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True,
|
83 |
elem_id="global_voice_dropdown"
|
84 |
)
|
85 |
|
|
|
|
|
86 |
detailed_per_speaker_ui_group_container = gr.Column(
|
87 |
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"),
|
88 |
elem_id="detailed_per_speaker_ui_group_container"
|
89 |
)
|
|
|
|
|
90 |
with detailed_per_speaker_ui_group_container:
|
91 |
load_per_speaker_ui_button = gr.Button(
|
92 |
"Load/Refresh Per-Speaker Settings UI (from Script Above)",
|
|
|
96 |
"<small>Click button above to populate settings for each speaker found in the script. "
|
97 |
"Settings are applied per-speaker. If script changes, click again to refresh.</small>"
|
98 |
)
|
99 |
+
# The @gr.render dynamic UI will be placed here by app.py
|
100 |
|
101 |
return (
|
102 |
speaker_config_method_dropdown,
|
103 |
single_voice_group,
|
104 |
global_voice_dropdown,
|
105 |
+
detailed_per_speaker_ui_group_container,
|
106 |
load_per_speaker_ui_button
|
107 |
)
|
108 |
|
109 |
def create_action_and_output_components():
|
110 |
"""Creates buttons for actions (cost, generate) and output display areas."""
|
111 |
+
with gr.Row(elem_id="action_buttons_row"):
|
112 |
calculate_cost_button = gr.Button("Calculate Estimated Cost", elem_id="calculate_cost_button")
|
113 |
generate_button = gr.Button("Generate Audio", variant="primary", elem_id="generate_button")
|
114 |
|
115 |
cost_output = gr.Textbox(label="Estimated Cost", interactive=False, elem_id="cost_output")
|
116 |
+
with gr.Row(elem_id="output_files_row"):
|
117 |
individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)", elem_id="individual_lines_zip_output")
|
118 |
merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath", elem_id="merged_dialogue_mp3_output")
|
119 |
status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5, elem_id="status_output")
|
|
|
125 |
example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\\n[Alice] Let's see how this sounds."
|
126 |
example_script_2 = "[Narrator] This is a short story.\\n[CharacterA] Once upon a time...\\n[Narrator] ...there was a Gradio app.\\n[CharacterB] And it could talk!"
|
127 |
|
|
|
128 |
examples_data = [
|
129 |
[example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
|
130 |
[example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
|
|
|
149 |
inputs=inputs_for_examples,
|
150 |
outputs=outputs_for_examples,
|
151 |
fn=process_fn,
|
152 |
+
cache_examples=False,
|
153 |
examples_per_page=5,
|
154 |
label="Example Scripts (Click to Load & Run)",
|
155 |
run_on_click=True
|