abocha committed on
Commit
49a48a4
·
1 Parent(s): 92c9b3d
Files changed (3) hide show
  1. app.py +86 -65
  2. event_handlers.py +95 -133
  3. ui_layout.py +21 -20
app.py CHANGED
@@ -4,12 +4,13 @@ import os
4
  import asyncio
5
  from openai import AsyncOpenAI
6
  from functools import partial
7
- import datetime # For timestamp in @gr.render debug
8
 
9
  from ui_layout import (
10
  create_main_input_components, create_speaker_config_components,
11
  create_action_and_output_components, create_examples_ui,
12
- TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES, DEFAULT_GLOBAL_VOICE # Added for @gr.render context
 
13
  )
14
 
15
  from event_handlers import (
@@ -17,9 +18,8 @@ from event_handlers import (
17
  handle_speaker_config_method_visibility_change,
18
  handle_tts_model_change,
19
  handle_load_refresh_per_speaker_ui_trigger,
20
- # Placeholder for handling changes from controls inside @gr.render's dynamic UI
21
- # handle_dynamic_accordion_input_change,
22
- get_speakers_from_script # Utility function
23
  )
24
 
25
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -54,78 +54,104 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
54
  if not OPENAI_API_KEY or not async_openai_client:
55
  gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
56
 
57
- # This state holds the detailed configurations for each speaker.
58
- # It's an input to @gr.render and an output of event handlers that modify it.
59
  speaker_configs_state = gr.State({})
60
 
61
  (script_input, tts_model_dropdown, pause_input,
62
  global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
63
 
64
  (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
65
- detailed_per_speaker_ui_group_container, # This is the gr.Column container
66
  load_per_speaker_ui_button) = create_speaker_config_components()
67
 
68
  (calculate_cost_button, generate_button, cost_output,
69
  individual_lines_zip_output, merged_dialogue_mp3_output,
70
  status_output) = create_action_and_output_components()
71
 
72
- # --- Dynamic UI Area using @gr.render ---
73
- # This block is defined within the `detailed_per_speaker_ui_group_container` (a gr.Column)
74
- with detailed_per_speaker_ui_group_container:
75
- # The @gr.render decorator ensures this function re-runs and its UI
76
- # is re-drawn when its `inputs` change or its `triggers` are activated.
77
  @gr.render(
78
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
79
  triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
80
  )
81
  def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
82
- """
83
- Defines the dynamic UI for per-speaker configuration.
84
- This function's content is re-rendered by Gradio when inputs/triggers change.
85
- """
86
- print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State: {str(current_speaker_configs)[:30]}...")
87
 
88
- # --- For Phase 1 Debugging: Display simple Markdown ---
89
- timestamp = datetime.datetime.now().isoformat()
90
- gr.Markdown(
91
- f"#### Dynamic UI Area (@render)\n"
92
- f"- **TTS Model:** {current_tts_model}\n"
93
- f"- **Timestamp:** {timestamp}\n"
94
- # f"- Script: `{current_script_text[:60].replace('[', '(').replace(']', ')') + ('...' if len(current_script_text) > 60 else '')}`\n" # Be careful with Markdown in script
95
- # f"- State Keys: `{list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}`"
96
- )
97
- # --- End of Phase 1 Debugging ---
98
-
99
- # --- Future Full Implementation (Illustrative Structure) ---
100
- # This is where you would build the actual accordions for each speaker.
101
- # unique_speakers = get_speakers_from_script(current_script_text)
102
- # if not unique_speakers:
103
- # gr.Markdown("<p style='color: #888;'>Enter a script and click 'Load/Refresh' to see per-speaker settings.</p>")
104
- # else:
105
- # for speaker_name in unique_speakers:
106
- # # Ensure speaker_configs_state is a dict
107
- # speaker_specific_config = current_speaker_configs.get(speaker_name, {}) if isinstance(current_speaker_configs, dict) else {}
108
- # default_voice_for_speaker = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
109
- # # ... and so on for other settings like speed, vibe ...
110
- #
111
- # with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False): # Start closed
112
- # gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")
113
- # # Example: Voice Dropdown
114
- # # voice_dd = gr.Dropdown(APP_AVAILABLE_VOICES, value=default_voice_for_speaker, label="Voice")
115
- # # Attach listener to update speaker_configs_state
116
- # # voice_dd.change(
117
- # # fn=lambda val, spk=speaker_name, cfg=current_speaker_configs: \
118
- # # handle_dynamic_accordion_input_change(val, spk, "voice", cfg, speaker_configs_state),
119
- # # inputs=[voice_dd],
120
- # # outputs=[speaker_configs_state] # CRITICAL: This updates the state, which can re-trigger @gr.render
121
- # # ) # if speaker_configs_state is also an input to @gr.render.
122
- # # This creates a reactive loop.
123
- #
124
- # # Add other controls (speed slider, vibe dropdown, custom instructions textbox)
125
- # # based on current_tts_model, similar to the voice_dd example.
126
- # gr.Textbox(label="Placeholder for voice/speed/vibe controls")
127
- # --- End of Future Full Implementation ---
128
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  # --- Event Listeners for other UI elements ---
131
  tts_model_dropdown.change(
@@ -144,9 +170,6 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
144
  fn=handle_load_refresh_per_speaker_ui_trigger,
145
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
146
  outputs=[speaker_configs_state]
147
- # The button click itself is also a trigger for @gr.render,
148
- # and this handler updates speaker_configs_state, which is an input to @gr.render,
149
- # thus ensuring @gr.render re-runs with potentially new state.
150
  )
151
 
152
  calculate_cost_button.click(
@@ -183,8 +206,6 @@ with gr.Blocks(theme=gr.themes.Soft(), elem_id="main_blocks_ui") as demo:
183
  )
184
 
185
  if __name__ == "__main__":
186
- if os.name == 'nt': # Required for Windows if using asyncio default event loop
187
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
188
- # Consider adding `prevent_thread_lock=True` if running in a script and want it to exit
189
- # or if issues with thread locking occur.
190
  demo.queue().launch(debug=True, share=False)
 
4
  import asyncio
5
  from openai import AsyncOpenAI
6
  from functools import partial
7
+ import datetime
8
 
9
  from ui_layout import (
10
  create_main_input_components, create_speaker_config_components,
11
  create_action_and_output_components, create_examples_ui,
12
+ TTS_MODELS_AVAILABLE, MODEL_DEFAULT_ENV, APP_AVAILABLE_VOICES,
13
+ DEFAULT_GLOBAL_VOICE, VIBE_CHOICES, DEFAULT_VIBE, PREDEFINED_VIBES
14
  )
15
 
16
  from event_handlers import (
 
18
  handle_speaker_config_method_visibility_change,
19
  handle_tts_model_change,
20
  handle_load_refresh_per_speaker_ui_trigger,
21
+ handle_dynamic_accordion_input_change, # For controls inside @gr.render
22
+ get_speakers_from_script
 
23
  )
24
 
25
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
54
  if not OPENAI_API_KEY or not async_openai_client:
55
  gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
56
 
 
 
57
  speaker_configs_state = gr.State({})
58
 
59
  (script_input, tts_model_dropdown, pause_input,
60
  global_speed_input, global_instructions_input) = create_main_input_components(EFFECTIVE_MODEL_DEFAULT)
61
 
62
  (speaker_config_method_dropdown, single_voice_group, global_voice_dropdown,
63
+ detailed_per_speaker_ui_group_container,
64
  load_per_speaker_ui_button) = create_speaker_config_components()
65
 
66
  (calculate_cost_button, generate_button, cost_output,
67
  individual_lines_zip_output, merged_dialogue_mp3_output,
68
  status_output) = create_action_and_output_components()
69
 
70
+ with detailed_per_speaker_ui_group_container: # Define @gr.render inside this column
 
 
 
 
71
  @gr.render(
72
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
73
  triggers=[load_per_speaker_ui_button.click, tts_model_dropdown.change]
74
  )
75
  def render_dynamic_speaker_ui(current_script_text: str, current_speaker_configs: dict, current_tts_model: str):
76
+ """Defines and re-renders the dynamic UI for per-speaker configuration."""
77
+ print(f"DEBUG: @gr.render CALLED. Model: {current_tts_model}. Script: '{current_script_text[:30]}...'. State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs,dict) else 'Not a dict'}")
 
 
 
78
 
79
+ unique_speakers = get_speakers_from_script(current_script_text)
80
+
81
+ if not unique_speakers:
82
+ gr.Markdown("<p style='color: #888; margin-top:10px;'>Enter a script with speaker tags (e.g., `[SpeakerName] Text`) and click 'Load/Refresh' to see per-speaker settings.</p>")
83
+ return
84
+
85
+ for speaker_idx, speaker_name in enumerate(unique_speakers):
86
+ # Ensure current_speaker_configs is a valid dictionary
87
+ if not isinstance(current_speaker_configs, dict):
88
+ current_speaker_configs = {}
89
+
90
+ speaker_specific_config = current_speaker_configs.get(speaker_name, {})
91
+ accordion_elem_id = f"accordion_spk_{speaker_idx}_{speaker_name.replace(' ','_').lower()}"
92
+
93
+ with gr.Accordion(f"Settings for Speaker: {speaker_name}", open=False, elem_id=accordion_elem_id):
94
+ gr.Markdown(f"Configure voice for **{speaker_name}** using **{current_tts_model}** model.")
95
+
96
+ default_voice = speaker_specific_config.get("voice", DEFAULT_GLOBAL_VOICE)
97
+ voice_dd_elem_id = f"voice_dd_spk_{speaker_idx}"
98
+ voice_dropdown = gr.Dropdown(
99
+ APP_AVAILABLE_VOICES,
100
+ value=default_voice,
101
+ label="Voice",
102
+ elem_id=voice_dd_elem_id
103
+ )
104
+ voice_dropdown.change(
105
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="voice"),
106
+ inputs=[voice_dropdown, speaker_configs_state], # Pass current state to handler
107
+ outputs=[speaker_configs_state]
108
+ )
109
+
110
+ if current_tts_model in ["tts-1", "tts-1-hd"]:
111
+ default_speed = float(speaker_specific_config.get("speed", 1.0))
112
+ speed_slider_elem_id = f"speed_slider_spk_{speaker_idx}"
113
+ speed_slider = gr.Slider(
114
+ minimum=0.25, maximum=4.0, value=default_speed, step=0.05,
115
+ label="Speed", elem_id=speed_slider_elem_id
116
+ )
117
+ speed_slider.change(
118
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="speed"),
119
+ inputs=[speed_slider, speaker_configs_state],
120
+ outputs=[speaker_configs_state]
121
+ )
122
+ elif current_tts_model == "gpt-4o-mini-tts":
123
+ default_vibe = speaker_specific_config.get("vibe", DEFAULT_VIBE)
124
+ vibe_dd_elem_id = f"vibe_dd_spk_{speaker_idx}"
125
+ vibe_dropdown = gr.Dropdown(
126
+ VIBE_CHOICES, value=default_vibe, label="Vibe/Emotion", elem_id=vibe_dd_elem_id
127
+ )
128
+
129
+ default_custom_instructions = speaker_specific_config.get("custom_instructions", "")
130
+ custom_instr_tb_elem_id = f"custom_instr_tb_spk_{speaker_idx}"
131
+ custom_instructions_textbox = gr.Textbox(
132
+ label="Custom Instructions",
133
+ value=default_custom_instructions,
134
+ placeholder="e.g., Speak with a slightly hesitant tone.",
135
+ lines=2,
136
+ visible=(default_vibe == "Custom..."), # Initial visibility
137
+ elem_id=custom_instr_tb_elem_id
138
+ )
139
+
140
+ vibe_dropdown.change(
141
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="vibe"),
142
+ inputs=[vibe_dropdown, speaker_configs_state],
143
+ outputs=[speaker_configs_state]
144
+ ).then( # Chain another update for visibility
145
+ fn=lambda vibe_val: gr.update(visible=(vibe_val == "Custom...")),
146
+ inputs=[vibe_dropdown],
147
+ outputs=[custom_instructions_textbox]
148
+ )
149
+
150
+ custom_instructions_textbox.change(
151
+ fn=partial(handle_dynamic_accordion_input_change, speaker_name=speaker_name, config_key="custom_instructions"),
152
+ inputs=[custom_instructions_textbox, speaker_configs_state],
153
+ outputs=[speaker_configs_state]
154
+ )
155
 
156
  # --- Event Listeners for other UI elements ---
157
  tts_model_dropdown.change(
 
170
  fn=handle_load_refresh_per_speaker_ui_trigger,
171
  inputs=[script_input, speaker_configs_state, tts_model_dropdown],
172
  outputs=[speaker_configs_state]
 
 
 
173
  )
174
 
175
  calculate_cost_button.click(
 
206
  )
207
 
208
  if __name__ == "__main__":
209
+ if os.name == 'nt':
210
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
 
 
211
  demo.queue().launch(debug=True, share=False)
event_handlers.py CHANGED
@@ -9,7 +9,6 @@ import random
9
  from functools import partial
10
  import datetime
11
 
12
- # Assuming utils.script_parser and utils.openai_tts, utils.merge_audio are correctly defined
13
  from utils.script_parser import parse_dialogue_script, calculate_cost
14
  from utils.openai_tts import synthesize_speech_line
15
  from utils.merge_audio import merge_mp3_files
@@ -21,45 +20,48 @@ def get_speakers_from_script(script_text: str) -> list:
21
  if not script_text or not script_text.strip():
22
  return []
23
  try:
24
- parsed_lines, _ = parse_dialogue_script(script_text)
25
  if not parsed_lines:
26
  return []
27
  seen_speakers = set()
28
  ordered_unique_speakers = []
29
  for line_data in parsed_lines:
30
  speaker = line_data.get("speaker")
31
- if speaker and speaker not in seen_speakers:
32
  ordered_unique_speakers.append(speaker)
33
  seen_speakers.add(speaker)
34
  return ordered_unique_speakers
35
- except ValueError: # Handles potential errors from parse_dialogue_script
 
36
  return []
 
 
 
 
37
 
38
  def handle_dynamic_accordion_input_change(
39
- new_value,
40
- speaker_name: str,
41
- config_key: str,
42
- current_speaker_configs: dict
43
- # target_state_component is implicitly speaker_configs_state when this function is an output to it
44
  ):
45
  """
46
  Updates the speaker_configs_state when a dynamic UI element within an Accordion changes.
47
- This function is INTENDED to be used as the 'fn' for .change() listeners of
48
- components created inside the @gr.render block.
49
  """
50
  if not isinstance(current_speaker_configs, dict):
51
- print(f"Warning: current_speaker_configs was not a dict, re-initializing. Type: {type(current_speaker_configs)}")
52
- current_speaker_configs = {} # Defensive: re-initialize if it's not a dict
53
 
54
- updated_configs = current_speaker_configs.copy() # Work on a copy
55
 
56
  if speaker_name not in updated_configs:
57
  updated_configs[speaker_name] = {}
58
 
59
  updated_configs[speaker_name][config_key] = new_value
60
- updated_configs["_last_config_change"] = f"{speaker_name}_{config_key}_{datetime.datetime.now().isoformat()}" # To ensure state change
61
 
62
- print(f"DEBUG (accordion_change): Speaker '{speaker_name}' config '{config_key}' to '{new_value}'. New state hint: {updated_configs.get('_last_config_change')}")
 
 
63
  return updated_configs
64
 
65
 
@@ -67,7 +69,7 @@ async def handle_script_processing(
67
  openai_api_key: str, async_openai_client, nsfw_api_url_template: str,
68
  dialogue_script: str, tts_model: str, pause_ms: int,
69
  speaker_config_method: str, global_voice_selection: str,
70
- speaker_configs_state_dict: dict, # This is the value from gr.State
71
  global_speed: float,
72
  global_instructions: str,
73
  progress=gr.Progress(track_tqdm=True)
@@ -84,140 +86,120 @@ async def handle_script_processing(
84
  try:
85
  parsed_lines, _ = parse_dialogue_script(dialogue_script)
86
  if not parsed_lines:
87
- shutil.rmtree(job_audio_path_prefix)
88
- return None, None, "Error: No valid lines found in script."
89
  except ValueError as e:
90
- shutil.rmtree(job_audio_path_prefix)
91
- return None, None, f"Script parsing error: {str(e)}"
92
 
93
  if not isinstance(speaker_configs_state_dict, dict):
94
  print(f"Warning: speaker_configs_state_dict was not a dict in handle_script_processing. Re-initializing. Type: {type(speaker_configs_state_dict)}")
95
  speaker_configs_state_dict = {}
96
 
97
- safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
98
 
99
- speaker_voice_map = {} # For Random or A/B methods
100
  if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
101
  unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
102
  temp_voices_pool = APP_AVAILABLE_VOICES.copy()
103
- if not temp_voices_pool: temp_voices_pool = [DEFAULT_GLOBAL_VOICE]
104
 
105
  if speaker_config_method == "Random per Speaker":
106
  for spk_name in unique_script_speakers_for_map:
107
  speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
108
- elif speaker_config_method == "A/B Round Robin" and temp_voices_pool: # Check temp_voices_pool not empty
109
  for i, spk_name in enumerate(unique_script_speakers_for_map):
110
  speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
111
 
112
- tasks = []
113
- # line_audio_files_map = {} # To store results indexed by original line ID
114
-
115
- # Create a list to store (original_line_id, future_task)
116
  task_info_list = []
117
-
118
  for i, line_data in enumerate(parsed_lines):
119
  speaker_name = line_data["speaker"]
 
 
 
120
  line_voice = safe_default_global_voice
121
  line_speed = global_speed
122
  line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
123
 
124
  if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
125
  spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
126
- line_voice = spk_cfg.get("voice", safe_default_global_voice) # Fallback to safe global voice
 
127
  if tts_model in ["tts-1", "tts-1-hd"]:
128
  line_speed = float(spk_cfg.get("speed", global_speed))
129
- else: # Other models don't use speed
130
- line_speed = 1.0
131
 
132
  if tts_model == "gpt-4o-mini-tts":
133
  vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
134
  custom_instr_raw = spk_cfg.get("custom_instructions", "")
135
  custom_instr = custom_instr_raw.strip() if custom_instr_raw else ""
136
-
 
137
  if vibe == "Custom..." and custom_instr:
138
- line_instructions = custom_instr
139
  elif vibe != "None" and vibe != "Custom..." and PREDEFINED_VIBES.get(vibe):
140
- line_instructions = PREDEFINED_VIBES[vibe]
141
- elif not line_instructions and global_instructions and global_instructions.strip(): # Fallback to global if still no specific instructions
142
- line_instructions = global_instructions.strip()
143
- elif not line_instructions: # Explicitly None if all else fails
144
- line_instructions = None
145
- else: # tts-1 models don't use vibe/custom instructions from per-speaker UI
146
- line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
147
-
148
 
149
  elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
150
  line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
151
- if tts_model not in ["tts-1", "tts-1-hd"]: # Ensure speed is reset if not tts-1 family
152
- line_speed = 1.0
153
- # For Random/AB, use global instructions if any
154
- line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
155
 
156
- # Final check for speed parameter validity based on model
157
- if tts_model not in ["tts-1", "tts-1-hd"]:
158
- line_speed = 1.0
159
 
160
-
161
- out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}_{speaker_name.replace(' ','_')}.mp3")
162
  progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
163
 
164
  current_task = synthesize_speech_line(
165
- client=async_openai_client, text=line_data["text"], voice=line_voice,
166
  output_path=out_fn, model=tts_model, speed=line_speed,
167
  instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
168
- line_index=line_data['id']
169
  )
170
- task_info_list.append({"id": line_data['id'], "speaker": speaker_name, "task": current_task, "out_fn": out_fn})
171
-
172
- # Gather results
173
- # await asyncio.gather(*(info['task'] for info in task_info_list)) # This runs them, but doesn't handle exceptions well for individual ones
174
 
175
- processed_results = []
176
  for info in task_info_list:
177
  try:
178
  result_path = await info['task']
179
- processed_results.append({"id": info['id'], "path": result_path, "speaker": info['speaker']})
180
  except Exception as e:
181
  print(f"Error synthesizing line ID {info['id']} ({info['speaker']}): {e}")
182
- processed_results.append({"id": info['id'], "path": None, "error": e, "speaker": info['speaker']})
183
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- # Ensure files are processed in the original script order for merging
186
- ordered_files_for_merge_and_zip = [None] * len(parsed_lines)
187
- for res in processed_results:
188
- # Find original index of this line_id to maintain order
189
- original_line_index = -1
190
- for idx, p_line in enumerate(parsed_lines):
191
- if p_line['id'] == res['id']:
192
- original_line_index = idx
193
- break
194
-
195
- if original_line_index != -1:
196
- if res.get("path") and os.path.exists(res["path"]) and os.path.getsize(res["path"]) > 0:
197
- ordered_files_for_merge_and_zip[original_line_index] = res["path"]
198
- else:
199
- print(f"Skipped or failed synthesizing line ID {res['id']} ({res['speaker']})")
200
- # ordered_files_for_merge_and_zip remains None at this index
201
 
202
  valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
203
 
204
  if not valid_files_for_zip:
205
- shutil.rmtree(job_audio_path_prefix) # Cleanup
206
- return None, None, "Error: No audio was successfully synthesized for any line."
207
 
208
  zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
209
  with zipfile.ZipFile(zip_fn, 'w') as zf:
210
  for f_path in valid_files_for_zip:
211
  zf.write(f_path, os.path.basename(f_path))
212
 
213
- # Files to actually merge (only valid ones, already in order)
214
- files_to_actually_merge = valid_files_for_zip
215
  merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
216
  merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
217
 
218
  status_msg = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
219
  if len(valid_files_for_zip) < len(parsed_lines): status_msg += "Some lines may have failed. "
220
-
221
  if not merged_path and len(valid_files_for_zip) > 0 : status_msg += "Merging audio failed. "
222
  elif not merged_path: status_msg = "No audio to merge (all lines failed or were skipped)."
223
  else: status_msg += "Merged audio generated."
@@ -226,74 +208,55 @@ async def handle_script_processing(
226
  merged_path if merged_path and os.path.exists(merged_path) else None,
227
  status_msg)
228
 
229
-
230
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
231
- """Calculates the estimated cost for TTS processing."""
232
  if not dialogue_script or not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
233
  try:
234
  parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
235
  if not parsed_lines: return "Cost: $0.00 (No valid lines in script)"
236
  cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
237
  return f"Estimated Cost for {len(parsed_lines)} lines ({total_chars} chars): ${cost:.6f}"
238
- except ValueError as e: # Specific error from parsing or cost calculation
239
- return f"Cost calculation error: {str(e)}"
240
- except Exception as e: # Generic fallback for unexpected errors
241
- return f"An unexpected error occurred during cost calculation: {str(e)}"
242
-
243
 
244
  def handle_load_refresh_per_speaker_ui_trigger(script_text: str, current_speaker_configs: dict, tts_model: str):
245
- """
246
- Triggered by the 'Load/Refresh' button. Updates speaker_configs_state.
247
- The @gr.render function reacts to this state change (and the button click trigger).
248
- """
249
  print(f"DEBUG (Load/Refresh Trigger): Script: '{script_text[:30]}...', Model: {tts_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
 
 
250
 
251
- # Ensure current_speaker_configs is a dictionary
252
- if not isinstance(current_speaker_configs, dict):
253
- current_speaker_configs = {}
254
-
255
- updated_configs = current_speaker_configs.copy() # Work on a copy to ensure change detection
256
-
257
- # Example: Initialize new speakers found in the script if they aren't in the state
258
  # unique_speakers_in_script = get_speakers_from_script(script_text)
259
  # for speaker in unique_speakers_in_script:
260
  # if speaker not in updated_configs:
261
- # updated_configs[speaker] = {"voice": DEFAULT_GLOBAL_VOICE} # Or more detailed defaults based on tts_model
262
-
263
- # Add/update a key to ensure Gradio recognizes the state object has changed,
264
- # which helps in reliably triggering @gr.render if other inputs didn't change.
265
- updated_configs["_last_refresh_source"] = "button_click"
266
- updated_configs["_last_refresh_timestamp"] = datetime.datetime.now().isoformat()
267
-
268
  return updated_configs
269
 
270
-
271
  def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
272
- """
273
- Handles changes to the TTS model dropdown. Updates global controls and speaker_configs_state.
274
- """
275
  print(f"DEBUG (TTS Model Change): Model: {selected_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
276
-
277
- if not isinstance(current_speaker_configs, dict):
278
- current_speaker_configs = {}
279
-
280
- updated_configs = current_speaker_configs.copy() # Work on a copy
281
-
282
- # Potentially adjust or clear parts of the state that are model-specific
283
- # For example, if switching from tts-1 to gpt-4o-mini-tts, 'speed' settings might be irrelevant
284
- # and 'vibe' settings might need to be initialized.
285
- # for speaker_cfg in updated_configs.values():
286
- # if isinstance(speaker_cfg, dict):
287
- # if selected_model == "gpt-4o-mini-tts":
288
- # speaker_cfg.pop("speed", None) # Remove speed if it exists
289
- # if "vibe" not in speaker_cfg: speaker_cfg["vibe"] = DEFAULT_VIBE
290
- # elif selected_model in ["tts-1", "tts-1-hd"]:
291
- # speaker_cfg.pop("vibe", None)
292
- # speaker_cfg.pop("custom_instructions", None)
293
- # if "speed" not in speaker_cfg: speaker_cfg["speed"] = 1.0
294
-
295
- updated_configs["_last_refresh_source"] = "model_change"
296
- updated_configs["_last_refresh_timestamp"] = datetime.datetime.now().isoformat()
297
 
298
  is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
299
  is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
@@ -305,7 +268,6 @@ def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
305
  )
306
 
307
  def handle_speaker_config_method_visibility_change(method: str):
308
- """Handles changes to the speaker configuration method dropdown."""
309
  print(f"DEBUG (Config Method Change): Method: {method}")
310
  is_single_voice_visible = (method == "Single Voice (Global)")
311
  is_detailed_per_speaker_container_visible = (method == "Detailed Configuration (Per Speaker UI)")
 
9
  from functools import partial
10
  import datetime
11
 
 
12
  from utils.script_parser import parse_dialogue_script, calculate_cost
13
  from utils.openai_tts import synthesize_speech_line
14
  from utils.merge_audio import merge_mp3_files
 
20
  if not script_text or not script_text.strip():
21
  return []
22
  try:
23
+ parsed_lines, _ = parse_dialogue_script(script_text) # Assuming this returns (list_of_dicts, total_chars)
24
  if not parsed_lines:
25
  return []
26
  seen_speakers = set()
27
  ordered_unique_speakers = []
28
  for line_data in parsed_lines:
29
  speaker = line_data.get("speaker")
30
+ if speaker and speaker not in seen_speakers: # Ensure speaker is not None or empty
31
  ordered_unique_speakers.append(speaker)
32
  seen_speakers.add(speaker)
33
  return ordered_unique_speakers
34
+ except ValueError:
35
+ print("ValueError during script parsing in get_speakers_from_script.")
36
  return []
37
+ except Exception as e:
38
+ print(f"Unexpected error in get_speakers_from_script: {e}")
39
+ return []
40
+
41
 
42
  def handle_dynamic_accordion_input_change(
43
+ new_value, # Value from the changed component (e.g., voice_dropdown)
44
+ current_speaker_configs: dict, # Current value of speaker_configs_state
45
+ speaker_name: str, # Passed via partial from the event listener
46
+ config_key: str # Passed via partial from the event listener
 
47
  ):
48
  """
49
  Updates the speaker_configs_state when a dynamic UI element within an Accordion changes.
 
 
50
  """
51
  if not isinstance(current_speaker_configs, dict):
52
+ print(f"Warning: current_speaker_configs was not a dict in handle_dynamic_accordion_input_change. Type: {type(current_speaker_configs)}. Re-initializing.")
53
+ current_speaker_configs = {}
54
 
55
+ updated_configs = current_speaker_configs.copy()
56
 
57
  if speaker_name not in updated_configs:
58
  updated_configs[speaker_name] = {}
59
 
60
  updated_configs[speaker_name][config_key] = new_value
 
61
 
62
+ updated_configs["_last_dynamic_update_details"] = f"Speaker: {speaker_name}, Key: {config_key}, Val: {str(new_value)[:20]}, TS: {datetime.datetime.now().isoformat()}"
63
+
64
+ print(f"DEBUG (dynamic_input_change): Speaker '{speaker_name}' config '{config_key}' to '{str(new_value)[:50]}'. New state hint: {updated_configs.get('_last_dynamic_update_details')}")
65
  return updated_configs
66
 
67
 
 
69
  openai_api_key: str, async_openai_client, nsfw_api_url_template: str,
70
  dialogue_script: str, tts_model: str, pause_ms: int,
71
  speaker_config_method: str, global_voice_selection: str,
72
+ speaker_configs_state_dict: dict,
73
  global_speed: float,
74
  global_instructions: str,
75
  progress=gr.Progress(track_tqdm=True)
 
86
  try:
87
  parsed_lines, _ = parse_dialogue_script(dialogue_script)
88
  if not parsed_lines:
89
+ shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No valid lines found in script."
 
90
  except ValueError as e:
91
+ shutil.rmtree(job_audio_path_prefix); return None, None, f"Script parsing error: {str(e)}"
 
92
 
93
  if not isinstance(speaker_configs_state_dict, dict):
94
  print(f"Warning: speaker_configs_state_dict was not a dict in handle_script_processing. Re-initializing. Type: {type(speaker_configs_state_dict)}")
95
  speaker_configs_state_dict = {}
96
 
97
+ safe_default_global_voice = global_voice_selection if global_voice_selection in APP_AVAILABLE_VOICES else (APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy")
98
 
99
+ speaker_voice_map = {}
100
  if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
101
  unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
102
  temp_voices_pool = APP_AVAILABLE_VOICES.copy()
103
+ if not temp_voices_pool: temp_voices_pool = [safe_default_global_voice]
104
 
105
  if speaker_config_method == "Random per Speaker":
106
  for spk_name in unique_script_speakers_for_map:
107
  speaker_voice_map[spk_name] = random.choice(temp_voices_pool)
108
+ elif speaker_config_method == "A/B Round Robin" and temp_voices_pool:
109
  for i, spk_name in enumerate(unique_script_speakers_for_map):
110
  speaker_voice_map[spk_name] = temp_voices_pool[i % len(temp_voices_pool)]
111
 
 
 
 
 
112
  task_info_list = []
 
113
  for i, line_data in enumerate(parsed_lines):
114
  speaker_name = line_data["speaker"]
115
+ line_text = line_data["text"]
116
+ line_id = line_data["id"]
117
+
118
  line_voice = safe_default_global_voice
119
  line_speed = global_speed
120
  line_instructions = global_instructions.strip() if global_instructions and global_instructions.strip() else None
121
 
122
  if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
123
  spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
124
+ line_voice = spk_cfg.get("voice", safe_default_global_voice)
125
+
126
  if tts_model in ["tts-1", "tts-1-hd"]:
127
  line_speed = float(spk_cfg.get("speed", global_speed))
128
+ else: line_speed = 1.0
 
129
 
130
  if tts_model == "gpt-4o-mini-tts":
131
  vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
132
  custom_instr_raw = spk_cfg.get("custom_instructions", "")
133
  custom_instr = custom_instr_raw.strip() if custom_instr_raw else ""
134
+
135
+ current_line_specific_instructions = None
136
  if vibe == "Custom..." and custom_instr:
137
+ current_line_specific_instructions = custom_instr
138
  elif vibe != "None" and vibe != "Custom..." and PREDEFINED_VIBES.get(vibe):
139
+ current_line_specific_instructions = PREDEFINED_VIBES[vibe]
140
+
141
+ # If per-speaker instructions are set, they take precedence. Otherwise, fall back to global instructions.
142
+ line_instructions = current_line_specific_instructions if current_line_specific_instructions is not None else line_instructions
143
+ else: # tts-1, tts-1-hd do not use vibe/custom_instructions from per-speaker UI
144
+ # They will use the global_instructions if set.
145
+ pass # line_instructions already set to global_instructions or None
 
146
 
147
  elif speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
148
  line_voice = speaker_voice_map.get(speaker_name, safe_default_global_voice)
149
+ # For these methods, speed and instructions remain global
150
+ if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
 
 
151
 
152
+ if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
 
 
153
 
154
+ out_fn = os.path.join(job_audio_path_prefix, f"line_{line_id}_{speaker_name.replace(' ','_')}.mp3")
 
155
  progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
156
 
157
  current_task = synthesize_speech_line(
158
+ client=async_openai_client, text=line_text, voice=line_voice,
159
  output_path=out_fn, model=tts_model, speed=line_speed,
160
  instructions=line_instructions, nsfw_api_url_template=nsfw_api_url_template,
161
+ line_index=line_id
162
  )
163
+ task_info_list.append({"id": line_id, "speaker": speaker_name, "task": current_task, "out_fn": out_fn})
 
 
 
164
 
165
+ processed_results_map = {} # Store by line_id for easier lookup
166
  for info in task_info_list:
167
  try:
168
  result_path = await info['task']
169
+ processed_results_map[info['id']] = {"path": result_path, "speaker": info['speaker']}
170
  except Exception as e:
171
  print(f"Error synthesizing line ID {info['id']} ({info['speaker']}): {e}")
172
+ processed_results_map[info['id']] = {"path": None, "error": e, "speaker": info['speaker']}
173
 
174
+ ordered_files_for_merge_and_zip = []
175
+ for p_line in parsed_lines: # Iterate through original parsed lines to maintain order
176
+ line_id = p_line['id']
177
+ res = processed_results_map.get(line_id)
178
+ if res and res.get("path") and os.path.exists(res["path"]) and os.path.getsize(res["path"]) > 0:
179
+ ordered_files_for_merge_and_zip.append(res["path"])
180
+ else:
181
+ # File was not successfully created or result not found, append None placeholder
182
+ ordered_files_for_merge_and_zip.append(None)
183
+ if res: print(f"Skipped or failed synthesizing line ID {line_id} ({res.get('speaker', 'Unknown')}) for merge/zip.")
184
+ else: print(f"Result for line ID {line_id} not found in processed_results_map.")
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  valid_files_for_zip = [f for f in ordered_files_for_merge_and_zip if f]
188
 
189
  if not valid_files_for_zip:
190
+ shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio was successfully synthesized for any line."
 
191
 
192
  zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
193
  with zipfile.ZipFile(zip_fn, 'w') as zf:
194
  for f_path in valid_files_for_zip:
195
  zf.write(f_path, os.path.basename(f_path))
196
 
197
+ files_to_actually_merge = valid_files_for_zip # Already ordered and filtered
 
198
  merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
199
  merged_path = merge_mp3_files(files_to_actually_merge, merged_fn, pause_ms)
200
 
201
  status_msg = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
202
  if len(valid_files_for_zip) < len(parsed_lines): status_msg += "Some lines may have failed. "
 
203
  if not merged_path and len(valid_files_for_zip) > 0 : status_msg += "Merging audio failed. "
204
  elif not merged_path: status_msg = "No audio to merge (all lines failed or were skipped)."
205
  else: status_msg += "Merged audio generated."
 
208
  merged_path if merged_path and os.path.exists(merged_path) else None,
209
  status_msg)
210
 
 
211
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
 
212
  if not dialogue_script or not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
213
  try:
214
  parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
215
  if not parsed_lines: return "Cost: $0.00 (No valid lines in script)"
216
  cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
217
  return f"Estimated Cost for {len(parsed_lines)} lines ({total_chars} chars): ${cost:.6f}"
218
+ except ValueError as e: return f"Cost calculation error: {str(e)}"
219
+ except Exception as e: return f"An unexpected error: {str(e)}"
 
 
 
220
 
221
  def handle_load_refresh_per_speaker_ui_trigger(script_text: str, current_speaker_configs: dict, tts_model: str):
 
 
 
 
222
  print(f"DEBUG (Load/Refresh Trigger): Script: '{script_text[:30]}...', Model: {tts_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
223
+ if not isinstance(current_speaker_configs, dict): current_speaker_configs = {}
224
+ updated_configs = current_speaker_configs.copy()
225
 
226
+ # Optionally, you might want to pre-populate settings for new speakers here
 
 
 
 
 
 
227
  # unique_speakers_in_script = get_speakers_from_script(script_text)
228
  # for speaker in unique_speakers_in_script:
229
  # if speaker not in updated_configs:
230
+ # updated_configs[speaker] = {"voice": DEFAULT_GLOBAL_VOICE} # Default init
231
+ # if tts_model in ["tts-1", "tts-1-hd"]: updated_configs[speaker]["speed"] = 1.0
232
+ # elif tts_model == "gpt-4o-mini-tts": updated_configs[speaker]["vibe"] = DEFAULT_VIBE
233
+
234
+ updated_configs["_last_action_source"] = "load_refresh_button"
235
+ updated_configs["_last_action_timestamp"] = datetime.datetime.now().isoformat()
 
236
  return updated_configs
237
 
 
238
  def handle_tts_model_change(selected_model: str, current_speaker_configs: dict):
 
 
 
239
  print(f"DEBUG (TTS Model Change): Model: {selected_model}, Current State Keys: {list(current_speaker_configs.keys()) if isinstance(current_speaker_configs, dict) else 'Not a dict'}")
240
+ if not isinstance(current_speaker_configs, dict): current_speaker_configs = {}
241
+ updated_configs = current_speaker_configs.copy()
242
+
243
+ # When model changes, you might want to reset or adjust model-specific settings for all speakers
244
+ # For example, 'speed' is for tts-1, 'vibe' for gpt-4o-mini-tts
245
+ for speaker_name_key in list(updated_configs.keys()): # Iterate over keys if modifying dict
246
+ if isinstance(updated_configs[speaker_name_key], dict): # Check if it's a speaker config dict
247
+ if selected_model == "gpt-4o-mini-tts":
248
+ updated_configs[speaker_name_key].pop("speed", None)
249
+ if "vibe" not in updated_configs[speaker_name_key]:
250
+ updated_configs[speaker_name_key]["vibe"] = DEFAULT_VIBE
251
+ elif selected_model in ["tts-1", "tts-1-hd"]:
252
+ updated_configs[speaker_name_key].pop("vibe", None)
253
+ updated_configs[speaker_name_key].pop("custom_instructions", None)
254
+ if "speed" not in updated_configs[speaker_name_key]:
255
+ updated_configs[speaker_name_key]["speed"] = 1.0
256
+ # Add other model-specific adjustments if needed
257
+
258
+ updated_configs["_last_action_source"] = "tts_model_change"
259
+ updated_configs["_last_action_timestamp"] = datetime.datetime.now().isoformat()
 
260
 
261
  is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
262
  is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
 
268
  )
269
 
270
  def handle_speaker_config_method_visibility_change(method: str):
 
271
  print(f"DEBUG (Config Method Change): Method: {method}")
272
  is_single_voice_visible = (method == "Single Voice (Global)")
273
  is_detailed_per_speaker_container_visible = (method == "Detailed Configuration (Per Speaker UI)")
ui_layout.py CHANGED
@@ -1,6 +1,5 @@
1
  # FILE: ui_layout.py
2
  import gradio as gr
3
- from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES # Assuming this import path is correct
4
 
5
  # --- UI Constants and Configuration ---
6
  TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
@@ -12,10 +11,17 @@ SPEAKER_CONFIG_METHODS = [
12
  "A/B Round Robin",
13
  "Detailed Configuration (Per Speaker UI)"
14
  ]
15
- DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker" # Or your preferred default
16
-
17
- APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
18
- if not APP_AVAILABLE_VOICES: # Absolute fallback
 
 
 
 
 
 
 
19
  APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
20
 
21
 
@@ -34,17 +40,17 @@ PREDEFINED_VIBES = {
34
  }
35
  VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
36
  DEFAULT_VIBE = "None"
37
- DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0]
38
 
39
 
40
  # --- UI Element Creation Functions ---
41
 
42
  def create_main_input_components(model_default_value):
43
  """Creates the main input components for script, model, pause, and global settings."""
44
- with gr.Row():
45
- with gr.Column(scale=2):
46
  script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\\n[Speaker2] How are you today?", lines=10, elem_id="script_input_area")
47
- with gr.Column(scale=1):
48
  tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value, elem_id="tts_model_dropdown")
49
  pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50, elem_id="pause_input")
50
 
@@ -71,20 +77,16 @@ def create_speaker_config_components():
71
  elem_id="speaker_config_method_dropdown"
72
  )
73
 
74
- with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
75
  global_voice_dropdown = gr.Dropdown(
76
  APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True,
77
  elem_id="global_voice_dropdown"
78
  )
79
 
80
- # This Column group is the container where the @gr.render function in app.py
81
- # will place its dynamically generated UI.
82
  detailed_per_speaker_ui_group_container = gr.Column(
83
  visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"),
84
  elem_id="detailed_per_speaker_ui_group_container"
85
  )
86
- # The button and its help text are part of this container, defined statically.
87
- # The dynamic content from @gr.render will appear below them within this column.
88
  with detailed_per_speaker_ui_group_container:
89
  load_per_speaker_ui_button = gr.Button(
90
  "Load/Refresh Per-Speaker Settings UI (from Script Above)",
@@ -94,24 +96,24 @@ def create_speaker_config_components():
94
  "<small>Click button above to populate settings for each speaker found in the script. "
95
  "Settings are applied per-speaker. If script changes, click again to refresh.</small>"
96
  )
97
- # The dynamic UI from @gr.render in app.py will be inserted here.
98
 
99
  return (
100
  speaker_config_method_dropdown,
101
  single_voice_group,
102
  global_voice_dropdown,
103
- detailed_per_speaker_ui_group_container, # Container for @gr.render content
104
  load_per_speaker_ui_button
105
  )
106
 
107
  def create_action_and_output_components():
108
  """Creates buttons for actions (cost, generate) and output display areas."""
109
- with gr.Row():
110
  calculate_cost_button = gr.Button("Calculate Estimated Cost", elem_id="calculate_cost_button")
111
  generate_button = gr.Button("Generate Audio", variant="primary", elem_id="generate_button")
112
 
113
  cost_output = gr.Textbox(label="Estimated Cost", interactive=False, elem_id="cost_output")
114
- with gr.Row():
115
  individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)", elem_id="individual_lines_zip_output")
116
  merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath", elem_id="merged_dialogue_mp3_output")
117
  status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5, elem_id="status_output")
@@ -123,7 +125,6 @@ def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=Non
123
  example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\\n[Alice] Let's see how this sounds."
124
  example_script_2 = "[Narrator] This is a short story.\\n[CharacterA] Once upon a time...\\n[Narrator] ...there was a Gradio app.\\n[CharacterB] And it could talk!"
125
 
126
- # Ensure example state matches the number of inputs for `speaker_configs_state` (which is one dictionary)
127
  examples_data = [
128
  [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
129
  [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
@@ -148,7 +149,7 @@ def create_examples_ui(inputs_for_examples, process_fn, outputs_for_examples=Non
148
  inputs=inputs_for_examples,
149
  outputs=outputs_for_examples,
150
  fn=process_fn,
151
- cache_examples=False, # Caching can be complex with external API calls
152
  examples_per_page=5,
153
  label="Example Scripts (Click to Load & Run)",
154
  run_on_click=True
 
1
  # FILE: ui_layout.py
2
  import gradio as gr
 
3
 
4
  # --- UI Constants and Configuration ---
5
  TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
 
11
  "A/B Round Robin",
12
  "Detailed Configuration (Per Speaker UI)"
13
  ]
14
+ DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
15
+
16
+ # Assuming utils.openai_tts.OPENAI_VOICES exists and provides a list of voice names
17
+ # If not, define APP_AVAILABLE_VOICES directly here for now.
18
+ try:
19
+ from utils.openai_tts import OPENAI_VOICES as ALL_TTS_VOICES
20
+ APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
21
+ if not APP_AVAILABLE_VOICES: # Fallback if OPENAI_VOICES is empty
22
+ APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
23
+ except ImportError:
24
+ print("Warning: Could not import OPENAI_VOICES from utils.openai_tts. Using default list.")
25
  APP_AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
26
 
27
 
 
40
  }
41
  VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
42
  DEFAULT_VIBE = "None"
43
+ DEFAULT_GLOBAL_VOICE = APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy"
44
 
45
 
46
  # --- UI Element Creation Functions ---
47
 
48
  def create_main_input_components(model_default_value):
49
  """Creates the main input components for script, model, pause, and global settings."""
50
+ with gr.Row(elem_id="main_input_row"):
51
+ with gr.Column(scale=2, elem_id="script_input_column"):
52
  script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\\n[Speaker2] How are you today?", lines=10, elem_id="script_input_area")
53
+ with gr.Column(scale=1, elem_id="model_config_column"):
54
  tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=model_default_value, elem_id="tts_model_dropdown")
55
  pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50, elem_id="pause_input")
56
 
 
77
  elem_id="speaker_config_method_dropdown"
78
  )
79
 
80
+ with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), elem_id="single_voice_config_group") as single_voice_group:
81
  global_voice_dropdown = gr.Dropdown(
82
  APP_AVAILABLE_VOICES, label="Global Voice", value=DEFAULT_GLOBAL_VOICE, interactive=True,
83
  elem_id="global_voice_dropdown"
84
  )
85
 
 
 
86
  detailed_per_speaker_ui_group_container = gr.Column(
87
  visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)"),
88
  elem_id="detailed_per_speaker_ui_group_container"
89
  )
 
 
90
  with detailed_per_speaker_ui_group_container:
91
  load_per_speaker_ui_button = gr.Button(
92
  "Load/Refresh Per-Speaker Settings UI (from Script Above)",
 
96
  "<small>Click button above to populate settings for each speaker found in the script. "
97
  "Settings are applied per-speaker. If script changes, click again to refresh.</small>"
98
  )
99
+ # The @gr.render dynamic UI will be placed here by app.py
100
 
101
  return (
102
  speaker_config_method_dropdown,
103
  single_voice_group,
104
  global_voice_dropdown,
105
+ detailed_per_speaker_ui_group_container,
106
  load_per_speaker_ui_button
107
  )
108
 
109
  def create_action_and_output_components():
110
  """Creates buttons for actions (cost, generate) and output display areas."""
111
+ with gr.Row(elem_id="action_buttons_row"):
112
  calculate_cost_button = gr.Button("Calculate Estimated Cost", elem_id="calculate_cost_button")
113
  generate_button = gr.Button("Generate Audio", variant="primary", elem_id="generate_button")
114
 
115
  cost_output = gr.Textbox(label="Estimated Cost", interactive=False, elem_id="cost_output")
116
+ with gr.Row(elem_id="output_files_row"):
117
  individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)", elem_id="individual_lines_zip_output")
118
  merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath", elem_id="merged_dialogue_mp3_output")
119
  status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5, elem_id="status_output")
 
125
  example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\\n[Alice] Let's see how this sounds."
126
  example_script_2 = "[Narrator] This is a short story.\\n[CharacterA] Once upon a time...\\n[Narrator] ...there was a Gradio app.\\n[CharacterB] And it could talk!"
127
 
 
128
  examples_data = [
129
  [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", DEFAULT_GLOBAL_VOICE, {}, 1.0, ""],
130
  [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", DEFAULT_GLOBAL_VOICE, {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
 
149
  inputs=inputs_for_examples,
150
  outputs=outputs_for_examples,
151
  fn=process_fn,
152
+ cache_examples=False,
153
  examples_per_page=5,
154
  label="Example Scripts (Click to Load & Run)",
155
  run_on_click=True