abocha committed on
Commit d44dfc0 · 1 Parent(s): 59d4d10

new ui approach

Files changed (1)
  1. app.py +238 -223
app.py CHANGED
@@ -5,9 +5,10 @@ import tempfile
  import shutil
  import zipfile
  import random
- import json
- import pandas as pd # Import pandas
  from openai import AsyncOpenAI

  from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
  from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
@@ -45,188 +46,160 @@ SPEAKER_CONFIG_METHODS = [
      "Single Voice (Global)",
      "Random per Speaker",
      "A/B Round Robin",
-     "Detailed Configuration (Table)" # Using Table method
  ]
- DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
  APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
- _speaker_config_cache = {}

- # --- Speaker Configuration Table Logic ---
- DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
- # Column indices for clarity (optional, but can help maintainability)
- DATAFRAME_COL_SPEAKER = 0
- DATAFRAME_COL_VOICE = 1
- DATAFRAME_COL_SPEED = 2
- DATAFRAME_COL_INSTRUCTIONS = 3

  def get_speakers_from_script(script_text):
-     if not script_text.strip():
-         return []
      try:
          parsed_lines, _ = parse_dialogue_script(script_text)
-         unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
-         return unique_speakers
-     except ValueError:
-         return []
-
- def convert_df_to_config_map(df_data_obj):
-     config_map = {}
-     # gr.DataFrame's value is a dict like {'data': [[row1_val1, row1_val2], ...], 'headers': [...]}
-     # or if constructed with pandas, it might be a pandas DataFrame directly.
-     # Let's handle if it's the dict structure.
-
-     # Ensure df_data_obj is not None and has 'data'
-     if not df_data_obj or 'data' not in df_data_obj or not isinstance(df_data_obj['data'], list):
-         if isinstance(df_data_obj, pd.DataFrame): # If it's already a pandas DF
-             data_list = df_data_obj.values.tolist()
-         else: # Unrecognized format or empty
-             print("DataFrame data is empty or in an unrecognized format for conversion.")
-             return config_map
-     else:
-         data_list = df_data_obj['data']
-
-
-     for row in data_list:
-         # Defensive check for row length, assuming DATAFRAME_HEADERS length
-         if len(row) < len(DATAFRAME_HEADERS):
-             print(f"Skipping malformed row in DataFrame data during conversion: {row}")
-             continue
-
-         speaker_name = str(row[DATAFRAME_COL_SPEAKER]).strip()
-         voice = str(row[DATAFRAME_COL_VOICE]).strip()
-         speed_str = str(row[DATAFRAME_COL_SPEED]).strip()
-         instructions = str(row[DATAFRAME_COL_INSTRUCTIONS]).strip()
-
-         if not speaker_name or speaker_name == "(Load speakers first)" or speaker_name == "(No speakers detected)":
-             continue # Skip placeholder/empty speaker names
-
-         if voice not in APP_AVAILABLE_VOICES:
-             print(f"Warning: Invalid voice '{voice}' for '{speaker_name}'. Using '{APP_AVAILABLE_VOICES[0]}'.")
-             voice = APP_AVAILABLE_VOICES[0]
-
-         cfg = {"voice": voice}
-
-         if speed_str:
-             try:
-                 speed_val = float(speed_str)
-                 if 0.25 <= speed_val <= 4.0: cfg["speed"] = speed_val
-                 else: print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range. Ignoring.")
-             except ValueError: print(f"Warning: Invalid speed '{speed_str}' for '{speaker_name}'. Ignoring.")
-
-         if instructions: cfg["instructions"] = instructions
-
-         config_map[speaker_name] = cfg
-     return config_map
-
- def load_speakers_for_table(script_text, current_df_obj, selected_tts_model):
-     current_config_map = {}
-     if isinstance(current_df_obj, pd.DataFrame) and not current_df_obj.empty:
-         for _, row_series in current_df_obj.iterrows():
-             row = row_series.tolist() # Convert Series to list
-             if len(row) >= len(DATAFRAME_HEADERS):
-                 speaker = str(row[DATAFRAME_COL_SPEAKER])
-                 current_config_map[speaker] = {
-                     "voice": str(row[DATAFRAME_COL_VOICE]),
-                     "speed": str(row[DATAFRAME_COL_SPEED]),
-                     "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
-                 }
-     elif isinstance(current_df_obj, dict) and current_df_obj.get('data'): # Handle raw dict from Gradio
-         for row in current_df_obj['data']:
-             if len(row) >= len(DATAFRAME_HEADERS):
-                 speaker = str(row[DATAFRAME_COL_SPEAKER])
-                 current_config_map[speaker] = {
-                     "voice": str(row[DATAFRAME_COL_VOICE]),
-                     "speed": str(row[DATAFRAME_COL_SPEED]),
-                     "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
-                 }


-     unique_speakers = get_speakers_from_script(script_text)
-     new_df_rows = []
-     default_voice, default_speed, default_instr = APP_AVAILABLE_VOICES[0], "1.0", ""
-
-     for speaker in unique_speakers:
-         s_cfg = current_config_map.get(speaker, {})
-         new_df_rows.append([
-             speaker,
-             s_cfg.get("voice", default_voice),
-             s_cfg.get("speed", default_speed),
-             s_cfg.get("instructions", default_instr)
-         ])
-
-     speed_info = " (tts-1/hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A)"
-     instr_info = " (gpt-4o-mini)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A)"

-     dynamic_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
-                         f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
-                         f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
-
-     if not new_df_rows:
-         new_df_rows.append(["(No speakers detected in script)", default_voice, default_speed, default_instr])

-     return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=dynamic_headers)


- def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
-                            global_selected_voice, detailed_config_map_from_table): # Renamed last param
-     global _speaker_config_cache
-     if _speaker_config_cache.get("__method") != speaker_config_method or \
-        _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
-         _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}

-     base_config = {"voice": APP_AVAILABLE_VOICES[0], "speed": None, "instructions": None}

-     if speaker_config_method == "Single Voice (Global)":
-         base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
-         return base_config

-     if speaker_config_method == "Detailed Configuration (Table)":
-         if speaker_name in detailed_config_map_from_table:
-             speaker_specific = detailed_config_map_from_table[speaker_name]
-             return {
-                 "voice": speaker_specific.get("voice", base_config["voice"]),
-                 "speed": speaker_specific.get("speed"),
-                 "instructions": speaker_specific.get("instructions")
              }
-         else:
-             print(f"Warning: Speaker '{speaker_name}' not in Table Config. Using default.")
-             return base_config
-
-     # For "Random" and "A/B" (same as before)
-     if speaker_name not in _speaker_config_cache:
-         if speaker_config_method == "Random per Speaker":
-             available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
-             if not _speaker_config_cache.get("__all_assigned_random"):
-                 for i, spk_unique in enumerate(unique_script_speakers):
-                     if spk_unique not in _speaker_config_cache:
-                         _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
-                 _speaker_config_cache["__all_assigned_random"] = True
-             if speaker_name not in _speaker_config_cache:
-                 _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
-
-         elif speaker_config_method == "A/B Round Robin":
-             if not _speaker_config_cache.get("__all_assigned_ab"):
-                 for i, spk_unique in enumerate(unique_script_speakers):
-                     if spk_unique not in _speaker_config_cache:
-                         _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
-                 _speaker_config_cache["__all_assigned_ab"] = True
-             if speaker_name not in _speaker_config_cache:
-                 speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
-                 _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
-         else:
-             _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
-
-     cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
-     return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}


  async def handle_script_processing(
          dialogue_script: str, tts_model: str, pause_ms: int,
          speaker_config_method: str, global_voice_selection: str,
-         speaker_details_df_value: pd.DataFrame | dict, # Can be pd.DataFrame or dict from Gradio
          global_speed: float,
          global_instructions: str, progress=gr.Progress(track_tqdm=True)):
-     global _speaker_config_cache; _speaker_config_cache = {}

      if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
      if not dialogue_script.strip(): return None, None, "Error: Script empty."
@@ -240,36 +213,60 @@ async def handle_script_processing(
          if not parsed_lines: return None, None, "Error: No valid lines."
      except ValueError as e: return None, None, f"Script error: {str(e)}"

-     unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
-
-     parsed_detailed_config_map = {}
-     if speaker_config_method == "Detailed Configuration (Table)":
-         try:
-             parsed_detailed_config_map = convert_df_to_config_map(speaker_details_df_value)
-         except Exception as e: return None, None, f"Error processing Speaker Table: {str(e)}"

      tasks, line_audio_files = [], [None] * len(parsed_lines)
      for i, line_data in enumerate(parsed_lines):
          speaker_name = line_data["speaker"]
-         speaker_base_cfg = get_config_for_speaker(
-             speaker_name, speaker_config_method, unique_speakers,
-             global_voice_selection, parsed_detailed_config_map
-         )
-         line_voice = speaker_base_cfg["voice"]

-         eff_speed = global_speed
-         if speaker_base_cfg.get("speed") is not None: eff_speed = speaker_base_cfg["speed"]

-         eff_instr = global_instructions if global_instructions and global_instructions.strip() else None
-         if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
-             eff_instr = speaker_base_cfg["instructions"]

          out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
          progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
          tasks.append(synthesize_speech_line(
              client=async_openai_client, text=line_data["text"], voice=line_voice,
-             output_path=out_fn, model=tts_model, speed=eff_speed,
-             instructions=eff_instr, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
              line_index=line_data['id']))

      results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -298,7 +295,9 @@ async def handle_script_processing(
              merged_path if merged_path and os.path.exists(merged_path) else None,
              status)

  def handle_calculate_cost(dialogue_script: str, tts_model: str):
      if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
      try:
          parsed, chars = parse_dialogue_script(dialogue_script)
@@ -307,11 +306,15 @@ def handle_calculate_cost(dialogue_script: str, tts_model: str):
          return f"Est. Cost: ${cost:.6f}"
      except Exception as e: return f"Cost calc error: {str(e)}"

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# Dialogue Script to Speech (Table Config)")
      if not OPENAI_API_KEY or not async_openai_client:
          gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")

      with gr.Row():
          with gr.Column(scale=2):
              script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
@@ -321,26 +324,25 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
              global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
              global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)

-             gr.Markdown("### Speaker Configuration")
-             with gr.Row():
-                 speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
-                 global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)

-             # Detailed Table Configuration UI
-             # Visibility of this whole column is controlled by the dropdown selection
-             with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")) as detailed_table_config_group:
-                 load_speakers_button = gr.Button("Load/Refresh Speakers into Table (from Script Above)")
-                 gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. "
-                             "Edit table below. Speed/Instructions column applicability depends on TTS model.</small>")
-                 speaker_details_df = gr.DataFrame(
-                     label="Per-Speaker Settings",
-                     headers=DATAFRAME_HEADERS, # Initial static headers
-                     value=pd.DataFrame([["(Load speakers first or type here)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
-                     interactive=True,
-                     row_count=(1, "dynamic"), # Allow dynamic rows, starting with 1
-                     col_count=(len(DATAFRAME_HEADERS), "fixed")
                  )
-
              with gr.Row():
                  calculate_cost_button = gr.Button("Calculate Cost")
                  generate_button = gr.Button("Generate Audio", variant="primary")
@@ -351,61 +353,74 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
          merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
          status_output = gr.Textbox(label="Status", interactive=False, lines=1)

-     def update_model_controls_visibility(selected_model):
          is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
          is_gpt_mini = selected_model == "gpt-4o-mini-tts"
-
-         speed_info = " (tts-1/hd)" if is_tts1 else " (N/A)"
-         instr_info = " (gpt-4o-mini)" if is_gpt_mini else " (N/A)"
-         new_df_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
-                            f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
-                            f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
          return {
              global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
              global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
-             speaker_details_df: gr.update(headers=new_df_headers) # Update DF headers dynamically
          }
      tts_model_dropdown.change(
          fn=update_model_controls_visibility,
-         inputs=[tts_model_dropdown],
-         outputs=[global_speed_input, global_instructions_input, speaker_details_df]
      )

      def update_speaker_config_method_visibility(method):
          is_single = (method == "Single Voice (Global)")
-         is_detailed_table = (method == "Detailed Configuration (Table)")
          return {
-             global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
-             detailed_table_config_group: gr.update(visible=is_detailed_table),
          }
      speaker_config_method_dropdown.change(
          fn=update_speaker_config_method_visibility,
          inputs=[speaker_config_method_dropdown],
-         outputs=[global_voice_dropdown, detailed_table_config_group]
      )

-     load_speakers_button.click(
-         fn=load_speakers_for_table,
-         inputs=[script_input, speaker_details_df, tts_model_dropdown],
-         outputs=[speaker_details_df]
      )

      calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
      generate_button.click(
          fn=handle_script_processing,
-         inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
-                 global_voice_dropdown, speaker_details_df,
-                 global_speed_input, global_instructions_input],
          outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])

      gr.Markdown("## Examples")
      gr.Examples(
          examples=[
-             ["[Alice] This table config is new!\n[Bob] Indeed it is.", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""],
-             ["[Narrator] Global voice example.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
          ],
-         inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
-                 speaker_details_df, global_speed_input, global_instructions_input],
          outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
          fn=handle_script_processing, cache_examples=False)
  import shutil
  import zipfile
  import random
+ import json # Keep for other potential uses, though not primary for this config
+ import pandas as pd # Keep for now, in case other features might use it
  from openai import AsyncOpenAI
+ from functools import partial # For dynamic event handlers

  from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
  from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES

      "Single Voice (Global)",
      "Random per Speaker",
      "A/B Round Robin",
+     "Detailed Configuration (Per Speaker UI)" # New Method Name
  ]
+ DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
  APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()

+ PREDEFINED_VIBES = {
+     "None": "", # No specific instruction
+     "Calm": "Speak in a calm, composed, and relaxed manner.",
+     "Excited": "Speak with an energetic, enthusiastic, and lively tone.",
+     "Happy": "Speak with a cheerful, bright, and joyful voice.",
+     "Sad": "Speak with a sorrowful, melancholic, and dejected tone.",
+     "Whisper": "Speak softly, as if whispering.",
+     "Angry": "Speak with a strong, firm, and possibly agitated voice.",
+     "Fearful": "Speak with a trembling, hesitant, and scared voice.",
+     "Formal": "Speak in a clear, precise, and professional tone, suitable for a formal address.",
+     "Authoritative": "Speak with a commanding, confident, and firm voice.",
+     "Friendly": "Speak in a warm, approachable, and amiable manner.",
+     "Custom...": "CUSTOM" # Special value indicating custom text should be used
+ }
+ VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
+ DEFAULT_VIBE = "None"

  def get_speakers_from_script(script_text):
+     if not script_text.strip(): return []
      try:
          parsed_lines, _ = parse_dialogue_script(script_text)
+         return sorted(list(set(p["speaker"] for p in parsed_lines)))
+     except ValueError: return []


+ def handle_dynamic_input_change(new_value, current_configs_state_dict, speaker_name, config_key, tts_model):
+     """
+     Updates the gr.State dictionary when a dynamic UI element changes.
+     current_configs_state_dict is the raw dictionary from gr.State.
+     """
+     if speaker_name not in current_configs_state_dict:
+         current_configs_state_dict[speaker_name] = {}

+     current_configs_state_dict[speaker_name][config_key] = new_value

+     # Special handling for Vibe -> Custom Instructions visibility (Simpler: custom textbox always visible)
+     # For this iteration, custom textbox is always visible. Backend decides to use it.

+     # Determine visibility/interactivity of speed slider for this specific speaker's UI (if we were to update it directly)
+     # This is complex to do from a generic handler. Better to set initial visibility in load_refresh_per_speaker_ui.
+     # Global tts_model_dropdown change will refresh the whole dynamic UI if needed for speed/instr applicability.

+     return current_configs_state_dict


+ def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_model):
+     """
+     Generates the dynamic UI components (accordions) for each speaker.
+     Returns a list of Gradio components and the updated state.
+     """
+     unique_speakers = get_speakers_from_script(script_text)
+     new_ui_components = []

+     # Ensure state dict is not None (Gradio might pass None initially for gr.State)
+     if current_configs_state_dict is None:
+         current_configs_state_dict = {}
+
+     # Update state for any new speakers or remove speakers no longer in script
+     # (Optional: more complex logic could be to remove speakers from state if not in script)
+     # For now, just add new ones with defaults if not present.
+     for speaker_name in unique_speakers:
+         if speaker_name not in current_configs_state_dict:
+             current_configs_state_dict[speaker_name] = {
+                 "voice": APP_AVAILABLE_VOICES[0],
+                 "speed": 1.0,
+                 "vibe": DEFAULT_VIBE,
+                 "custom_instructions": ""
              }
+         # Ensure all keys exist for existing speakers (e.g., if new fields added)
+         current_configs_state_dict[speaker_name].setdefault("voice", APP_AVAILABLE_VOICES[0])
+         current_configs_state_dict[speaker_name].setdefault("speed", 1.0)
+         current_configs_state_dict[speaker_name].setdefault("vibe", DEFAULT_VIBE)
+         current_configs_state_dict[speaker_name].setdefault("custom_instructions", "")
+
+
+     if not unique_speakers:
+         new_ui_components.append(gr.Markdown("No speakers detected in the script, or script is empty. Type a script and click 'Load/Refresh' again."))
+         # Return current (possibly empty) state and the markdown message
+         return new_ui_components, current_configs_state_dict
+
+
+     for speaker_name in unique_speakers:
+         speaker_cfg = current_configs_state_dict[speaker_name] # Should exist now
+
+         # Determine if speed/instructions are applicable for the current global TTS model
+         speed_interactive = tts_model in ["tts-1", "tts-1-hd"]
+         instructions_relevant = tts_model == "gpt-4o-mini-tts" # Vibe/Custom is primarily for this
+
+         with gr.Accordion(label=f"Settings for: {speaker_name}", open=False) as speaker_accordion:
+             # Voice Dropdown
+             voice_dd = gr.Dropdown(
+                 label="Voice", choices=APP_AVAILABLE_VOICES, value=speaker_cfg["voice"], interactive=True
+             )
+             voice_dd.change(
+                 fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="voice", tts_model=tts_model),
+                 inputs=[voice_dd, speaker_configs_state], # Pass the component itself and the state
+                 outputs=[speaker_configs_state]
+             )
+
+             # Speed Slider
+             speed_slider_label = "Speech Speed" + (" (Active for tts-1/hd)" if speed_interactive else " (N/A for this model)")
+             speed_slider = gr.Slider(
+                 label=speed_slider_label, minimum=0.25, maximum=4.0, value=speaker_cfg["speed"],
+                 step=0.05, interactive=speed_interactive
+             )
+             if speed_interactive: # Only attach listener if interactive
+                 speed_slider.release( # Use release to avoid too many updates during drag
+                     fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="speed", tts_model=tts_model),
+                     inputs=[speed_slider, speaker_configs_state],
+                     outputs=[speaker_configs_state]
+                 )
+
+             # Vibe Dropdown
+             vibe_label = "Vibe/Emotion Preset" + (" (For gpt-4o-mini-tts)" if instructions_relevant else " (Less impact on other models)")
+             vibe_dd = gr.Dropdown(
+                 label=vibe_label, choices=VIBE_CHOICES, value=speaker_cfg["vibe"], interactive=True
+             )
+             vibe_dd.change(
+                 fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="vibe", tts_model=tts_model),
+                 inputs=[vibe_dd, speaker_configs_state],
+                 outputs=[speaker_configs_state]
+             )
+
+             # Custom Instructions Textbox
+             custom_instr_label = "Custom Instructions"
+             custom_instr_placeholder = "Only used if Vibe is 'Custom...'. Overrides Vibe."
+             custom_instr_tb = gr.Textbox(
+                 label=custom_instr_label,
+                 value=speaker_cfg["custom_instructions"],
+                 placeholder=custom_instr_placeholder,
+                 lines=2, interactive=True # Always interactive, backend logic decides if used
+             )
+             custom_instr_tb.input( # Use input for real-time typing updates
+                 fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="custom_instructions", tts_model=tts_model),
+                 inputs=[custom_instr_tb, speaker_configs_state],
+                 outputs=[speaker_configs_state]
+             )
+         new_ui_components.append(speaker_accordion)
+
+     return new_ui_components, current_configs_state_dict
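A note on the wiring above: every per-speaker widget funnels into the single handle_dynamic_input_change handler, with functools.partial freezing the speaker name and config key so that Gradio only needs to supply the changed value and the gr.State dictionary. A minimal sketch of that binding, using a simplified stand-in for the handler (illustrative names, not code from app.py):

from functools import partial

# Simplified stand-in for handle_dynamic_input_change: Gradio calls the bound
# callable with (new_value, state_dict); the remaining arguments are frozen here.
def _set_speaker_key(new_value, configs, speaker_name, config_key):
    configs.setdefault(speaker_name, {})[config_key] = new_value
    return configs

on_alice_voice_change = partial(_set_speaker_key, speaker_name="Alice", config_key="voice")

state = {}
state = on_alice_voice_change("nova", state)  # simulates a voice Dropdown .change event
print(state)  # {'Alice': {'voice': 'nova'}}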


  async def handle_script_processing(
          dialogue_script: str, tts_model: str, pause_ms: int,
          speaker_config_method: str, global_voice_selection: str,
+         # No more df_value, instead we use speaker_configs_state_dict from gr.State
+         speaker_configs_state_dict: dict,
          global_speed: float,
          global_instructions: str, progress=gr.Progress(track_tqdm=True)):

      if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
      if not dialogue_script.strip(): return None, None, "Error: Script empty."

          if not parsed_lines: return None, None, "Error: No valid lines."
      except ValueError as e: return None, None, f"Script error: {str(e)}"

+     # Ensure state dict is usable
+     if speaker_configs_state_dict is None: speaker_configs_state_dict = {}

      tasks, line_audio_files = [], [None] * len(parsed_lines)
      for i, line_data in enumerate(parsed_lines):
          speaker_name = line_data["speaker"]

+         # Determine voice, speed, instructions for this line
+         line_voice = global_voice_selection
+         line_speed = global_speed
+         line_instructions = global_instructions if global_instructions and global_instructions.strip() else None
+
+         if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
+             spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
+             line_voice = spk_cfg.get("voice", global_voice_selection) # Fallback to global if needed
+
+             # Speed: per-speaker if tts-1/hd and set, else global if tts-1/hd, else API default
+             if tts_model in ["tts-1", "tts-1-hd"]:
+                 line_speed = spk_cfg.get("speed", global_speed)
+
+             # Instructions: primarily for gpt-4o-mini-tts
+             if tts_model == "gpt-4o-mini-tts":
+                 vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
+                 custom_instr = spk_cfg.get("custom_instructions", "").strip()
+                 if vibe == "Custom..." and custom_instr:
+                     line_instructions = custom_instr
+                 elif vibe != "None" and vibe != "Custom...":
+                     line_instructions = PREDEFINED_VIBES.get(vibe, "")
+                 # If vibe is None or Custom with no text, line_instructions might remain global or become ""
+                 if not line_instructions and global_instructions and global_instructions.strip(): # Fallback to global if specific instructions are empty
+                     line_instructions = global_instructions
+                 elif not line_instructions : # Ensure it's None if truly no instruction
+                     line_instructions = None
+
+
+         elif speaker_config_method == "Random per Speaker":
+             # Simplified: assign random now, could be cached as before for consistency within run
+             line_voice = random.choice(APP_AVAILABLE_VOICES)
+         elif speaker_config_method == "A/B Round Robin":
+             # Simplified: assign A/B now
+             unique_script_speakers = get_speakers_from_script(dialogue_script) # Re-get for this logic
+             speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
+             line_voice = APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]

+         # Fallback for speed if not tts-1/hd (API won't use it anyway)
+         if tts_model not in ["tts-1", "tts-1-hd"]:
+             line_speed = 1.0 # API default, won't be sent

          out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
          progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
          tasks.append(synthesize_speech_line(
              client=async_openai_client, text=line_data["text"], voice=line_voice,
+             output_path=out_fn, model=tts_model, speed=line_speed,
+             instructions=line_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
              line_index=line_data['id']))

      results = await asyncio.gather(*tasks, return_exceptions=True)
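The per-line instruction selection above follows a fixed precedence: explicit custom text (Vibe set to "Custom..." with a non-empty textbox) wins, then a named vibe preset, then the global instructions, otherwise None. A compact restatement of that order as a pure function, assuming the PREDEFINED_VIBES mapping introduced in this commit (the helper name and signature are illustrative only):

def resolve_instructions(vibe, custom_instructions, global_instructions, predefined_vibes):
    # Fall back to stripped global instructions, or None when empty.
    fallback = (global_instructions or "").strip() or None
    if vibe == "Custom..." and custom_instructions.strip():
        return custom_instructions.strip()                  # explicit custom text wins
    if vibe not in ("None", "Custom..."):
        return predefined_vibes.get(vibe, "") or fallback   # named preset, else global
    return fallback                                         # "None" or empty custom

vibes = {"Calm": "Speak in a calm, composed, and relaxed manner."}
print(resolve_instructions("Calm", "", "", vibes))                 # preset text
print(resolve_instructions("Custom...", "Very fast.", "", vibes))  # 'Very fast.'
print(resolve_instructions("None", "", "Speak warmly.", vibes))    # 'Speak warmly.'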
 
              merged_path if merged_path and os.path.exists(merged_path) else None,
              status)

+
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
+     # ... (same as before) ...
      if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
      try:
          parsed, chars = parse_dialogue_script(dialogue_script)

          return f"Est. Cost: ${cost:.6f}"
      except Exception as e: return f"Cost calc error: {str(e)}"

+
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# Dialogue Script to Speech (Dynamic Per-Speaker UI)")
      if not OPENAI_API_KEY or not async_openai_client:
          gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")

+     # State to hold detailed speaker configurations
+     speaker_configs_state = gr.State({})
+
      with gr.Row():
          with gr.Column(scale=2):
              script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)

              global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
              global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)

+             gr.Markdown("### Speaker Configuration Method")
+             speaker_config_method_dropdown = gr.Dropdown(
+                 SPEAKER_CONFIG_METHODS, label="Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
+             )

+             # UI for "Single Voice (Global)"
+             with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
+                 global_voice_dropdown = gr.Dropdown(
+                     APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], interactive=True
                  )
+
+             # UI for "Detailed Configuration (Per Speaker UI)"
+             with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)")) as detailed_per_speaker_ui_group:
+                 load_per_speaker_ui_button = gr.Button("Load/Refresh Per-Speaker Settings UI (from Script Above)")
+                 gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are per-speaker.</small>")
+                 # This column will be populated by the output of load_per_speaker_ui_button
+                 dynamic_speaker_ui_area = gr.Column(elem_id="dynamic_ui_area_for_speakers")
+
+
              with gr.Row():
                  calculate_cost_button = gr.Button("Calculate Cost")
                  generate_button = gr.Button("Generate Audio", variant="primary")

          merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
          status_output = gr.Textbox(label="Status", interactive=False, lines=1)

+     # --- Event Handlers ---
+     def update_model_controls_visibility(selected_model, script_text_for_refresh, current_speaker_configs_for_refresh):
+         # When model changes, also refresh the dynamic UI because speed/instr applicability changes
+         # This means load_refresh_per_speaker_ui will be called.
+         new_dynamic_ui, updated_state = load_refresh_per_speaker_ui(script_text_for_refresh, current_speaker_configs_for_refresh, selected_model)
+
          is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
          is_gpt_mini = selected_model == "gpt-4o-mini-tts"
          return {
              global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
              global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
+             dynamic_speaker_ui_area: new_dynamic_ui, # Return the actual list of components
+             speaker_configs_state: updated_state
          }
      tts_model_dropdown.change(
          fn=update_model_controls_visibility,
+         inputs=[tts_model_dropdown, script_input, speaker_configs_state],
+         outputs=[global_speed_input, global_instructions_input, dynamic_speaker_ui_area, speaker_configs_state]
      )

      def update_speaker_config_method_visibility(method):
          is_single = (method == "Single Voice (Global)")
+         is_detailed_per_speaker = (method == "Detailed Configuration (Per Speaker UI)")
+         # Add more if other methods exist...
          return {
+             single_voice_group: gr.update(visible=is_single),
+             detailed_per_speaker_ui_group: gr.update(visible=is_detailed_per_speaker),
          }
      speaker_config_method_dropdown.change(
          fn=update_speaker_config_method_visibility,
          inputs=[speaker_config_method_dropdown],
+         outputs=[single_voice_group, detailed_per_speaker_ui_group]
      )

+     load_per_speaker_ui_button.click(
+         fn=load_refresh_per_speaker_ui,
+         inputs=[script_input, speaker_configs_state, tts_model_dropdown],
+         # Output the list of components to the column, and the updated state to the state component
+         outputs=[dynamic_speaker_ui_area, speaker_configs_state]
      )

      calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
+
+     # Generate button now takes speaker_configs_state as input
      generate_button.click(
          fn=handle_script_processing,
+         inputs=[
+             script_input, tts_model_dropdown, pause_input,
+             speaker_config_method_dropdown, global_voice_dropdown,
+             speaker_configs_state, # Pass the state object
+             global_speed_input, global_instructions_input
+         ],
          outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])

      gr.Markdown("## Examples")
      gr.Examples(
          examples=[
+             ["[Alice] Hello from Alice!\n[Bob] Bob here, testing the dynamic UI.", "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", APP_AVAILABLE_VOICES[0], {}, 1.0, ""],
+             ["[Narrator] Just one line, using global.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
+         ],
+         # Note: speaker_configs_state is passed as an empty dict {} for examples.
+         # The user would click "Load/Refresh Per-Speaker UI" after an example loads.
+         inputs=[
+             script_input, tts_model_dropdown, pause_input,
+             speaker_config_method_dropdown, global_voice_dropdown,
+             speaker_configs_state,
+             global_speed_input, global_instructions_input
          ],
          outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
          fn=handle_script_processing, cache_examples=False)
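For reference, the gr.State plus functools.partial pattern the new per-speaker UI relies on can be exercised in isolation. The snippet below is a self-contained toy demo, not part of this commit; the component names and the set_key helper are hypothetical. A bound handler receives the changed value and the state dictionary, writes into it, and returns the updated dictionary so Gradio stores it back into gr.State:

import gradio as gr
from functools import partial

def set_key(new_value, configs, key):
    configs = dict(configs or {})   # copy so each event returns a fresh dict
    configs[key] = new_value
    return configs, configs         # update both the State and the JSON preview

with gr.Blocks() as demo:
    configs_state = gr.State({})
    voice = gr.Dropdown(["alloy", "nova", "fable"], label="Voice")
    preview = gr.JSON(label="Stored config")
    voice.change(
        fn=partial(set_key, key="voice"),
        inputs=[voice, configs_state],
        outputs=[configs_state, preview],
    )

if __name__ == "__main__":
    demo.launch()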