abocha committed on
Commit 59d4d10 · 1 Parent(s): a2f0e99
Files changed (1)
  1. app.py +160 -184
app.py CHANGED
@@ -45,14 +45,15 @@ SPEAKER_CONFIG_METHODS = [
  "Single Voice (Global)",
  "Random per Speaker",
  "A/B Round Robin",
- "Detailed Configuration (Table)" # Changed from JSON
+ "Detailed Configuration (Table)" # Using Table method
  ]
- DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker" # Or "Detailed Configuration (Table)" if preferred
+ DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
  APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
- _speaker_config_cache = {} # For Random/AB
+ _speaker_config_cache = {}

  # --- Speaker Configuration Table Logic ---
  DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
+ # Column indices for clarity (optional, but can help maintainability)
  DATAFRAME_COL_SPEAKER = 0
  DATAFRAME_COL_VOICE = 1
  DATAFRAME_COL_SPEED = 2
@@ -62,29 +63,45 @@ def get_speakers_from_script(script_text):
  if not script_text.strip():
  return []
  try:
- # Use a simplified parsing just for speaker names if full parsing is too heavy here
- # For now, leverage existing robust parser.
  parsed_lines, _ = parse_dialogue_script(script_text)
  unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
  return unique_speakers
- except ValueError: # Catch parsing errors like script too long
- return [] # Or raise/notify
+ except ValueError:
+ return []

- def convert_df_to_config_map(df_data):
- """Converts DataFrame data (list of lists) to the internal config map."""
+ def convert_df_to_config_map(df_data_obj):
  config_map = {}
- if not df_data or not df_data['data']: # df_data can be {'data': [], 'headers': []}
- return config_map
-
- for row in df_data['data']:
- speaker_name = row[DATAFRAME_COL_SPEAKER]
- voice = row[DATAFRAME_COL_VOICE]
+ # gr.DataFrame's value is a dict like {'data': [[row1_val1, row1_val2], ...], 'headers': [...]}
+ # or if constructed with pandas, it might be a pandas DataFrame directly.
+ # Let's handle if it's the dict structure.
+
+ # Ensure df_data_obj is not None and has 'data'
+ if not df_data_obj or 'data' not in df_data_obj or not isinstance(df_data_obj['data'], list):
+ if isinstance(df_data_obj, pd.DataFrame): # If it's already a pandas DF
+ data_list = df_data_obj.values.tolist()
+ else: # Unrecognized format or empty
+ print("DataFrame data is empty or in an unrecognized format for conversion.")
+ return config_map
+ else:
+ data_list = df_data_obj['data']
+
+ for row in data_list:
+ # Defensive check for row length, assuming DATAFRAME_HEADERS length
+ if len(row) < len(DATAFRAME_HEADERS):
+ print(f"Skipping malformed row in DataFrame data during conversion: {row}")
+ continue
+
+ speaker_name = str(row[DATAFRAME_COL_SPEAKER]).strip()
+ voice = str(row[DATAFRAME_COL_VOICE]).strip()
  speed_str = str(row[DATAFRAME_COL_SPEED]).strip()
  instructions = str(row[DATAFRAME_COL_INSTRUCTIONS]).strip()

- # Validate voice
+ if not speaker_name or speaker_name == "(Load speakers first)" or speaker_name == "(No speakers detected)":
+ continue # Skip placeholder/empty speaker names
+
  if voice not in APP_AVAILABLE_VOICES:
- print(f"Warning: Invalid voice '{voice}' for speaker '{speaker_name}' in table. Using default '{APP_AVAILABLE_VOICES[0]}'.")
+ print(f"Warning: Invalid voice '{voice}' for '{speaker_name}'. Using '{APP_AVAILABLE_VOICES[0]}'.")
  voice = APP_AVAILABLE_VOICES[0]

  cfg = {"voice": voice}
@@ -92,85 +109,66 @@ def convert_df_to_config_map(df_data):
  if speed_str:
  try:
  speed_val = float(speed_str)
- if 0.25 <= speed_val <= 4.0:
- cfg["speed"] = speed_val
- else:
- print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range (0.25-4.0). Ignoring.")
- except ValueError:
- print(f"Warning: Invalid speed value '{speed_str}' for '{speaker_name}'. Ignoring.")
+ if 0.25 <= speed_val <= 4.0: cfg["speed"] = speed_val
+ else: print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range. Ignoring.")
+ except ValueError: print(f"Warning: Invalid speed '{speed_str}' for '{speaker_name}'. Ignoring.")

- if instructions:
- cfg["instructions"] = instructions
+ if instructions: cfg["instructions"] = instructions

  config_map[speaker_name] = cfg
  return config_map

- def load_speakers_for_table(script_text, current_df_data_obj, selected_tts_model):
- """
- Populates a DataFrame structure based on speakers in the script.
- Tries to preserve existing settings from current_df_data_obj.
- current_df_data_obj is the raw object from gr.DataFrame, like {'data': [...], 'headers': ...}
- """
+ def load_speakers_for_table(script_text, current_df_obj, selected_tts_model):
  current_config_map = {}
- if current_df_data_obj and current_df_data_obj.get('data'):
- # Convert existing DataFrame to a temporary map to easily fetch settings
- for row in current_df_data_obj['data']:
- # Ensure row has enough elements to prevent IndexError
+ if isinstance(current_df_obj, pd.DataFrame) and not current_df_obj.empty:
+ for _, row_series in current_df_obj.iterrows():
+ row = row_series.tolist() # Convert Series to list
+ if len(row) >= len(DATAFRAME_HEADERS):
+ speaker = str(row[DATAFRAME_COL_SPEAKER])
+ current_config_map[speaker] = {
+ "voice": str(row[DATAFRAME_COL_VOICE]),
+ "speed": str(row[DATAFRAME_COL_SPEED]),
+ "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
+ }
+ elif isinstance(current_df_obj, dict) and current_df_obj.get('data'): # Handle raw dict from Gradio
+ for row in current_df_obj['data']:
  if len(row) >= len(DATAFRAME_HEADERS):
- speaker = row[DATAFRAME_COL_SPEAKER]
+ speaker = str(row[DATAFRAME_COL_SPEAKER])
  current_config_map[speaker] = {
- "voice": row[DATAFRAME_COL_VOICE],
+ "voice": str(row[DATAFRAME_COL_VOICE]),
  "speed": str(row[DATAFRAME_COL_SPEED]),
  "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
  }
- else:
- print(f"Skipping malformed row in current DataFrame data: {row}")


  unique_speakers = get_speakers_from_script(script_text)
  new_df_rows = []
-
- default_voice = APP_AVAILABLE_VOICES[0]
- default_speed_str = "1.0"
- default_instructions_str = ""
+ default_voice, default_speed, default_instr = APP_AVAILABLE_VOICES[0], "1.0", ""

  for speaker in unique_speakers:
- existing_settings = current_config_map.get(speaker, {})
+ s_cfg = current_config_map.get(speaker, {})
  new_df_rows.append([
  speaker,
- existing_settings.get("voice", default_voice),
- existing_settings.get("speed", default_speed_str),
- existing_settings.get("instructions", default_instructions_str)
+ s_cfg.get("voice", default_voice),
+ s_cfg.get("speed", default_speed),
+ s_cfg.get("instructions", default_instr)
  ])

- # Provide info about which columns are relevant
- speed_info = " (for tts-1/tts-1-hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A for this model)"
- instr_info = " (for gpt-4o-mini-tts)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A for this model)"
+ speed_info = " (tts-1/hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A)"
+ instr_info = " (gpt-4o-mini)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A)"

- headers_with_model_info = [
- DATAFRAME_HEADERS[0],
- DATAFRAME_HEADERS[1],
- f"{DATAFRAME_HEADERS[2]}{speed_info}",
- f"{DATAFRAME_HEADERS[3]}{instr_info}",
- ]
+ dynamic_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
+ f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
+ f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]

- # If no speakers, provide a dummy row to keep the DataFrame component from erroring/disappearing if it needs data
- if not new_df_rows:
- new_df_rows.append(["(No speakers detected)", default_voice, default_speed_str, default_instructions_str])
-
- # Return a new DataFrame object compatible with gr.DataFrame.update
- # This structure for gr.DataFrame update might be df_obj or pd.DataFrame(new_df_rows, columns=...)
- # Let's try returning a pandas DataFrame
- return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=headers_with_model_info)
+ if not new_df_rows:
+ new_df_rows.append(["(No speakers detected in script)", default_voice, default_speed, default_instr])
+
+ return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=dynamic_headers)


- # --- Existing parsing and config logic (slightly adapted if needed) ---
- def parse_detailed_speaker_config(json_text, parsed_script_lines): # Kept for reference or other uses, but not for Table UI
- # ... (implementation from previous version for JSON parsing)
- pass # For now, this function is not directly used by the Table UI.

  def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
- global_selected_voice, detailed_config_map_from_table_or_json):
+ global_selected_voice, detailed_config_map_from_table): # Renamed last param
  global _speaker_config_cache
  if _speaker_config_cache.get("__method") != speaker_config_method or \
  _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
@@ -182,23 +180,21 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
  base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
  return base_config

- # This part now uses the map derived from either JSON (if we kept that option) or Table
- if speaker_config_method == "Detailed Configuration (Table)": # or "Detailed Configuration (JSON)"
- if speaker_name in detailed_config_map_from_table_or_json:
- speaker_specific = detailed_config_map_from_table_or_json[speaker_name]
- return { # Ensure all keys exist, falling back to None if not in map
+ if speaker_config_method == "Detailed Configuration (Table)":
+ if speaker_name in detailed_config_map_from_table:
+ speaker_specific = detailed_config_map_from_table[speaker_name]
+ return {
  "voice": speaker_specific.get("voice", base_config["voice"]),
  "speed": speaker_specific.get("speed"),
  "instructions": speaker_specific.get("instructions")
  }
  else:
- print(f"Warning: Speaker '{speaker_name}' not found in Detailed Config. Using default voice '{base_config['voice']}'.")
+ print(f"Warning: Speaker '{speaker_name}' not in Table Config. Using default.")
  return base_config

- # For "Random" and "A/B"
+ # For "Random" and "A/B" (same as before)
  if speaker_name not in _speaker_config_cache:
  if speaker_config_method == "Random per Speaker":
- # ... (same logic as before) ...
  available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
  if not _speaker_config_cache.get("__all_assigned_random"):
  for i, spk_unique in enumerate(unique_script_speakers):
@@ -209,7 +205,6 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
  _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}

  elif speaker_config_method == "A/B Round Robin":
- # ... (same logic as before) ...
  if not _speaker_config_cache.get("__all_assigned_ab"):
  for i, spk_unique in enumerate(unique_script_speakers):
  if spk_unique not in _speaker_config_cache:
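
The original A/B assignment logic is elided above ("same logic as before"); as a rough, assumed illustration only, a round-robin assignment over two voices can look like this (the voice pair is hypothetical):

VOICES = ["alloy", "echo"]  # hypothetical A/B pair

def assign_ab(speakers):
    # Speaker i gets voice i % 2, so speakers alternate between the two voices.
    return {spk: VOICES[i % len(VOICES)] for i, spk in enumerate(sorted(speakers))}

print(assign_ab(["Bob", "Alice", "Carol"]))
# {'Alice': 'alloy', 'Bob': 'echo', 'Carol': 'alloy'}
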
@@ -228,140 +223,123 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
  async def handle_script_processing(
  dialogue_script: str, tts_model: str, pause_ms: int,
  speaker_config_method: str, global_voice_selection: str,
- speaker_details_df_value: dict, # This will be the dict from gr.DataFrame value
+ speaker_details_df_value: pd.DataFrame | dict, # Can be pd.DataFrame or dict from Gradio
  global_speed: float,
  global_instructions: str, progress=gr.Progress(track_tqdm=True)):
- global _speaker_config_cache
- _speaker_config_cache = {}
+ global _speaker_config_cache; _speaker_config_cache = {}

- if not OPENAI_API_KEY or not async_openai_client:
- return None, None, "Error: OPENAI_API_KEY is not configured."
- if not dialogue_script.strip():
- return None, None, "Error: Script is empty."
+ if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
+ if not dialogue_script.strip(): return None, None, "Error: Script empty."

  job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
  if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
  os.makedirs(job_audio_path_prefix, exist_ok=True)

  try:
- parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
- if not parsed_lines: return None, None, "Error: No valid dialogue lines found."
- except ValueError as e: return None, None, f"Script parsing error: {str(e)}"
+ parsed_lines, _ = parse_dialogue_script(dialogue_script)
+ if not parsed_lines: return None, None, "Error: No valid lines."
+ except ValueError as e: return None, None, f"Script error: {str(e)}"

  unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))

- # This is where we get the detailed config map
  parsed_detailed_config_map = {}
  if speaker_config_method == "Detailed Configuration (Table)":
  try:
  parsed_detailed_config_map = convert_df_to_config_map(speaker_details_df_value)
- except Exception as e: # Catch any error during DataFrame conversion
- return None, None, f"Error processing Speaker Table: {str(e)}"
- # (If you had a JSON option, you'd parse JSON here too)
+ except Exception as e: return None, None, f"Error processing Speaker Table: {str(e)}"

  tasks, line_audio_files = [], [None] * len(parsed_lines)
  for i, line_data in enumerate(parsed_lines):
  speaker_name = line_data["speaker"]
  speaker_base_cfg = get_config_for_speaker(
  speaker_name, speaker_config_method, unique_speakers,
- global_voice_selection, parsed_detailed_config_map # Pass the map
+ global_voice_selection, parsed_detailed_config_map
  )
  line_voice = speaker_base_cfg["voice"]

- effective_speed = global_speed
- if speaker_base_cfg.get("speed") is not None: effective_speed = speaker_base_cfg["speed"]
+ eff_speed = global_speed
+ if speaker_base_cfg.get("speed") is not None: eff_speed = speaker_base_cfg["speed"]

- effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
+ eff_instr = global_instructions if global_instructions and global_instructions.strip() else None
  if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
- effective_instructions = speaker_base_cfg["instructions"]
+ eff_instr = speaker_base_cfg["instructions"]

- output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
- progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
+ out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
+ progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
  tasks.append(synthesize_speech_line(
  client=async_openai_client, text=line_data["text"], voice=line_voice,
- output_path=output_filename, model=tts_model, speed=effective_speed,
- instructions=effective_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
+ output_path=out_fn, model=tts_model, speed=eff_speed,
+ instructions=eff_instr, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
  line_index=line_data['id']))

- synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
- for idx, result in enumerate(synthesis_results):
- if isinstance(result, Exception): print(f"Error for line {parsed_lines[idx]['id']}: {result}")
- elif result is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
- else: line_audio_files[idx] = result
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+ for idx, res in enumerate(results):
+ if isinstance(res, Exception): print(f"Error line {parsed_lines[idx]['id']}: {res}")
+ elif res is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
+ else: line_audio_files[idx] = res

- valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
- if not valid_audio_files:
- shutil.rmtree(job_audio_path_prefix)
- return None, None, "Error: No audio files successfully synthesized."
+ valid_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
+ if not valid_files:
+ shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio synthesized."

- zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
- with zipfile.ZipFile(zip_filename, 'w') as zf:
- for pth in valid_audio_files: zf.write(pth, os.path.basename(pth))
+ zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
+ with zipfile.ZipFile(zip_fn, 'w') as zf: [zf.write(p, os.path.basename(p)) for p in valid_files]

- merged_mp3_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
- merged_out_path = merge_mp3_files([f for f in line_audio_files if f], merged_mp3_fn, pause_ms)
-
- status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
- if len(valid_audio_files) < len(parsed_lines): status += "Some lines failed/skipped. "
- if not merged_out_path and len(valid_audio_files) > 0: status += "Merged audio failed. "
- elif not merged_out_path and len(valid_audio_files) == 0: status += "No audio generated."
- else: status += "Outputs generated."
+ merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
+ merged_path = merge_mp3_files([f for f in line_audio_files if f], merged_fn, pause_ms)
+
+ status = f"{len(valid_files)}/{len(parsed_lines)} lines. "
+ if len(valid_files) < len(parsed_lines): status += "Some failed. "
+ if not merged_path and len(valid_files) > 0: status += "Merge failed. "
+ elif not merged_path: status += "No audio."
+ else: status += "Generated."

- return (zip_filename if os.path.exists(zip_filename) else None,
- merged_out_path if merged_out_path and os.path.exists(merged_out_path) else None,
+ return (zip_fn if os.path.exists(zip_fn) else None,
+ merged_path if merged_path and os.path.exists(merged_path) else None,
  status)

  def handle_calculate_cost(dialogue_script: str, tts_model: str):
- # ... (same as before) ...
- if not dialogue_script.strip(): return "Cost: $0.000000 (Empty script)"
+ if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
  try:
  parsed, chars = parse_dialogue_script(dialogue_script)
- if not parsed: return "Cost: $0.000000 (No valid lines)"
+ if not parsed: return "Cost: $0.00 (No lines)"
  cost = calculate_cost(chars, len(parsed), tts_model)
- return f"Est. Cost: ${cost:.6f} ({chars} chars, {len(parsed)} lines)"
- except ValueError as e: return f"Error: {str(e)}"
+ return f"Est. Cost: ${cost:.6f}"
  except Exception as e: return f"Cost calc error: {str(e)}"

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
- gr.Markdown("# Dialogue Script to Speech Converter (Enhanced Table Config)")
- # ... (intro markdown same) ...
+ gr.Markdown("# Dialogue Script to Speech (Table Config)")
  if not OPENAI_API_KEY or not async_openai_client:
- gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set. Synthesis will fail.</h3>")
+ gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")

  with gr.Row():
  with gr.Column(scale=2):
- script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10, info=f"Max {MAX_SCRIPT_LENGTH} chars.")
+ script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
  with gr.Column(scale=1):
- tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT, info="Affects controls below.")
- pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50, info="Between merged lines.")
- global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (tts-1/hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
- global_instructions_input = gr.Textbox(label="Global Instructions (gpt-4o-mini)", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
+ tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT)
+ pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50)
+ global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
+ global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)

  gr.Markdown("### Speaker Configuration")
  with gr.Row():
  speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
- global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice ('Single Voice')", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)
-
- # --- New UI for Detailed Table Configuration ---
- current_visibility_detailed_table = (DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")
+ global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)

- with gr.Column(visible=current_visibility_detailed_table) as detailed_table_config_group:
- load_speakers_button = gr.Button("Load/Refresh Speakers for Table from Script Above")
- gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. Type a voice name into the 'Voice' column. "
- "Speed and Instructions applicability depends on the globally selected TTS model.</small>")
+ # Detailed Table Configuration UI
+ # Visibility of this whole column is controlled by the dropdown selection
+ with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")) as detailed_table_config_group:
+ load_speakers_button = gr.Button("Load/Refresh Speakers into Table (from Script Above)")
+ gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. "
+ "Edit table below. Speed/Instructions column applicability depends on TTS model.</small>")
  speaker_details_df = gr.DataFrame(
- label="Per-Speaker Settings (Editable Table)",
- headers=DATAFRAME_HEADERS,
- # Initialize with a placeholder row or empty data
- value=pd.DataFrame([["(Load speakers first)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
+ label="Per-Speaker Settings",
+ headers=DATAFRAME_HEADERS, # Initial static headers
+ value=pd.DataFrame([["(Load speakers first or type here)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
  interactive=True,
- # `max_rows=20` or some sensible limit might be good for UI
- # `datatype` can be used to specify types per column, e.g., ["str", "str", "number", "str"]
- # For now, all will be string input in table, conversion happens in backend.
- # Consider making "Speaker Name" column read-only if possible with Gradio versions
+ row_count=(1, "dynamic"), # Allow dynamic rows, starting with 1
+ col_count=(len(DATAFRAME_HEADERS), "fixed")
  )
-
- # --- End New UI ---

  with gr.Row():
  calculate_cost_button = gr.Button("Calculate Cost")
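
A minimal, self-contained sketch of the asyncio.gather(..., return_exceptions=True) pattern the handler above relies on, so that one failed line does not abort the whole batch (the synthesis coroutine here is a stand-in, not the app's synthesize_speech_line):

import asyncio

async def fake_synthesize(line_id: int) -> str:
    # Stand-in for a per-line TTS call; raises for one line to show error handling.
    if line_id == 2:
        raise RuntimeError("TTS request failed")
    await asyncio.sleep(0)
    return f"line_{line_id}.mp3"

async def main():
    tasks = [fake_synthesize(i) for i in range(4)]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for i, res in enumerate(results):
        if isinstance(res, Exception):
            print(f"line {i} failed: {res}")  # failures are collected, not raised
        else:
            print(f"line {i} -> {res}")

asyncio.run(main())
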
@@ -371,38 +349,44 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
  with gr.Row():
  individual_lines_zip_output = gr.File(label="Download ZIP")
  merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
- status_output = gr.Textbox(label="Status", interactive=False, lines=2)
+ status_output = gr.Textbox(label="Status", interactive=False, lines=1)

- # --- Event Handlers ---
- def update_model_controls(selected_model):
- # ... (same as before) ...
+ def update_model_controls_visibility(selected_model):
  is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
  is_gpt_mini = selected_model == "gpt-4o-mini-tts"
- # Also update headers for DataFrame when model changes
- speed_info = " (for tts-1/tts-1-hd)" if is_tts1 else " (N/A for this model)"
- instr_info = " (for gpt-4o-mini-tts)" if is_gpt_mini else " (N/A for this model)"
- new_headers = [DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1], f"{DATAFRAME_HEADERS[2]}{speed_info}", f"{DATAFRAME_HEADERS[3]}{instr_info}"]
-
+
+ speed_info = " (tts-1/hd)" if is_tts1 else " (N/A)"
+ instr_info = " (gpt-4o-mini)" if is_gpt_mini else " (N/A)"
+ new_df_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
+ f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
+ f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
  return {
  global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
  global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
- speaker_details_df: gr.update(headers=new_headers) # Update headers dynamically
+ speaker_details_df: gr.update(headers=new_df_headers) # Update DF headers dynamically
  }
- tts_model_dropdown.change(fn=update_model_controls, inputs=[tts_model_dropdown], outputs=[global_speed_input, global_instructions_input, speaker_details_df])
+ tts_model_dropdown.change(
+ fn=update_model_controls_visibility,
+ inputs=[tts_model_dropdown],
+ outputs=[global_speed_input, global_instructions_input, speaker_details_df]
+ )

- def update_speaker_controls(method):
+ def update_speaker_config_method_visibility(method):
  is_single = (method == "Single Voice (Global)")
  is_detailed_table = (method == "Detailed Configuration (Table)")
  return {
  global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
- detailed_table_config_group: gr.update(visible=is_detailed_table), # Show/hide the whole group
+ detailed_table_config_group: gr.update(visible=is_detailed_table),
  }
- speaker_config_method_dropdown.change(fn=update_speaker_controls, inputs=[speaker_config_method_dropdown], outputs=[global_voice_dropdown, detailed_table_config_group])
+ speaker_config_method_dropdown.change(
+ fn=update_speaker_config_method_visibility,
+ inputs=[speaker_config_method_dropdown],
+ outputs=[global_voice_dropdown, detailed_table_config_group]
+ )

- # Connect Load Speakers button
  load_speakers_button.click(
  fn=load_speakers_for_table,
- inputs=[script_input, speaker_details_df, tts_model_dropdown], # Pass current DF to preserve, and model for header info
+ inputs=[script_input, speaker_details_df, tts_model_dropdown],
  outputs=[speaker_details_df]
  )

@@ -410,29 +394,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
  generate_button.click(
  fn=handle_script_processing,
  inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
- global_voice_dropdown,
- speaker_details_df, # Changed from detailed_speaker_config_input (JSON)
+ global_voice_dropdown, speaker_details_df,
  global_speed_input, global_instructions_input],
  outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])

  gr.Markdown("## Examples")
- # Examples will need to be adjusted if the "Detailed Configuration (Table)" is default and needs pre-filled table
- # For simplicity, examples might focus on other config methods or a very simple table.
- # For "Detailed Configuration (Table)", the flow is: type script -> click "Load Speakers" -> edit table -> generate.
- # So, an example would just set the script and method. The table data itself isn't directly an input to an example click.
- # We can, however, provide an example script.
  gr.Examples(
  examples=[
- ["[Alice] Hi Bob.\n[Bob] Hello Alice.\n[Alice] This table config is new!", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""], # DF input is None for example
- ["[Narrator] A single line, global voice.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
- ["[Random1] Voice 1\n[Random2] Voice 2", "gpt-4o-mini-tts", 400, "Random per Speaker", APP_AVAILABLE_VOICES[0], None, 1.0, "Speak clearly."]
+ ["[Alice] This table config is new!\n[Bob] Indeed it is.", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""],
+ ["[Narrator] Global voice example.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
  ],
  inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
- speaker_details_df, # This will be None for example clicks unless we can construct a df
- global_speed_input, global_instructions_input],
+ speaker_details_df, global_speed_input, global_instructions_input],
  outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
  fn=handle_script_processing, cache_examples=False)

  if __name__ == "__main__":
  if os.name == 'nt': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
- demo.launch(debug=True) # Added debug=True for easier testing of UI changes
+ demo.launch(debug=True)