abocha committed
Commit a2f0e99
Parent: 954cec7

ui change (pandas)

Files changed (2):
  1. app.py +212 -107
  2. requirements.txt +2 -1
app.py CHANGED
@@ -6,6 +6,7 @@ import shutil
 import zipfile
 import random
 import json
+import pandas as pd  # Import pandas
 from openai import AsyncOpenAI
 
 from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
@@ -44,57 +45,133 @@ SPEAKER_CONFIG_METHODS = [
     "Single Voice (Global)",
     "Random per Speaker",
     "A/B Round Robin",
-    "Detailed Configuration (JSON)"
+    "Detailed Configuration (Table)"  # Changed from JSON
 ]
-DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
+DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"  # Or "Detailed Configuration (Table)" if preferred
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
-_speaker_config_cache = {}
-
-def parse_detailed_speaker_config(json_text, parsed_script_lines):
-    config_map = {}
-    default_voice = APP_AVAILABLE_VOICES[0]
+_speaker_config_cache = {}  # For Random/AB
+
+# --- Speaker Configuration Table Logic ---
+DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
+DATAFRAME_COL_SPEAKER = 0
+DATAFRAME_COL_VOICE = 1
+DATAFRAME_COL_SPEED = 2
+DATAFRAME_COL_INSTRUCTIONS = 3
+
+def get_speakers_from_script(script_text):
+    if not script_text.strip():
+        return []
     try:
-        if not json_text.strip():
-            return {}
+        # Use a simplified parsing just for speaker names if full parsing is too heavy here.
+        # For now, leverage existing robust parser.
+        parsed_lines, _ = parse_dialogue_script(script_text)
+        unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
+        return unique_speakers
+    except ValueError:  # Catch parsing errors like script too long
+        return []  # Or raise/notify
+
+def convert_df_to_config_map(df_data):
+    """Converts DataFrame data (list of lists) to the internal config map."""
+    config_map = {}
+    if not df_data or not df_data['data']:  # df_data can be {'data': [], 'headers': []}
+        return config_map
+
+    for row in df_data['data']:
+        speaker_name = row[DATAFRAME_COL_SPEAKER]
+        voice = row[DATAFRAME_COL_VOICE]
+        speed_str = str(row[DATAFRAME_COL_SPEED]).strip()
+        instructions = str(row[DATAFRAME_COL_INSTRUCTIONS]).strip()
+
+        # Validate voice
+        if voice not in APP_AVAILABLE_VOICES:
+            print(f"Warning: Invalid voice '{voice}' for speaker '{speaker_name}' in table. Using default '{APP_AVAILABLE_VOICES[0]}'.")
+            voice = APP_AVAILABLE_VOICES[0]
+
+        cfg = {"voice": voice}
+
+        if speed_str:
+            try:
+                speed_val = float(speed_str)
+                if 0.25 <= speed_val <= 4.0:
+                    cfg["speed"] = speed_val
+                else:
+                    print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range (0.25-4.0). Ignoring.")
+            except ValueError:
+                print(f"Warning: Invalid speed value '{speed_str}' for '{speaker_name}'. Ignoring.")
 
-        config_list = json.loads(json_text)
-        if not isinstance(config_list, list):
-            raise ValueError("JSON config must be a list of speaker objects.")
+        if instructions:
+            cfg["instructions"] = instructions
 
-        for item in config_list:
-            if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
-                print(f"Skipping malformed item in JSON config: {item}")
-                continue
-            if item["voice"] not in APP_AVAILABLE_VOICES:
-                print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
-                item["voice"] = default_voice
-
-            if "speed" in item:
-                try:
-                    item["speed"] = float(item["speed"])
-                    if not (0.25 <= item["speed"] <= 4.0):
-                        print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping.")
-                        item["speed"] = max(0.25, min(item["speed"], 4.0))
-                except ValueError:
-                    print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
-                    item.pop("speed", None)
-
-            config_map[item["speaker"]] = {
-                "voice": item["voice"],
-                "speed": item.get("speed"),
-                "instructions": item.get("instructions")
-            }
-        return config_map
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
-    except ValueError as e:
-        raise e
-    except Exception as e:
-        raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
+        config_map[speaker_name] = cfg
+    return config_map
+
+def load_speakers_for_table(script_text, current_df_data_obj, selected_tts_model):
+    """
+    Populates a DataFrame structure based on speakers in the script.
+    Tries to preserve existing settings from current_df_data_obj.
+    current_df_data_obj is the raw object from gr.DataFrame, like {'data': [...], 'headers': ...}
+    """
+    current_config_map = {}
+    if current_df_data_obj and current_df_data_obj.get('data'):
+        # Convert existing DataFrame to a temporary map to easily fetch settings
+        for row in current_df_data_obj['data']:
+            # Ensure row has enough elements to prevent IndexError
+            if len(row) >= len(DATAFRAME_HEADERS):
+                speaker = row[DATAFRAME_COL_SPEAKER]
+                current_config_map[speaker] = {
+                    "voice": row[DATAFRAME_COL_VOICE],
+                    "speed": str(row[DATAFRAME_COL_SPEED]),
+                    "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
+                }
+            else:
+                print(f"Skipping malformed row in current DataFrame data: {row}")
+
+    unique_speakers = get_speakers_from_script(script_text)
+    new_df_rows = []
+
+    default_voice = APP_AVAILABLE_VOICES[0]
+    default_speed_str = "1.0"
+    default_instructions_str = ""
+
+    for speaker in unique_speakers:
+        existing_settings = current_config_map.get(speaker, {})
+        new_df_rows.append([
+            speaker,
+            existing_settings.get("voice", default_voice),
+            existing_settings.get("speed", default_speed_str),
+            existing_settings.get("instructions", default_instructions_str)
+        ])
+
+    # Provide info about which columns are relevant
+    speed_info = " (for tts-1/tts-1-hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A for this model)"
+    instr_info = " (for gpt-4o-mini-tts)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A for this model)"
+
+    headers_with_model_info = [
+        DATAFRAME_HEADERS[0],
+        DATAFRAME_HEADERS[1],
+        f"{DATAFRAME_HEADERS[2]}{speed_info}",
+        f"{DATAFRAME_HEADERS[3]}{instr_info}",
+    ]
+
+    # If no speakers, provide a dummy row to keep the DataFrame component from erroring/disappearing if it needs data
+    if not new_df_rows:
+        new_df_rows.append(["(No speakers detected)", default_voice, default_speed_str, default_instructions_str])
+
+    # Return a new DataFrame object compatible with gr.DataFrame.update
+    # This structure for gr.DataFrame update might be df_obj or pd.DataFrame(new_df_rows, columns=...)
+    # Let's try returning a pandas DataFrame
+    return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=headers_with_model_info)
+
+
+# --- Existing parsing and config logic (slightly adapted if needed) ---
+def parse_detailed_speaker_config(json_text, parsed_script_lines):  # Kept for reference or other uses, but not for Table UI
+    # ... (implementation from previous version for JSON parsing)
+    pass  # For now, this function is not directly used by the Table UI.
 
 def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
-                           global_selected_voice, detailed_config_map):
-    global _speaker_config_cache
+                           global_selected_voice, detailed_config_map_from_table_or_json):
+    global _speaker_config_cache
     if _speaker_config_cache.get("__method") != speaker_config_method or \
        _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
         _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
@@ -105,20 +182,23 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
         base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
         return base_config
 
-    if speaker_config_method == "Detailed Configuration (JSON)":
-        if speaker_name in detailed_config_map:
-            speaker_specific = detailed_config_map[speaker_name]
-            return {
+    # This part now uses the map derived from either JSON (if we kept that option) or Table
+    if speaker_config_method == "Detailed Configuration (Table)":  # or "Detailed Configuration (JSON)"
+        if speaker_name in detailed_config_map_from_table_or_json:
+            speaker_specific = detailed_config_map_from_table_or_json[speaker_name]
+            return {  # Ensure all keys exist, falling back to None if not in map
                 "voice": speaker_specific.get("voice", base_config["voice"]),
-                "speed": speaker_specific.get("speed"),
-                "instructions": speaker_specific.get("instructions")
+                "speed": speaker_specific.get("speed"),
+                "instructions": speaker_specific.get("instructions")
             }
-        else:
-            print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
+        else:
+            print(f"Warning: Speaker '{speaker_name}' not found in Detailed Config. Using default voice '{base_config['voice']}'.")
             return base_config
 
+    # For "Random" and "A/B"
     if speaker_name not in _speaker_config_cache:
         if speaker_config_method == "Random per Speaker":
+            # ... (same logic as before) ...
            available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
            if not _speaker_config_cache.get("__all_assigned_random"):
                for i, spk_unique in enumerate(unique_script_speakers):
@@ -127,7 +207,9 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
                _speaker_config_cache["__all_assigned_random"] = True
            if speaker_name not in _speaker_config_cache:
                _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
+
        elif speaker_config_method == "A/B Round Robin":
+            # ... (same logic as before) ...
            if not _speaker_config_cache.get("__all_assigned_ab"):
                for i, spk_unique in enumerate(unique_script_speakers):
                    if spk_unique not in _speaker_config_cache:
@@ -138,14 +220,16 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
                _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
        else:
            _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
-
+
    cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
    return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
 
+
 async def handle_script_processing(
        dialogue_script: str, tts_model: str, pause_ms: int,
        speaker_config_method: str, global_voice_selection: str,
-        detailed_speaker_json: str, global_speed: float,
+        speaker_details_df_value: dict,  # This will be the dict from gr.DataFrame value
+        global_speed: float,
        global_instructions: str, progress=gr.Progress(track_tqdm=True)):
    global _speaker_config_cache
    _speaker_config_cache = {}
@@ -156,36 +240,36 @@ async def handle_script_processing(
        return None, None, "Error: Script is empty."
 
    job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
-    if os.path.exists(job_audio_path_prefix):
-        shutil.rmtree(job_audio_path_prefix)
+    if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
    os.makedirs(job_audio_path_prefix, exist_ok=True)
 
    try:
        parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
-        if not parsed_lines:
-            return None, None, "Error: No valid dialogue lines found."
-    except ValueError as e:
-        return None, None, f"Script parsing error: {str(e)}"
+        if not parsed_lines: return None, None, "Error: No valid dialogue lines found."
+    except ValueError as e: return None, None, f"Script parsing error: {str(e)}"
 
    unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
+
+    # This is where we get the detailed config map
    parsed_detailed_config_map = {}
-    if speaker_config_method == "Detailed Configuration (JSON)":
+    if speaker_config_method == "Detailed Configuration (Table)":
        try:
-            parsed_detailed_config_map = parse_detailed_speaker_config(detailed_speaker_json, parsed_lines)
-        except ValueError as e:
-            return None, None, f"Configuration Error: {str(e)}"
+            parsed_detailed_config_map = convert_df_to_config_map(speaker_details_df_value)
+        except Exception as e:  # Catch any error during DataFrame conversion
+            return None, None, f"Error processing Speaker Table: {str(e)}"
+        # (If you had a JSON option, you'd parse JSON here too)
 
    tasks, line_audio_files = [], [None] * len(parsed_lines)
    for i, line_data in enumerate(parsed_lines):
        speaker_name = line_data["speaker"]
        speaker_base_cfg = get_config_for_speaker(
            speaker_name, speaker_config_method, unique_speakers,
-            global_voice_selection, parsed_detailed_config_map)
+            global_voice_selection, parsed_detailed_config_map  # Pass the map
+        )
        line_voice = speaker_base_cfg["voice"]
 
        effective_speed = global_speed
-        if speaker_base_cfg.get("speed") is not None:
-            effective_speed = speaker_base_cfg["speed"]
+        if speaker_base_cfg.get("speed") is not None: effective_speed = speaker_base_cfg["speed"]
 
        effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
        if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
@@ -228,6 +312,7 @@ async def handle_script_processing(
            status)
 
 def handle_calculate_cost(dialogue_script: str, tts_model: str):
+    # ... (same as before) ...
    if not dialogue_script.strip(): return "Cost: $0.000000 (Empty script)"
    try:
        parsed, chars = parse_dialogue_script(dialogue_script)
@@ -238,8 +323,8 @@ def handle_calculate_cost(dialogue_script: str, tts_model: str):
    except Exception as e: return f"Cost calc error: {str(e)}"
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
-    gr.Markdown("Convert scripts with control over voices, speed, and instructions.")
+    gr.Markdown("# Dialogue Script to Speech Converter (Enhanced Table Config)")
+    # ... (intro markdown same) ...
    if not OPENAI_API_KEY or not async_openai_client:
        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set. Synthesis will fail.</h3>")
 
@@ -249,33 +334,34 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Column(scale=1):
            tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT, info="Affects controls below.")
            pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50, info="Between merged lines.")
-            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (tts-1/tts-1-hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
-            global_instructions_input = gr.Textbox(label="Global Instructions (gpt-4o-mini-tts)", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
+            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (tts-1/hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
+            global_instructions_input = gr.Textbox(label="Global Instructions (gpt-4o-mini)", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
 
    gr.Markdown("### Speaker Configuration")
    with gr.Row():
        speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
-        global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice ('Single Voice' method)", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)
+        global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice ('Single Voice')", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)
 
-    initial_json_config_example = """[
-    {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
-    {"speaker": "Bob", "voice": "echo"},
-    {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
-    ]""".strip()
-
-    detailed_speaker_config_input = gr.Code(
-        label="Detailed Speaker Configuration (JSON)",
-        language="json",
-        lines=7,
-        value=initial_json_config_example,
-        visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
-    )
-    # New Markdown for info text, visibility tied to dropdown
-    detailed_config_info_md = gr.Markdown(
-        "<small>Define voice per speaker. Optionally, `speed` (0.25-4.0) for `tts-1`/`tts-1-hd` models, "
-        "and `instructions` (text) for `gpt-4o-mini-tts`.</small>",
-        visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
-    )
+    # --- New UI for Detailed Table Configuration ---
+    current_visibility_detailed_table = (DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")
+
+    with gr.Column(visible=current_visibility_detailed_table) as detailed_table_config_group:
+        load_speakers_button = gr.Button("Load/Refresh Speakers for Table from Script Above")
+        gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. Type a voice name into the 'Voice' column. "
+                    "Speed and Instructions applicability depends on the globally selected TTS model.</small>")
+        speaker_details_df = gr.DataFrame(
+            label="Per-Speaker Settings (Editable Table)",
+            headers=DATAFRAME_HEADERS,
+            # Initialize with a placeholder row or empty data
+            value=pd.DataFrame([["(Load speakers first)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
+            interactive=True,
+            # `max_rows=20` or some sensible limit might be good for UI
+            # `datatype` can be used to specify types per column, e.g., ["str", "str", "number", "str"]
+            # For now, all will be string input in table, conversion happens in backend.
+            # Consider making "Speaker Name" column read-only if possible with Gradio versions
+        )
+
+    # --- End New UI ---
 
    with gr.Row():
        calculate_cost_button = gr.Button("Calculate Cost")
@@ -287,47 +373,66 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
    merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
    status_output = gr.Textbox(label="Status", interactive=False, lines=2)
 
+    # --- Event Handlers ---
    def update_model_controls(selected_model):
+        # ... (same as before) ...
        is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
        is_gpt_mini = selected_model == "gpt-4o-mini-tts"
+        # Also update headers for DataFrame when model changes
+        speed_info = " (for tts-1/tts-1-hd)" if is_tts1 else " (N/A for this model)"
+        instr_info = " (for gpt-4o-mini-tts)" if is_gpt_mini else " (N/A for this model)"
+        new_headers = [DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1], f"{DATAFRAME_HEADERS[2]}{speed_info}", f"{DATAFRAME_HEADERS[3]}{instr_info}"]
+
        return {
            global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
-            global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
+            global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
+            speaker_details_df: gr.update(headers=new_headers)  # Update headers dynamically
        }
-    tts_model_dropdown.change(fn=update_model_controls, inputs=[tts_model_dropdown], outputs=[global_speed_input, global_instructions_input])
+    tts_model_dropdown.change(fn=update_model_controls, inputs=[tts_model_dropdown], outputs=[global_speed_input, global_instructions_input, speaker_details_df])
 
    def update_speaker_controls(method):
        is_single = (method == "Single Voice (Global)")
-        is_detailed = (method == "Detailed Configuration (JSON)")
+        is_detailed_table = (method == "Detailed Configuration (Table)")
        return {
            global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
-            detailed_speaker_config_input: gr.update(visible=is_detailed, interactive=is_detailed),
-            detailed_config_info_md: gr.update(visible=is_detailed)  # Control visibility of new Markdown
+            detailed_table_config_group: gr.update(visible=is_detailed_table),  # Show/hide the whole group
        }
-    speaker_config_method_dropdown.change(fn=update_speaker_controls, inputs=[speaker_config_method_dropdown], outputs=[global_voice_dropdown, detailed_speaker_config_input, detailed_config_info_md])
+    speaker_config_method_dropdown.change(fn=update_speaker_controls, inputs=[speaker_config_method_dropdown], outputs=[global_voice_dropdown, detailed_table_config_group])
+
+    # Connect Load Speakers button
+    load_speakers_button.click(
+        fn=load_speakers_for_table,
+        inputs=[script_input, speaker_details_df, tts_model_dropdown],  # Pass current DF to preserve, and model for header info
+        outputs=[speaker_details_df]
+    )
 
    calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
    generate_button.click(
        fn=handle_script_processing,
        inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
-                global_voice_dropdown, detailed_speaker_config_input, global_speed_input, global_instructions_input],
+                global_voice_dropdown,
+                speaker_details_df,  # Changed from detailed_speaker_config_input (JSON)
+                global_speed_input, global_instructions_input],
        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
 
    gr.Markdown("## Examples")
-    ex_script1 = "[A] Hi B.\n[B] Hey A.\n[A] What's up?"
-    ex_json1 = '[{"speaker": "A", "voice": "nova", "instr...": "curious"}, {"speaker": "B", "voice": "echo", "speed": 0.9}]'.replace("instr...", "instructions")  # Hack for brevity
-    ex_script2 = "[Cpt] Status?\n[Comp] Nominal. Slow.\n[Cpt] Good."
-    ex_json2 = '[{"speaker": "Cpt", "voice": "alloy"}, {"speaker": "Comp", "voice": "onyx", "speed": 0.8, "instr...": "robotic"}]'.replace("instr...", "instructions")
+    # Examples will need to be adjusted if the "Detailed Configuration (Table)" is default and needs pre-filled table.
+    # For simplicity, examples might focus on other config methods or a very simple table.
+    # For "Detailed Configuration (Table)", the flow is: type script -> click "Load Speakers" -> edit table -> generate.
+    # So, an example would just set the script and method. The table data itself isn't directly an input to an example click.
+    # We can, however, provide an example script.
    gr.Examples(
        examples=[
-            [ex_script1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json1, 1.0, "Speak naturally."],
-            [ex_script2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json2, 1.1, ""],
-            ["[N] Single line.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]],
+            ["[Alice] Hi Bob.\n[Bob] Hello Alice.\n[Alice] This table config is new!", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""],  # DF input is None for example
+            ["[Narrator] A single line, global voice.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
+            ["[Random1] Voice 1\n[Random2] Voice 2", "gpt-4o-mini-tts", 400, "Random per Speaker", APP_AVAILABLE_VOICES[0], None, 1.0, "Speak clearly."]
+        ],
        inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
-                detailed_speaker_config_input, global_speed_input, global_instructions_input],
+                speaker_details_df,  # This will be None for example clicks unless we can construct a df
+                global_speed_input, global_instructions_input],
        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
        fn=handle_script_processing, cache_examples=False)
 
 if __name__ == "__main__":
    if os.name == 'nt': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-    demo.launch()
+    demo.launch(debug=True)  # Added debug=True for easier testing of UI changes
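
An illustrative sketch of the table-to-config flow this commit introduces: rows from the editable speaker table become the per-speaker map that get_config_for_speaker consults. The helper rows_from_df_value, the condensed to_config_map (a restatement of convert_df_to_config_map), and the hard-coded voice list are assumptions for illustration only; the committed code indexes df_data['data'] directly and copies its voices from ALL_TTS_VOICES.

import pandas as pd

DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
AVAILABLE_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]  # assumed stand-in for ALL_TTS_VOICES

def rows_from_df_value(value):
    # Hypothetical normalizer: depending on Gradio version/config, a gr.DataFrame value
    # may arrive as a pandas DataFrame or as a {'data': [...], 'headers': [...]} dict.
    if isinstance(value, pd.DataFrame):
        return value.values.tolist()
    if isinstance(value, dict):
        return value.get("data", [])
    return []

def to_config_map(rows):
    # Condensed, illustrative version of convert_df_to_config_map from the diff above.
    config_map = {}
    for speaker, voice, speed_str, instructions in rows:
        cfg = {"voice": voice if voice in AVAILABLE_VOICES else AVAILABLE_VOICES[0]}
        try:
            speed = float(str(speed_str).strip())
            if 0.25 <= speed <= 4.0:
                cfg["speed"] = speed
        except ValueError:
            pass  # invalid speed is ignored, mirroring the warning-and-skip behavior in the diff
        if str(instructions).strip():
            cfg["instructions"] = str(instructions).strip()
        config_map[speaker] = cfg
    return config_map

# Example payload as the table might hold it after "Load/Refresh Speakers" and some edits:
table_value = pd.DataFrame(
    [["Alice", "nova", "1.1", "sound excited"], ["Bob", "echo", "", ""]],
    columns=DATAFRAME_HEADERS,
)
print(to_config_map(rows_from_df_value(table_value)))
# -> {'Alice': {'voice': 'nova', 'speed': 1.1, 'instructions': 'sound excited'}, 'Bob': {'voice': 'echo'}}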
requirements.txt CHANGED
@@ -3,4 +3,5 @@ pydub
 ffmpeg-python
 gradio>=4.27
 gradio_client
-httpx
+httpx
+pandas
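
pandas becomes a runtime dependency because app.py now builds the speaker table's placeholder value with pd.DataFrame(..., columns=DATAFRAME_HEADERS). A minimal standalone check of that construction (the voice name "alloy" is assumed here as a stand-in for APP_AVAILABLE_VOICES[0]):

import pandas as pd

DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
placeholder = pd.DataFrame(
    [["(Load speakers first)", "alloy", "1.0", ""]],  # mirrors the initial gr.DataFrame value in app.py
    columns=DATAFRAME_HEADERS,
)
print(placeholder.to_string(index=False))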